mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-04 03:54:57 +08:00
Compare commits
12 Commits
5d691b3e3e
...
d46f2b2300
Author | SHA1 | Date | |
---|---|---|---|
|
d46f2b2300 | ||
|
9da7e17efe | ||
|
b0de354c63 | ||
|
41090e5a7c | ||
|
605b380114 | ||
|
9f8d455130 | ||
|
7abae557fb | ||
|
6a60e5edb1 | ||
|
28bdff356f | ||
|
03b662a769 | ||
|
6c468583c5 | ||
|
5a6bd010f7 |
180
.gitignore
vendored
180
.gitignore
vendored
@ -18,5 +18,183 @@ TEMP
|
||||
weight.json
|
||||
ffmpeg*
|
||||
ffprobe*
|
||||
cfg.json
|
||||
speakers.json
|
||||
ref_audios
|
||||
tools/AP_BWE_main/24kto48k/*
|
||||
!tools/AP_BWE_main/24kto48k/readme.txt
|
||||
!tools/AP_BWE_main/24kto48k/readme.txt
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
#uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||
.pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
# Ruff stuff:
|
||||
.ruff_cache/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
@ -3,7 +3,7 @@ import math
|
||||
import os, sys, gc
|
||||
import random
|
||||
import traceback
|
||||
|
||||
import time
|
||||
import torchaudio
|
||||
from tqdm import tqdm
|
||||
now_dir = os.getcwd()
|
||||
@ -462,8 +462,6 @@ class TTS:
|
||||
n_speakers=self.configs.n_speakers,
|
||||
**kwargs
|
||||
)
|
||||
if hasattr(vits_model, "enc_q"):
|
||||
del vits_model.enc_q
|
||||
self.configs.is_v3_synthesizer = False
|
||||
else:
|
||||
vits_model = SynthesizerTrnV3(
|
||||
@ -474,7 +472,8 @@ class TTS:
|
||||
)
|
||||
self.configs.is_v3_synthesizer = True
|
||||
self.init_bigvgan()
|
||||
|
||||
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
|
||||
del vits_model.enc_q
|
||||
|
||||
if if_lora_v3==False:
|
||||
print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}")
|
||||
@ -908,11 +907,14 @@ class TTS:
|
||||
split_bucket = False
|
||||
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
|
||||
|
||||
if split_bucket and speed_factor==1.0:
|
||||
if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
|
||||
print(i18n("分桶处理模式已开启"))
|
||||
elif speed_factor!=1.0:
|
||||
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
|
||||
split_bucket = False
|
||||
elif self.configs.is_v3_synthesizer and parallel_infer:
|
||||
print(i18n("当开启并行推理模式时,SoVits V3模型不支持分桶处理,已自动关闭分桶处理"))
|
||||
split_bucket = False
|
||||
else:
|
||||
print(i18n("分桶处理模式已关闭"))
|
||||
|
||||
@ -936,7 +938,7 @@ class TTS:
|
||||
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
|
||||
|
||||
###### setting reference audio and prompt text preprocessing ########
|
||||
t0 = ttime()
|
||||
t0 = time.perf_counter()
|
||||
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
|
||||
if not os.path.exists(ref_audio_path):
|
||||
raise ValueError(f"{ref_audio_path} not exists")
|
||||
@ -975,7 +977,7 @@ class TTS:
|
||||
|
||||
|
||||
###### text preprocessing ########
|
||||
t1 = ttime()
|
||||
t1 = time.perf_counter()
|
||||
data:list = None
|
||||
if not return_fragment:
|
||||
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
|
||||
@ -1027,7 +1029,7 @@ class TTS:
|
||||
return batch[0]
|
||||
|
||||
|
||||
t2 = ttime()
|
||||
t2 = time.perf_counter()
|
||||
try:
|
||||
print("############ 推理 ############")
|
||||
###### inference ######
|
||||
@ -1036,7 +1038,7 @@ class TTS:
|
||||
audio = []
|
||||
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
|
||||
for item in data:
|
||||
t3 = ttime()
|
||||
t3 = time.perf_counter()
|
||||
if return_fragment:
|
||||
item = make_batch(item)
|
||||
if item is None:
|
||||
@ -1071,7 +1073,7 @@ class TTS:
|
||||
max_len=max_len,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
t4 = ttime()
|
||||
t4 = time.perf_counter()
|
||||
t_34 += t4 - t3
|
||||
|
||||
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
|
||||
@ -1094,6 +1096,7 @@ class TTS:
|
||||
print(f"############ {i18n('合成音频')} ############")
|
||||
if not self.configs.is_v3_synthesizer:
|
||||
if speed_factor == 1.0:
|
||||
print(f"{i18n('并行合成中')}...")
|
||||
# ## vits并行推理 method 2
|
||||
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
|
||||
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
||||
@ -1118,17 +1121,28 @@ class TTS:
|
||||
audio_fragment
|
||||
) ###试试重建不带上prompt部分
|
||||
else:
|
||||
for i, idx in enumerate(tqdm(idx_list)):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
audio_fragment = self.v3_synthesis(
|
||||
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
|
||||
)
|
||||
batch_audio_fragment.append(
|
||||
audio_fragment
|
||||
)
|
||||
if parallel_infer:
|
||||
print(f"{i18n('并行合成中')}...")
|
||||
audio_fragments = self.v3_synthesis_batched_infer(
|
||||
idx_list,
|
||||
pred_semantic_list,
|
||||
batch_phones,
|
||||
speed=speed_factor,
|
||||
sample_steps=sample_steps
|
||||
)
|
||||
batch_audio_fragment.extend(audio_fragments)
|
||||
else:
|
||||
for i, idx in enumerate(tqdm(idx_list)):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
audio_fragment = self.v3_synthesis(
|
||||
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
|
||||
)
|
||||
batch_audio_fragment.append(
|
||||
audio_fragment
|
||||
)
|
||||
|
||||
t5 = ttime()
|
||||
t5 = time.perf_counter()
|
||||
t_45 += t5 - t4
|
||||
if return_fragment:
|
||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
|
||||
@ -1219,13 +1233,13 @@ class TTS:
|
||||
|
||||
if super_sampling:
|
||||
print(f"############ {i18n('音频超采样')} ############")
|
||||
t1 = ttime()
|
||||
t1 = time.perf_counter()
|
||||
self.init_sr_model()
|
||||
if not self.sr_model_not_exist:
|
||||
audio,sr=self.sr_model(audio.unsqueeze(0),sr)
|
||||
max_audio=np.abs(audio).max()
|
||||
if max_audio > 1: audio /= max_audio
|
||||
t2 = ttime()
|
||||
t2 = time.perf_counter()
|
||||
print(f"超采样用时:{t2-t1:.3f}s")
|
||||
else:
|
||||
audio = audio.cpu().numpy()
|
||||
@ -1260,7 +1274,7 @@ class TTS:
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
if ref_sr!=24000:
|
||||
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
|
||||
# print("ref_audio",ref_audio.abs().mean())
|
||||
|
||||
mel2 = mel_fn(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
@ -1285,15 +1299,156 @@ class TTS:
|
||||
|
||||
cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
||||
cfm_res = cfm_res[:, :, mel2.shape[2]:]
|
||||
mel2 = cfm_res[:, :, -T_min:]
|
||||
|
||||
mel2 = cfm_res[:, :, -T_min:]
|
||||
fea_ref = fea_todo_chunk[:, :, -T_min:]
|
||||
|
||||
cfm_resss.append(cfm_res)
|
||||
cmf_res = torch.cat(cfm_resss, 2)
|
||||
cmf_res = denorm_spec(cmf_res)
|
||||
cfm_res = torch.cat(cfm_resss, 2)
|
||||
cfm_res = denorm_spec(cfm_res)
|
||||
|
||||
|
||||
with torch.inference_mode():
|
||||
wav_gen = self.bigvgan_model(cmf_res)
|
||||
wav_gen = self.bigvgan_model(cfm_res)
|
||||
audio=wav_gen[0][0]#.cpu().detach().numpy()
|
||||
|
||||
return audio
|
||||
|
||||
|
||||
|
||||
def v3_synthesis_batched_infer(self,
|
||||
idx_list:List[int],
|
||||
semantic_tokens_list:List[torch.Tensor],
|
||||
batch_phones:List[torch.Tensor],
|
||||
speed:float=1.0,
|
||||
sample_steps:int=32
|
||||
)->List[torch.Tensor]:
|
||||
|
||||
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
|
||||
refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
|
||||
|
||||
fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
||||
ref_audio:torch.Tensor = self.prompt_cache["raw_audio"]
|
||||
ref_sr = self.prompt_cache["raw_sr"]
|
||||
ref_audio=ref_audio.to(self.configs.device).float()
|
||||
if (ref_audio.shape[0] == 2):
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
if ref_sr!=24000:
|
||||
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
|
||||
|
||||
mel2 = mel_fn(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
mel2 = mel2[:, :, :T_min]
|
||||
fea_ref = fea_ref[:, :, :T_min]
|
||||
if (T_min > 468):
|
||||
mel2 = mel2[:, :, -468:]
|
||||
fea_ref = fea_ref[:, :, -468:]
|
||||
T_min = 468
|
||||
chunk_len = 934 - T_min
|
||||
|
||||
mel2=mel2.to(self.precision)
|
||||
|
||||
|
||||
# #### batched inference
|
||||
overlapped_len = 12
|
||||
feat_chunks = []
|
||||
feat_lens = []
|
||||
feat_list = []
|
||||
|
||||
for i, idx in enumerate(idx_list):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
|
||||
feat_list.append(feat)
|
||||
feat_lens.append(feat.shape[2])
|
||||
|
||||
feats = torch.cat(feat_list, 2)
|
||||
feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0)
|
||||
pos = 0
|
||||
padding_len = 0
|
||||
while True:
|
||||
if pos ==0:
|
||||
chunk = feats_padded[:, :, pos:pos + chunk_len]
|
||||
else:
|
||||
pos = pos - overlapped_len
|
||||
chunk = feats_padded[:, :, pos:pos + chunk_len]
|
||||
pos += chunk_len
|
||||
if (chunk.shape[-1] == 0): break
|
||||
|
||||
# padding for the last chunk
|
||||
padding_len = chunk_len - chunk.shape[2]
|
||||
if padding_len != 0:
|
||||
chunk = F.pad(chunk, (0,padding_len), "constant", 0)
|
||||
feat_chunks.append(chunk)
|
||||
|
||||
|
||||
|
||||
feat_chunks = torch.cat(feat_chunks, 0)
|
||||
bs = feat_chunks.shape[0]
|
||||
fea_ref = fea_ref.repeat(bs,1,1)
|
||||
fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
|
||||
pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
||||
pred_spec = pred_spec[:, :, -chunk_len:]
|
||||
dd = pred_spec.shape[1]
|
||||
pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
|
||||
# pred_spec = pred_spec[..., :-padding_len]
|
||||
|
||||
|
||||
pred_spec = denorm_spec(pred_spec)
|
||||
|
||||
with torch.no_grad():
|
||||
wav_gen = self.bigvgan_model(pred_spec)
|
||||
audio = wav_gen[0][0]#.cpu().detach().numpy()
|
||||
|
||||
|
||||
audio_fragments = []
|
||||
upsample_rate = 256
|
||||
pos = 0
|
||||
|
||||
while pos < audio.shape[-1]:
|
||||
audio_fragment = audio[pos:pos+chunk_len*upsample_rate]
|
||||
audio_fragments.append(audio_fragment)
|
||||
pos += chunk_len*upsample_rate
|
||||
|
||||
audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate)
|
||||
audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate]
|
||||
|
||||
audio_fragments = []
|
||||
for feat_len in feat_lens:
|
||||
audio_fragment = audio[:feat_len*upsample_rate]
|
||||
audio_fragments.append(audio_fragment)
|
||||
audio = audio[feat_len*upsample_rate:]
|
||||
|
||||
|
||||
return audio_fragments
|
||||
|
||||
|
||||
|
||||
def sola_algorithm(self,
|
||||
audio_fragments:List[torch.Tensor],
|
||||
overlap_len:int,
|
||||
):
|
||||
|
||||
for i in range(len(audio_fragments)-1):
|
||||
f1 = audio_fragments[i]
|
||||
f2 = audio_fragments[i+1]
|
||||
w1 = f1[-overlap_len:]
|
||||
w2 = f2[:overlap_len]
|
||||
assert w1.shape == w2.shape
|
||||
corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1]
|
||||
idx = corr.argmax()
|
||||
f1_ = f1[:-(overlap_len-idx)]
|
||||
audio_fragments[i] = f1_
|
||||
|
||||
f2_ = f2[idx:]
|
||||
window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype)
|
||||
f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):]
|
||||
audio_fragments[i+1] = f2_
|
||||
|
||||
|
||||
return torch.cat(audio_fragments, 0)
|
||||
|
||||
|
||||
|
||||
|
@ -238,7 +238,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
else:
|
||||
visible_sample_steps=False
|
||||
visible_inp_refs=True
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
|
||||
|
||||
dict_s2 = load_sovits_new(sovits_path)
|
||||
hps = dict_s2["config"]
|
||||
@ -294,6 +294,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
# torch.save(vq_model.state_dict(),"merge_win.pth")
|
||||
vq_model.eval()
|
||||
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
|
||||
with open("./weight.json")as f:
|
||||
data=f.read()
|
||||
data=json.loads(data)
|
||||
@ -877,7 +878,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
with gr.Row():
|
||||
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13)
|
||||
with gr.Column(scale=13):
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式,使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式,使用了会报错。"), value=False, interactive=True if model_version!="v3"else False, show_label=True,scale=1)
|
||||
gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")))
|
||||
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
|
||||
with gr.Column(scale=14):
|
||||
@ -915,7 +916,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
|
||||
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
|
||||
with gr.Row():
|
||||
inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
|
||||
inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size='lg', scale=25)
|
||||
output = gr.Audio(label=i18n("输出的语音"), scale=14)
|
||||
|
||||
inference_button.click(
|
||||
@ -923,7 +924,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider],
|
||||
[output],
|
||||
)
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox])
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox,inference_button])
|
||||
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
|
||||
|
||||
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
|
||||
|
@ -41,12 +41,13 @@ gpt_path = os.environ.get("gpt_path", None)
|
||||
sovits_path = os.environ.get("sovits_path", None)
|
||||
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
|
||||
bert_path = os.environ.get("bert_path", None)
|
||||
version=os.environ.get("version","v2")
|
||||
version=model_version=os.environ.get("version","v2")
|
||||
|
||||
import gradio as gr
|
||||
from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR
|
||||
from TTS_infer_pack.text_segmentation_method import get_method
|
||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||
from inference_webui import DictToAttrRecursive
|
||||
|
||||
language=os.environ.get("language","Auto")
|
||||
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||
@ -221,19 +222,16 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
|
||||
|
||||
from process_ckpt import get_sovits_version_from_path_fast
|
||||
from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new
|
||||
def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
global version, dict_language
|
||||
global version, model_version, dict_language,if_lora_v3
|
||||
version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path)
|
||||
|
||||
# print(sovits_path,version, model_version, if_lora_v3)
|
||||
if if_lora_v3 and not os.path.exists(path_sovits_v3):
|
||||
info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
|
||||
gr.Warning(info)
|
||||
raise FileExistsError(info)
|
||||
|
||||
tts_pipeline.init_vits_weights(sovits_path)
|
||||
|
||||
dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2
|
||||
dict_language = dict_language_v1 if version =='v1' else dict_language_v2
|
||||
if prompt_language is not None and text_language is not None:
|
||||
if prompt_language in list(dict_language.keys()):
|
||||
prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
|
||||
@ -251,8 +249,11 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
else:
|
||||
visible_sample_steps=False
|
||||
visible_inp_refs=True
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
|
||||
#prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
|
||||
|
||||
tts_pipeline.init_vits_weights(sovits_path)
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
|
||||
with open("./weight.json")as f:
|
||||
data=f.read()
|
||||
data=json.loads(data)
|
||||
@ -279,14 +280,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
gr.Markdown(value=i18n("*请上传并填写参考信息"))
|
||||
with gr.Row():
|
||||
inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频,超过会报错!)"), type="filepath")
|
||||
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple")
|
||||
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple", visible=True if model_version!="v3"else False)
|
||||
prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
|
||||
with gr.Row():
|
||||
prompt_language = gr.Dropdown(
|
||||
label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
|
||||
)
|
||||
with gr.Column():
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True if model_version!="v3"else False, show_label=True)
|
||||
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))
|
||||
|
||||
with gr.Column():
|
||||
@ -355,7 +356,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
[output, seed],
|
||||
)
|
||||
stop_infer.click(tts_pipeline.stop, [], [])
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language])
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,inference_button])#
|
||||
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
|
||||
|
||||
with gr.Group():
|
||||
|
@ -429,26 +429,25 @@ def train_and_evaluate(
|
||||
# scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
|
||||
# scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
|
||||
# scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
|
||||
image_dict = {
|
||||
"slice/mel_org": utils.plot_spectrogram_to_numpy(
|
||||
y_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
|
||||
y_hat_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/mel": utils.plot_spectrogram_to_numpy(
|
||||
mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/stats_ssl": utils.plot_spectrogram_to_numpy(
|
||||
stats_ssl[0].data.cpu().numpy()
|
||||
),
|
||||
}
|
||||
utils.summarize(
|
||||
writer=writer,
|
||||
global_step=global_step,
|
||||
images=image_dict,
|
||||
scalars=scalar_dict,
|
||||
)
|
||||
image_dict=None
|
||||
try:###Some people installed the wrong version of matplotlib.
|
||||
image_dict = {
|
||||
"slice/mel_org": utils.plot_spectrogram_to_numpy(
|
||||
y_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
|
||||
y_hat_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/mel": utils.plot_spectrogram_to_numpy(
|
||||
mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/stats_ssl": utils.plot_spectrogram_to_numpy(
|
||||
stats_ssl[0].data.cpu().numpy()
|
||||
),
|
||||
}
|
||||
except:pass
|
||||
if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,)
|
||||
else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,)
|
||||
global_step += 1
|
||||
if epoch % hps.train.save_every_epoch == 0 and rank == 0:
|
||||
if hps.train.if_save_latest == 0:
|
||||
|
@ -58,7 +58,7 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
|
||||
extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
|
||||
extract_dir_new = os.path.join(parent_directory,"G2PWModel")
|
||||
print("Downloading g2pw model...")
|
||||
modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
||||
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
||||
with requests.get(modelscope_url, stream=True) as r:
|
||||
r.raise_for_status()
|
||||
with open(zip_dir, 'wb') as f:
|
||||
|
@ -1,42 +1,37 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "himHYZmra7ix"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "e9b7iFV3dm1f"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||
"%cd GPT-SoVITS\n",
|
||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||
"!pip install -r extra-req.txt --no-deps\n",
|
||||
"!pip install -r requirements.txt"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "0NgxXg5sjv7z"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title Download pretrained models 下载预训练模型\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||
@ -53,16 +48,16 @@
|
||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||
"!git config core.sparseCheckout true\n",
|
||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "cPDEH-9czOJF"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#@title Create folder models 创建文件夹模型\n",
|
||||
"import os\n",
|
||||
@ -77,16 +72,16 @@
|
||||
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
|
||||
"\n",
|
||||
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "cPDEH-9czOJF"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "vbZY-LnM0tzq"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
@ -124,29 +119,35 @@
|
||||
" shutil.move(source_path, destination_path)\n",
|
||||
"\n",
|
||||
"print(f'Model downloaded. (模型已下载。)')"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "vbZY-LnM0tzq"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "4oRGUzkrk8C7"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title launch WebUI 启动WebUI\n",
|
||||
"!/usr/local/bin/pip install ipykernel\n",
|
||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||
"%cd /content/GPT-SoVITS/\n",
|
||||
"!/usr/local/bin/python webui.py"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
|
39
README.md
39
README.md
@ -1,6 +1,5 @@
|
||||
<div align="center">
|
||||
|
||||
|
||||
<h1>GPT-SoVITS-WebUI</h1>
|
||||
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
||||
|
||||
@ -77,6 +76,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
|
||||
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
|
||||
|
||||
##### MacOS Users
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -112,6 +113,7 @@ brew install ffmpeg
|
||||
#### Install Dependences
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -150,9 +152,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
|
||||
|
||||
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
|
||||
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
|
||||
|
||||
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
|
||||
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
|
||||
|
||||
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
|
||||
|
||||
@ -200,6 +202,7 @@ if you want to switch to V1,then
|
||||
```bash
|
||||
python webui.py v1 <language(optional)>
|
||||
```
|
||||
|
||||
Or maunally switch version in WebUI
|
||||
|
||||
### Finetune
|
||||
@ -217,18 +220,20 @@ Or maunally switch version in WebUI
|
||||
|
||||
#### Integrated Package Users
|
||||
|
||||
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||
|
||||
#### Others
|
||||
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <language(optional)>
|
||||
```
|
||||
|
||||
OR
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
|
||||
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||
|
||||
## V2 Release Notes
|
||||
@ -243,7 +248,7 @@ New Features:
|
||||
|
||||
4. Improved synthesis quality for low-quality reference audio
|
||||
|
||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
Use v2 from v1 environment:
|
||||
|
||||
@ -253,7 +258,7 @@ Use v2 from v1 environment:
|
||||
|
||||
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
|
||||
|
||||
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.
|
||||
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.
|
||||
|
||||
## V3 Release Notes
|
||||
|
||||
@ -263,7 +268,7 @@ New Features:
|
||||
|
||||
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
|
||||
|
||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
Use v3 from v2 environment:
|
||||
|
||||
@ -273,8 +278,7 @@ Use v3 from v2 environment:
|
||||
|
||||
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
|
||||
|
||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
## Todo List
|
||||
|
||||
@ -297,15 +301,20 @@ Use v3 from v2 environment:
|
||||
- [ ] model mix
|
||||
|
||||
## (Additional) Method for running from the command line
|
||||
|
||||
Use the command line to open the WebUI for UVR5
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
This is how the audio segmentation of the dataset is done using the command line
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
@ -315,16 +324,21 @@ python audio_slicer.py \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
```
|
||||
|
||||
This is how dataset ASR processing is done using the command line(Only Chinese)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
|
||||
|
||||
(No progress bars, GPU performance may cause time delays)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
A custom list save path is enabled
|
||||
|
||||
## Credits
|
||||
@ -332,6 +346,7 @@ A custom list save path is enabled
|
||||
Special thanks to the following projects and contributors:
|
||||
|
||||
### Theoretical Research
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### Pretrained Models
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### Text Frontend for Inference
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI Tools
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
50
api_v2.py
50
api_v2.py
@ -169,6 +169,9 @@ class TTS_Request(BaseModel):
|
||||
sample_steps:int = 32
|
||||
super_sampling:bool = False
|
||||
|
||||
# List of supported audio file extensions
|
||||
AUDIO_EXTENSIONS = ('.wav', '.aac', '.flac', '.ogg', '.mp3', '.m4a')
|
||||
|
||||
### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
|
||||
def pack_ogg(io_buffer:BytesIO, data:np.ndarray, rate:int):
|
||||
with sf.SoundFile(io_buffer, mode='w', samplerate=rate, channels=1, format='ogg') as audio_file:
|
||||
@ -339,10 +342,57 @@ async def tts_handle(req:dict):
|
||||
except Exception as e:
|
||||
return JSONResponse(status_code=400, content={"message": f"tts failed", "Exception": str(e)})
|
||||
|
||||
def list_audio_files(ref_audio_path: str = None):
|
||||
try:
|
||||
if ref_audio_path:
|
||||
directory = ref_audio_path
|
||||
if not os.path.isdir(directory):
|
||||
return JSONResponse(status_code=404, content={"message": "Specified path is not a valid directory."})
|
||||
else:
|
||||
directory = os.getcwd()
|
||||
|
||||
files = os.listdir(directory)
|
||||
audio_files = [f for f in files if f.lower().endswith(AUDIO_EXTENSIONS)]
|
||||
|
||||
if not audio_files:
|
||||
return JSONResponse(status_code=404, content={"message": "No audio files found in the directory."})
|
||||
return JSONResponse(content={"audio_files": audio_files})
|
||||
except Exception as e:
|
||||
return JSONResponse(status_code=500, content={"message": f"Failed to list audio files: {str(e)}"})
|
||||
|
||||
def list_gpt_files(directory_path: str = "GPT_weights_v2"):
|
||||
try:
|
||||
current_dir = os.path.join(os.getcwd(), directory_path)
|
||||
files = os.listdir(current_dir)
|
||||
gpt_files = [f for f in files if f.lower().endswith('.ckpt')]
|
||||
if not gpt_files:
|
||||
return JSONResponse(status_code=404, content={"message": "No gpt files found in the directory."})
|
||||
return JSONResponse(content={"gpt_files": gpt_files})
|
||||
except Exception as e:
|
||||
return JSONResponse(status_code=500, content={"message": f"Failed to list gpt files: {str(e)}"})
|
||||
|
||||
def list_sovits_files(directory_path: str = "SoVITS_weights_v2"):
|
||||
try:
|
||||
current_dir = os.path.join(os.getcwd(), directory_path)
|
||||
files = os.listdir(current_dir)
|
||||
sovits_files = [f for f in files if f.lower().endswith('.pth')]
|
||||
if not sovits_files:
|
||||
return JSONResponse(status_code=404, content={"message": "No sovits files found in the directory."})
|
||||
return JSONResponse(content={"sovits_files": sovits_files})
|
||||
except Exception as e:
|
||||
return JSONResponse(status_code=500, content={"message": f"Failed to list sovits files: {str(e)}"})
|
||||
|
||||
@APP.get("/list_audio_files")
|
||||
async def get_audio_files(ref_audio_path: str = None):
|
||||
return list_audio_files(ref_audio_path)
|
||||
|
||||
@APP.get("/list_gpt_files")
|
||||
async def get_gpt_files(directory_path: str = "GPT_weights_v2"):
|
||||
return list_gpt_files(directory_path)
|
||||
|
||||
@APP.get("/list_sovits_files")
|
||||
async def get_sovits_files(directory_path: str = "SoVITS_weights_v2"):
|
||||
return list_sovits_files(directory_path)
|
||||
|
||||
@APP.get("/control")
|
||||
async def control(command: str = None):
|
||||
|
@ -1,23 +1,10 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
"colab_type": "text",
|
||||
"id": "view-in-github"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
@ -25,18 +12,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"环境配置 environment"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "_o6a8GS2lWQM"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"环境配置 environment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "e9b7iFV3dm1f"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install -q condacolab\n",
|
||||
"# Setting up condacolab and installing packages\n",
|
||||
@ -47,13 +36,17 @@
|
||||
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
|
||||
"%cd -q /content/GPT-SoVITS\n",
|
||||
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
|
||||
"!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
|
||||
"!/usr/local/bin/pip install -r requirements.txt"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title Download pretrained models 下载预训练模型\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||
@ -71,27 +64,35 @@
|
||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||
"!git config core.sparseCheckout true\n",
|
||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title launch WebUI 启动WebUI\n",
|
||||
"!/usr/local/bin/pip install ipykernel\n",
|
||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||
"%cd /content/GPT-SoVITS/\n",
|
||||
"!/usr/local/bin/python webui.py"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"include_colab_link": true,
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
|
@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
|
||||
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
|
||||
|
||||
修复v3sovits未传参以支持调节语速
|
||||
|
||||
### 202503
|
||||
|
||||
修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
|
||||
|
||||
修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
||||
|
||||
修复其他若干bug
|
||||
|
||||
重点更新:
|
||||
|
||||
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
||||
|
||||
2-整合包修复onnxruntime GPU推理的支持,影响:(1)g2pw有个onnx模型原先是CPU推理现在用GPU,显著降低推理的CPU瓶颈 (2)foxjoy去混响模型现在可使用GPU推理
|
||||
|
@ -76,6 +76,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -101,9 +102,10 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
|
||||
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。
|
||||
|
||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语TTS)
|
||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
|
||||
|
||||
##### MacOS 用户
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -111,6 +113,7 @@ brew install ffmpeg
|
||||
#### 安装依赖
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -147,14 +150,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
|
||||
|
||||
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文TTS)
|
||||
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文 TTS)
|
||||
|
||||
3. 对于 UVR5(人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
|
||||
|
||||
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
|
||||
|
||||
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
|
||||
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
|
||||
|
||||
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
|
||||
|
||||
4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
|
||||
|
||||
@ -184,12 +186,12 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
|
||||
|
||||
## 微调与推理
|
||||
|
||||
### 打开WebUI
|
||||
### 打开 WebUI
|
||||
|
||||
#### 整合包用户
|
||||
|
||||
双击`go-webui.bat`或者使用`go-webui.ps1`
|
||||
若想使用V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
|
||||
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
|
||||
|
||||
#### 其他
|
||||
|
||||
@ -197,12 +199,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
|
||||
python webui.py <language(optional)>
|
||||
```
|
||||
|
||||
若想使用V1,则
|
||||
若想使用 V1,则
|
||||
|
||||
```bash
|
||||
python webui.py v1 <language(optional)>
|
||||
```
|
||||
或者在webUI内动态切换
|
||||
|
||||
或者在 webUI 内动态切换
|
||||
|
||||
### 微调
|
||||
|
||||
@ -215,25 +218,27 @@ python webui.py v1 <language(optional)>
|
||||
5. 校对标注
|
||||
6. 前往下一个窗口,点击训练
|
||||
|
||||
### 打开推理WebUI
|
||||
### 打开推理 WebUI
|
||||
|
||||
#### 整合包用户
|
||||
|
||||
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
|
||||
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
||||
|
||||
#### 其他
|
||||
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <language(optional)>
|
||||
```
|
||||
|
||||
或者
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
|
||||
|
||||
## V2发布说明
|
||||
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
||||
|
||||
## V2 发布说明
|
||||
|
||||
新特性:
|
||||
|
||||
@ -241,42 +246,41 @@ python webui.py
|
||||
|
||||
2. 更好的文本前端
|
||||
|
||||
3. 底模由2k小时扩展至5k小时
|
||||
3. 底模由 2k 小时扩展至 5k 小时
|
||||
|
||||
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
|
||||
|
||||
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
从v1环境迁移至v2
|
||||
从 v1 环境迁移至 v2
|
||||
|
||||
1. 需要pip安装requirements.txt更新环境
|
||||
1. 需要 pip 安装 requirements.txt 更新环境
|
||||
|
||||
2. 需要克隆github上的最新代码
|
||||
2. 需要克隆 github 上的最新代码
|
||||
|
||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下
|
||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
|
||||
|
||||
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
||||
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
||||
|
||||
## V3更新说明
|
||||
## V3 更新说明
|
||||
|
||||
新模型特点:
|
||||
|
||||
1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大)
|
||||
|
||||
2. GPT合成更稳定,重复漏字更少,也更容易跑出丰富情感
|
||||
2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
|
||||
|
||||
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
从v2环境迁移至v3
|
||||
从 v2 环境迁移至 v3
|
||||
|
||||
1. 需要pip安装requirements.txt更新环境
|
||||
1. 需要 pip 安装 requirements.txt 更新环境
|
||||
|
||||
2. 需要克隆github上的最新代码
|
||||
2. 需要克隆 github 上的最新代码
|
||||
|
||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些v3新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
|
||||
|
||||
如果想用音频超分功能缓解v3模型生成24k音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
|
||||
|
||||
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
## 待办事项清单
|
||||
|
||||
@ -299,16 +303,21 @@ python webui.py
|
||||
- [ ] 模型混合。
|
||||
|
||||
## (附加)命令行运行方式
|
||||
使用命令行打开UVR5的WebUI
|
||||
````
|
||||
|
||||
使用命令行打开 UVR5 的 WebUI
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
````
|
||||
```
|
||||
|
||||
<!-- 如果打不开浏览器,请按照下面的格式进行UVR处理,这是使用mdxnet进行音频处理的方式
|
||||
````
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
```` -->
|
||||
|
||||
这是使用命令行完成数据集的音频切分的方式
|
||||
````
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||
@ -316,17 +325,22 @@ python audio_slicer.py \
|
||||
--min_length <minimum_duration_of_each_subclip> \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
````
|
||||
这是使用命令行完成数据集ASR处理的方式(仅限中文)
|
||||
````
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
````
|
||||
通过Faster_Whisper进行ASR处理(除中文之外的ASR标记)
|
||||
```
|
||||
|
||||
这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
|
||||
|
||||
(没有进度条,GPU 性能可能会导致时间延迟)
|
||||
|
||||
(没有进度条,GPU性能可能会导致时间延迟)
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
启用自定义列表保存路径
|
||||
|
||||
## 致谢
|
||||
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
特别感谢以下项目和贡献者:
|
||||
|
||||
### 理论研究
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### 预训练模型
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### 推理用文本前端
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI 工具
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
@ -20,17 +20,17 @@
|
||||
|
||||
## 機能:
|
||||
|
||||
1. **Zero-Shot TTS:** たった5秒間の音声サンプルで、即座にテキストからその音声に変換できます。
|
||||
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。
|
||||
|
||||
2. **Few-Shot TTS:** わずか1分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
|
||||
2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
|
||||
|
||||
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
|
||||
|
||||
4. **WebUI ツール:** 統合されたツールは、音声と伴奏(BGM等)の分離、トレーニングセットの自動セグメンテーション、ASR(中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成やGPT/SoVITSモデルのトレーニング等を非常に簡単に行えます。
|
||||
4. **WebUI ツール:** 統合されたツールは、音声と伴奏(BGM 等)の分離、トレーニングセットの自動セグメンテーション、ASR(中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。
|
||||
|
||||
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
|
||||
|
||||
声の事前学習無しかつFew-Shotでトレーニングされたモデルのデモ:
|
||||
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
|
||||
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
||||
|
||||
@ -43,7 +43,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
- Python 3.9, PyTorch 2.0.1, CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
|
||||
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, CPUデバイス
|
||||
- Python 3.9, PyTorch 2.2.2, CPU デバイス
|
||||
|
||||
_注記: numba==0.56.4 は py<3.11 が必要です_
|
||||
|
||||
@ -61,22 +61,22 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
**注:MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練することを強く推奨します。**
|
||||
**注:Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。**
|
||||
|
||||
1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。
|
||||
2. `brew install ffmpeg` を実行してFFmpegをインストールします。
|
||||
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。
|
||||
2. `brew install ffmpeg` を実行して FFmpeg をインストールします。
|
||||
3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 手動インストール
|
||||
|
||||
#### FFmpegをインストールします。
|
||||
#### FFmpeg をインストールします。
|
||||
|
||||
##### Conda ユーザー
|
||||
|
||||
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
|
||||
|
||||
##### MacOS ユーザー
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -104,6 +105,7 @@ brew install ffmpeg
|
||||
#### 依存関係をインストールします
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirementx.txt
|
||||
```
|
||||
|
||||
@ -138,17 +140,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。
|
||||
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語TTSのみ)
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語 TTS のみ)
|
||||
|
||||
3. UVR5(ボーカル/伴奏(BGM等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
|
||||
3. UVR5(ボーカル/伴奏(BGM 等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
|
||||
|
||||
- UVR5でbs_roformerまたはmel_band_roformerモデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformerクラスのモデルとして認識されます。
|
||||
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。
|
||||
|
||||
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
|
||||
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
|
||||
|
||||
4. 中国語ASR(追加機能)の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
|
||||
4. 中国語 ASR(追加機能)の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
|
||||
|
||||
5. 英語または日本語のASR(追加機能)を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
|
||||
5. 英語または日本語の ASR(追加機能)を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
|
||||
|
||||
## データセット形式
|
||||
|
||||
@ -169,14 +171,15 @@ vocal_path|speaker_name|language|text
|
||||
```
|
||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||
```
|
||||
|
||||
## 微調整と推論
|
||||
|
||||
### WebUIを開く
|
||||
### WebUI を開く
|
||||
|
||||
#### 統合パッケージ利用者
|
||||
|
||||
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。
|
||||
V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
|
||||
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
|
||||
|
||||
#### その他
|
||||
|
||||
@ -184,12 +187,13 @@ V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
|
||||
python webui.py <言語(オプション)>
|
||||
```
|
||||
|
||||
V1に切り替えたい場合は
|
||||
V1 に切り替えたい場合は
|
||||
|
||||
```bash
|
||||
python webui.py v1 <言語(オプション)>
|
||||
```
|
||||
またはWebUIで手動でバージョンを切り替えてください。
|
||||
|
||||
または WebUI で手動でバージョンを切り替えてください。
|
||||
|
||||
### 微調整
|
||||
|
||||
@ -202,25 +206,27 @@ python webui.py v1 <言語(オプション)>
|
||||
5. ASR転写を校正する
|
||||
6. 次のタブに移動し、モデルを微調整する
|
||||
|
||||
### 推論WebUIを開く
|
||||
### 推論 WebUI を開く
|
||||
|
||||
#### 統合パッケージ利用者
|
||||
|
||||
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
|
||||
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
|
||||
|
||||
#### その他
|
||||
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <言語(オプション)>
|
||||
```
|
||||
|
||||
または
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
|
||||
|
||||
## V2リリースノート
|
||||
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
|
||||
|
||||
## V2 リリースノート
|
||||
|
||||
新機能:
|
||||
|
||||
@ -228,21 +234,21 @@ python webui.py
|
||||
|
||||
2. 最適化されたテキストフロントエンド
|
||||
|
||||
3. 事前学習済みモデルが2千時間から5千時間に拡張
|
||||
3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
|
||||
|
||||
4. 低品質の参照音声に対する合成品質の向上
|
||||
|
||||
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
V1環境からV2を使用するには:
|
||||
V1 環境から V2 を使用するには:
|
||||
|
||||
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
|
||||
|
||||
2. 最新のコードをgithubからクローン
|
||||
2. 最新のコードを github からクローン
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)からV2の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
|
||||
|
||||
中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
||||
中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
||||
|
||||
## V3 リリースノート
|
||||
|
||||
@ -250,19 +256,19 @@ V1環境からV2を使用するには:
|
||||
|
||||
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。
|
||||
|
||||
2. GPTモデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
|
||||
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
|
||||
|
||||
[詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
v2 環境から v3 を使用する方法:
|
||||
|
||||
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
|
||||
|
||||
2. GitHubから最新のコードをクローンします。
|
||||
2. GitHub から最新のコードをクローンします。
|
||||
|
||||
3. v3の事前学習済みモデル(s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
|
||||
3. v3 の事前学習済みモデル(s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
|
||||
|
||||
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
|
||||
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
|
||||
|
||||
## Todo リスト
|
||||
|
||||
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
|
||||
- [ ] モデルミックス
|
||||
|
||||
## (追加の) コマンドラインから実行する方法
|
||||
|
||||
コマンド ラインを使用して UVR5 の WebUI を開きます
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
@ -303,16 +314,21 @@ python audio_slicer.py \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
```
|
||||
|
||||
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASRマーキング)
|
||||
|
||||
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
|
||||
|
||||
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
カスタムリストの保存パスが有効になっています
|
||||
|
||||
## クレジット
|
||||
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
特に以下のプロジェクトと貢献者に感謝します:
|
||||
|
||||
### 理論研究
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### 事前学習モデル
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### 推論用テキストフロントエンド
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI ツール
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
@ -70,7 +70,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
|
||||
|
||||
##### MacOS 사용자
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -106,6 +107,7 @@ brew install ffmpeg
|
||||
#### 의존성 설치
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -147,9 +149,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
|
||||
|
||||
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
||||
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
||||
|
||||
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
||||
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
||||
|
||||
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
|
||||
|
||||
@ -195,6 +197,7 @@ V1으로 전환하려면,
|
||||
```bash
|
||||
python webui.py v1 <언어(옵션)>
|
||||
```
|
||||
|
||||
또는 WebUI에서 수동으로 버전을 전환하십시오.
|
||||
|
||||
### 미세 조정
|
||||
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <언어(옵션)>
|
||||
```
|
||||
|
||||
또는
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
|
||||
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
||||
|
||||
## V2 릴리스 노트
|
||||
@ -238,7 +243,7 @@ python webui.py
|
||||
|
||||
4. 저품질 참조 오디오에 대한 합성 품질 향상
|
||||
|
||||
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
V1 환경에서 V2를 사용하려면:
|
||||
|
||||
@ -248,7 +253,7 @@ V1 환경에서 V2를 사용하려면:
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
|
||||
|
||||
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
||||
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
||||
|
||||
## V3 릴리스 노트
|
||||
|
||||
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
|
||||
|
||||
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
|
||||
|
||||
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
v2 환경에서 v3 사용하기:
|
||||
|
||||
@ -268,8 +273,7 @@ v2 환경에서 v3 사용하기:
|
||||
|
||||
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
|
||||
|
||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
||||
|
||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
||||
|
||||
## 할 일 목록
|
||||
|
||||
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
|
||||
- [ ] 모델 블렌딩.
|
||||
|
||||
## (추가적인) 명령줄에서 실행하는 방법
|
||||
|
||||
명령줄을 사용하여 UVR5용 WebUI 열기
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
@ -311,16 +320,21 @@ python audio_slicer.py \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
```
|
||||
|
||||
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
||||
|
||||
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
||||
|
||||
## 감사의 말
|
||||
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
||||
|
||||
### 이론 연구
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### 사전 학습 모델
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### 추론용 텍스트 프론트엔드
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI 도구
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
@ -72,7 +72,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
|
||||
|
||||
##### MacOS Kullanıcıları
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -106,6 +107,7 @@ brew install ffmpeg
|
||||
#### Bağımlılıkları Yükleme
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -142,9 +144,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
|
||||
|
||||
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
|
||||
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
|
||||
|
||||
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
|
||||
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
|
||||
|
||||
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
|
||||
|
||||
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
|
||||
```bash
|
||||
python webui.py v1 <dil(isteğe bağlı)>
|
||||
```
|
||||
|
||||
veya WebUI'de manuel olarak sürüm değiştirin.
|
||||
|
||||
### İnce Ayar
|
||||
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
|
||||
```
|
||||
|
||||
VEYA
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
|
||||
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
||||
|
||||
## V2 Sürüm Notları
|
||||
@ -235,7 +240,7 @@ Yeni Özellikler:
|
||||
|
||||
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
|
||||
|
||||
[detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
@ -245,7 +250,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
|
||||
|
||||
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
|
||||
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
|
||||
|
||||
## V3 Sürüm Notları
|
||||
|
||||
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
|
||||
|
||||
[daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
### v2 ortamında v3 kullanımı:
|
||||
|
||||
@ -265,7 +270,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
|
||||
|
||||
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
|
||||
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
|
||||
|
||||
## Yapılacaklar Listesi
|
||||
|
||||
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
|
||||
- [ ] model karışımı
|
||||
|
||||
## (Ekstra) Komut satırından çalıştırma yöntemi
|
||||
|
||||
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
|
||||
@ -306,16 +316,21 @@ python audio_slicer.py \
|
||||
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
|
||||
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
|
||||
```
|
||||
|
||||
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
|
||||
```
|
||||
|
||||
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
|
||||
|
||||
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
```
|
||||
|
||||
Özel bir liste kaydetme yolu etkinleştirildi
|
||||
|
||||
## Katkı Verenler
|
||||
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
|
||||
|
||||
### Teorik Araştırma
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### Önceden Eğitilmiş Modeller
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### Tahmin İçin Metin Ön Ucu
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI Araçları
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
1
extra-req.txt
Normal file
1
extra-req.txt
Normal file
@ -0,0 +1 @@
|
||||
faster-whisper
|
@ -27,7 +27,8 @@
|
||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||
"%cd GPT-SoVITS\n",
|
||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||
"!pip install -r requirements.txt"
|
||||
"!pip install -r requirements.txt\n",
|
||||
"!pip install -r extra-req.txt --no-deps"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
55
install.sh
55
install.sh
@ -1,15 +1,17 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# 安装构建工具
|
||||
# Install build tools
|
||||
echo "Installing GCC..."
|
||||
conda install -c conda-forge gcc=14
|
||||
conda install -c conda-forge gcc=14 -y
|
||||
|
||||
echo "Installing G++..."
|
||||
conda install -c conda-forge gxx
|
||||
conda install -c conda-forge gxx -y
|
||||
|
||||
echo "Installing ffmpeg and cmake..."
|
||||
conda install ffmpeg cmake
|
||||
conda install ffmpeg cmake -y
|
||||
|
||||
# 设置编译环境
|
||||
# Set up build environment
|
||||
@ -18,7 +20,7 @@ export CC="$CONDA_PREFIX/bin/gcc"
|
||||
export CXX="$CONDA_PREFIX/bin/g++"
|
||||
|
||||
echo "Checking for CUDA installation..."
|
||||
if command -v nvidia-smi &> /dev/null; then
|
||||
if command -v nvidia-smi &>/dev/null; then
|
||||
USE_CUDA=true
|
||||
echo "CUDA found."
|
||||
else
|
||||
@ -26,7 +28,6 @@ else
|
||||
USE_CUDA=false
|
||||
fi
|
||||
|
||||
|
||||
if [ "$USE_CUDA" = false ]; then
|
||||
echo "Checking for ROCm installation..."
|
||||
if [ -d "/opt/rocm" ]; then
|
||||
@ -48,7 +49,7 @@ fi
|
||||
if [ "$USE_CUDA" = true ]; then
|
||||
echo "Installing PyTorch with CUDA support..."
|
||||
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
|
||||
elif [ "$USE_ROCM" = true ] ; then
|
||||
elif [ "$USE_ROCM" = true ]; then
|
||||
echo "Installing PyTorch with ROCm support..."
|
||||
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
|
||||
else
|
||||
@ -56,21 +57,53 @@ else
|
||||
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
|
||||
fi
|
||||
|
||||
|
||||
echo "Installing Python dependencies from requirements.txt..."
|
||||
|
||||
# 刷新环境
|
||||
# Refresh environment
|
||||
hash -r
|
||||
|
||||
# pyopenjtalk Installation
|
||||
conda install jq -y
|
||||
|
||||
OS_TYPE=$(uname)
|
||||
|
||||
PACKAGE_NAME="pyopenjtalk"
|
||||
|
||||
VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
|
||||
|
||||
wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
|
||||
|
||||
TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
|
||||
DIR_NAME="${TAR_FILE%.tar.gz}"
|
||||
|
||||
tar -xzf "$TAR_FILE"
|
||||
rm "$TAR_FILE"
|
||||
|
||||
CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
|
||||
|
||||
if [[ "$OS_TYPE" == "darwin"* ]]; then
|
||||
sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
|
||||
else
|
||||
sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
|
||||
fi
|
||||
|
||||
tar -czf "$TAR_FILE" "$DIR_NAME"
|
||||
|
||||
pip install "$TAR_FILE"
|
||||
|
||||
rm -rf "$TAR_FILE" "$DIR_NAME"
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
|
||||
pip install -r requirements.txt
|
||||
|
||||
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then
|
||||
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
||||
echo "Update to WSL compatible runtime lib..."
|
||||
location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
|
||||
cd ${location}/torch/lib/
|
||||
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
|
||||
cd "${location}"/torch/lib/ || exit
|
||||
rm libhsa-runtime64.so*
|
||||
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
|
||||
fi
|
||||
|
||||
echo "Installation completed successfully!"
|
||||
|
||||
|
@ -3,7 +3,7 @@ scipy
|
||||
tensorboard
|
||||
librosa==0.9.2
|
||||
numba==0.56.4
|
||||
pytorch-lightning
|
||||
pytorch-lightning>2.0
|
||||
gradio>=4.0,<=4.24.0
|
||||
ffmpeg-python
|
||||
onnxruntime; sys_platform == 'darwin'
|
||||
@ -26,7 +26,6 @@ jieba_fast
|
||||
jieba
|
||||
split-lang
|
||||
fast_langdetect>=0.3.0
|
||||
Faster_Whisper
|
||||
wordsegment
|
||||
rotary_embedding_torch
|
||||
ToJyutping
|
||||
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
|
||||
fastapi<0.112.2
|
||||
x_transformers
|
||||
torchmetrics<=1.5
|
||||
attrdict
|
||||
pydantic<=2.10.6
|
||||
ctranslate2>=4.0,<5
|
||||
huggingface_hub>=0.13
|
||||
tokenizers>=0.13,<1
|
||||
av>=11
|
||||
tqdm
|
||||
|
10
webui.py
10
webui.py
@ -298,9 +298,9 @@ def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits
|
||||
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language)
|
||||
else:
|
||||
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
|
||||
#####v3暂不支持加速推理
|
||||
if version=="v3":
|
||||
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
|
||||
# #####v3暂不支持加速推理
|
||||
# if version=="v3":
|
||||
# cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
|
||||
if p_tts_inference is None:
|
||||
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
|
||||
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
|
||||
@ -849,8 +849,8 @@ def switch_version(version_):
|
||||
{'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \
|
||||
{'__type__': 'update', "visible": True if version!="v3"else False}, \
|
||||
{'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \
|
||||
{'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \
|
||||
{'__type__': 'update', "visible": True if version== "v3" else False}
|
||||
{'__type__': 'update', "interactive": True, "value": False}, \
|
||||
{'__type__': 'update', "visible": True if version== "v3" else False} # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer
|
||||
|
||||
if os.path.exists('GPT_SoVITS/text/G2PWModel'):...
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user