Compare commits

...

12 Commits

Author SHA1 Message Date
2h0ng
b3b812b25b
Merge f6bdcf68462870d887fe69a5c916f2f4c7512805 into 9da7e17efe05041e31d3c3f42c8730ae890397f2 2025-04-02 04:19:19 +09:00
RVC-Boss
9da7e17efe
Add files via upload 2025-04-01 18:44:35 +08:00
RVC-Boss
b0de354c63
Update Changelog_CN.md 2025-04-01 17:21:48 +08:00
RVC-Boss
41090e5a7c
Update g2pw url 2025-04-01 17:15:52 +08:00
RVC-Boss
605b380114
修复模型加载异步逻辑
修复模型加载异步逻辑
2025-04-01 16:50:54 +08:00
RVC-Boss
9f8d455130
支持v3并行推理
support v3 models batch inference
2025-04-01 16:31:48 +08:00
RVC-Boss
7abae557fb
删除加载v3sovits模型缺少enc_q告警
删除加载v3sovits模型缺少enc_q告警
2025-04-01 16:31:15 +08:00
RVC-Boss
6a60e5edb1
v3解锁并行推理;修复模型加载异步逻辑
v3解锁并行推理;修复模型加载异步逻辑
2025-04-01 16:29:52 +08:00
RVC-Boss
28bdff356f
fix https://github.com/RVC-Boss/GPT-SoVITS/issues/2250
fix https://github.com/RVC-Boss/GPT-SoVITS/issues/2250
2025-04-01 10:34:02 +08:00
ChasonJiang
03b662a769
为sovits_v3 适配并行推理 (#2241)
* 为sovits_v3 适配并行推理

* 清理无用代码
2025-03-31 11:56:05 +08:00
XXXXRT666
6c468583c5
Fix dependency-related issues via requirements update (#2236)
* Update requirements.txt

* Create constraints.txt

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* pyopenjtalk and onnx fix

* Update requirements.txt

* Update requirements.txt

* Update install.sh

* update shell install.sh

* update docs

* Update Install.sh

* fix bugs

* Update .gitignore

* Update .gitignore

* Update install.sh

* Update install.sh

* Update extra-req.txt

* Update requirements.txt
2025-03-31 11:27:12 +08:00
2h0ng
f6bdcf6846
Create SECURITY.md
This security page is created for better facilitate vulnerability report process in a private and collaborative manner.
2024-11-21 01:07:16 -05:00
20 changed files with 811 additions and 269 deletions

178
.gitignore vendored
View File

@ -18,5 +18,183 @@ TEMP
weight.json weight.json
ffmpeg* ffmpeg*
ffprobe* ffprobe*
cfg.json
speakers.json
ref_audios
tools/AP_BWE_main/24kto48k/* tools/AP_BWE_main/24kto48k/*
!tools/AP_BWE_main/24kto48k/readme.txt !tools/AP_BWE_main/24kto48k/readme.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc

View File

@ -3,7 +3,7 @@ import math
import os, sys, gc import os, sys, gc
import random import random
import traceback import traceback
import time
import torchaudio import torchaudio
from tqdm import tqdm from tqdm import tqdm
now_dir = os.getcwd() now_dir = os.getcwd()
@ -462,8 +462,6 @@ class TTS:
n_speakers=self.configs.n_speakers, n_speakers=self.configs.n_speakers,
**kwargs **kwargs
) )
if hasattr(vits_model, "enc_q"):
del vits_model.enc_q
self.configs.is_v3_synthesizer = False self.configs.is_v3_synthesizer = False
else: else:
vits_model = SynthesizerTrnV3( vits_model = SynthesizerTrnV3(
@ -474,7 +472,8 @@ class TTS:
) )
self.configs.is_v3_synthesizer = True self.configs.is_v3_synthesizer = True
self.init_bigvgan() self.init_bigvgan()
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
del vits_model.enc_q
if if_lora_v3==False: if if_lora_v3==False:
print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}") print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}")
@ -908,11 +907,14 @@ class TTS:
split_bucket = False split_bucket = False
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理")) print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
if split_bucket and speed_factor==1.0: if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
print(i18n("分桶处理模式已开启")) print(i18n("分桶处理模式已开启"))
elif speed_factor!=1.0: elif speed_factor!=1.0:
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理")) print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
split_bucket = False split_bucket = False
elif self.configs.is_v3_synthesizer and parallel_infer:
print(i18n("当开启并行推理模式时SoVits V3模型不支持分桶处理已自动关闭分桶处理"))
split_bucket = False
else: else:
print(i18n("分桶处理模式已关闭")) print(i18n("分桶处理模式已关闭"))
@ -936,7 +938,7 @@ class TTS:
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()") raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
###### setting reference audio and prompt text preprocessing ######## ###### setting reference audio and prompt text preprocessing ########
t0 = ttime() t0 = time.perf_counter()
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]): if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
if not os.path.exists(ref_audio_path): if not os.path.exists(ref_audio_path):
raise ValueError(f"{ref_audio_path} not exists") raise ValueError(f"{ref_audio_path} not exists")
@ -975,7 +977,7 @@ class TTS:
###### text preprocessing ######## ###### text preprocessing ########
t1 = ttime() t1 = time.perf_counter()
data:list = None data:list = None
if not return_fragment: if not return_fragment:
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version) data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
@ -1027,7 +1029,7 @@ class TTS:
return batch[0] return batch[0]
t2 = ttime() t2 = time.perf_counter()
try: try:
print("############ 推理 ############") print("############ 推理 ############")
###### inference ###### ###### inference ######
@ -1036,7 +1038,7 @@ class TTS:
audio = [] audio = []
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000 output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
for item in data: for item in data:
t3 = ttime() t3 = time.perf_counter()
if return_fragment: if return_fragment:
item = make_batch(item) item = make_batch(item)
if item is None: if item is None:
@ -1071,7 +1073,7 @@ class TTS:
max_len=max_len, max_len=max_len,
repetition_penalty=repetition_penalty, repetition_penalty=repetition_penalty,
) )
t4 = ttime() t4 = time.perf_counter()
t_34 += t4 - t3 t_34 += t4 - t3
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]] refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
@ -1094,6 +1096,7 @@ class TTS:
print(f"############ {i18n('合成音频')} ############") print(f"############ {i18n('合成音频')} ############")
if not self.configs.is_v3_synthesizer: if not self.configs.is_v3_synthesizer:
if speed_factor == 1.0: if speed_factor == 1.0:
print(f"{i18n('并行合成中')}...")
# ## vits并行推理 method 2 # ## vits并行推理 method 2
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)] pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
upsample_rate = math.prod(self.vits_model.upsample_rates) upsample_rate = math.prod(self.vits_model.upsample_rates)
@ -1118,17 +1121,28 @@ class TTS:
audio_fragment audio_fragment
) ###试试重建不带上prompt部分 ) ###试试重建不带上prompt部分
else: else:
for i, idx in enumerate(tqdm(idx_list)): if parallel_infer:
phones = batch_phones[i].unsqueeze(0).to(self.configs.device) print(f"{i18n('并行合成中')}...")
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次 audio_fragments = self.v3_synthesis_batched_infer(
audio_fragment = self.v3_synthesis( idx_list,
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps pred_semantic_list,
batch_phones,
speed=speed_factor,
sample_steps=sample_steps
)
batch_audio_fragment.extend(audio_fragments)
else:
for i, idx in enumerate(tqdm(idx_list)):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
audio_fragment = self.v3_synthesis(
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
)
batch_audio_fragment.append(
audio_fragment
) )
batch_audio_fragment.append(
audio_fragment
)
t5 = ttime() t5 = time.perf_counter()
t_45 += t5 - t4 t_45 += t5 - t4
if return_fragment: if return_fragment:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4)) print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
@ -1219,13 +1233,13 @@ class TTS:
if super_sampling: if super_sampling:
print(f"############ {i18n('音频超采样')} ############") print(f"############ {i18n('音频超采样')} ############")
t1 = ttime() t1 = time.perf_counter()
self.init_sr_model() self.init_sr_model()
if not self.sr_model_not_exist: if not self.sr_model_not_exist:
audio,sr=self.sr_model(audio.unsqueeze(0),sr) audio,sr=self.sr_model(audio.unsqueeze(0),sr)
max_audio=np.abs(audio).max() max_audio=np.abs(audio).max()
if max_audio > 1: audio /= max_audio if max_audio > 1: audio /= max_audio
t2 = ttime() t2 = time.perf_counter()
print(f"超采样用时:{t2-t1:.3f}s") print(f"超采样用时:{t2-t1:.3f}s")
else: else:
audio = audio.cpu().numpy() audio = audio.cpu().numpy()
@ -1260,7 +1274,7 @@ class TTS:
ref_audio = ref_audio.mean(0).unsqueeze(0) ref_audio = ref_audio.mean(0).unsqueeze(0)
if ref_sr!=24000: if ref_sr!=24000:
ref_audio=resample(ref_audio, ref_sr, self.configs.device) ref_audio=resample(ref_audio, ref_sr, self.configs.device)
# print("ref_audio",ref_audio.abs().mean())
mel2 = mel_fn(ref_audio) mel2 = mel_fn(ref_audio)
mel2 = norm_spec(mel2) mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2]) T_min = min(mel2.shape[2], fea_ref.shape[2])
@ -1285,15 +1299,156 @@ class TTS:
cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0) cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
cfm_res = cfm_res[:, :, mel2.shape[2]:] cfm_res = cfm_res[:, :, mel2.shape[2]:]
mel2 = cfm_res[:, :, -T_min:]
mel2 = cfm_res[:, :, -T_min:]
fea_ref = fea_todo_chunk[:, :, -T_min:] fea_ref = fea_todo_chunk[:, :, -T_min:]
cfm_resss.append(cfm_res) cfm_resss.append(cfm_res)
cmf_res = torch.cat(cfm_resss, 2) cfm_res = torch.cat(cfm_resss, 2)
cmf_res = denorm_spec(cmf_res) cfm_res = denorm_spec(cfm_res)
with torch.inference_mode(): with torch.inference_mode():
wav_gen = self.bigvgan_model(cmf_res) wav_gen = self.bigvgan_model(cfm_res)
audio=wav_gen[0][0]#.cpu().detach().numpy() audio=wav_gen[0][0]#.cpu().detach().numpy()
return audio return audio
def v3_synthesis_batched_infer(self,
idx_list:List[int],
semantic_tokens_list:List[torch.Tensor],
batch_phones:List[torch.Tensor],
speed:float=1.0,
sample_steps:int=32
)->List[torch.Tensor]:
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
ref_audio:torch.Tensor = self.prompt_cache["raw_audio"]
ref_sr = self.prompt_cache["raw_sr"]
ref_audio=ref_audio.to(self.configs.device).float()
if (ref_audio.shape[0] == 2):
ref_audio = ref_audio.mean(0).unsqueeze(0)
if ref_sr!=24000:
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
mel2 = mel_fn(ref_audio)
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min]
if (T_min > 468):
mel2 = mel2[:, :, -468:]
fea_ref = fea_ref[:, :, -468:]
T_min = 468
chunk_len = 934 - T_min
mel2=mel2.to(self.precision)
# #### batched inference
overlapped_len = 12
feat_chunks = []
feat_lens = []
feat_list = []
for i, idx in enumerate(idx_list):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次
feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
feat_list.append(feat)
feat_lens.append(feat.shape[2])
feats = torch.cat(feat_list, 2)
feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0)
pos = 0
padding_len = 0
while True:
if pos ==0:
chunk = feats_padded[:, :, pos:pos + chunk_len]
else:
pos = pos - overlapped_len
chunk = feats_padded[:, :, pos:pos + chunk_len]
pos += chunk_len
if (chunk.shape[-1] == 0): break
# padding for the last chunk
padding_len = chunk_len - chunk.shape[2]
if padding_len != 0:
chunk = F.pad(chunk, (0,padding_len), "constant", 0)
feat_chunks.append(chunk)
feat_chunks = torch.cat(feat_chunks, 0)
bs = feat_chunks.shape[0]
fea_ref = fea_ref.repeat(bs,1,1)
fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
pred_spec = pred_spec[:, :, -chunk_len:]
dd = pred_spec.shape[1]
pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
# pred_spec = pred_spec[..., :-padding_len]
pred_spec = denorm_spec(pred_spec)
with torch.no_grad():
wav_gen = self.bigvgan_model(pred_spec)
audio = wav_gen[0][0]#.cpu().detach().numpy()
audio_fragments = []
upsample_rate = 256
pos = 0
while pos < audio.shape[-1]:
audio_fragment = audio[pos:pos+chunk_len*upsample_rate]
audio_fragments.append(audio_fragment)
pos += chunk_len*upsample_rate
audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate)
audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate]
audio_fragments = []
for feat_len in feat_lens:
audio_fragment = audio[:feat_len*upsample_rate]
audio_fragments.append(audio_fragment)
audio = audio[feat_len*upsample_rate:]
return audio_fragments
def sola_algorithm(self,
audio_fragments:List[torch.Tensor],
overlap_len:int,
):
for i in range(len(audio_fragments)-1):
f1 = audio_fragments[i]
f2 = audio_fragments[i+1]
w1 = f1[-overlap_len:]
w2 = f2[:overlap_len]
assert w1.shape == w2.shape
corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1]
idx = corr.argmax()
f1_ = f1[:-(overlap_len-idx)]
audio_fragments[i] = f1_
f2_ = f2[idx:]
window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype)
f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):]
audio_fragments[i+1] = f2_
return torch.cat(audio_fragments, 0)

View File

@ -238,7 +238,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
else: else:
visible_sample_steps=False visible_sample_steps=False
visible_inp_refs=True visible_inp_refs=True
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False} yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
dict_s2 = load_sovits_new(sovits_path) dict_s2 = load_sovits_new(sovits_path)
hps = dict_s2["config"] hps = dict_s2["config"]
@ -294,6 +294,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
# torch.save(vq_model.state_dict(),"merge_win.pth") # torch.save(vq_model.state_dict(),"merge_win.pth")
vq_model.eval() vq_model.eval()
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
with open("./weight.json")as f: with open("./weight.json")as f:
data=f.read() data=f.read()
data=json.loads(data) data=json.loads(data)
@ -877,7 +878,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row(): with gr.Row():
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频超过会报错"), type="filepath", scale=13) inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频超过会报错"), type="filepath", scale=13)
with gr.Column(scale=13): with gr.Column(scale=13):
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式使用了会报错。"), value=False, interactive=True, show_label=True,scale=1) ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式使用了会报错。"), value=False, interactive=True if model_version!="v3"else False, show_label=True,scale=1)
gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))) gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")))
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1) prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
with gr.Column(scale=14): with gr.Column(scale=14):
@ -915,7 +916,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
# phoneme=gr.Textbox(label=i18n("音素框"), value="") # phoneme=gr.Textbox(label=i18n("音素框"), value="")
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary") # get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
with gr.Row(): with gr.Row():
inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25) inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size='lg', scale=25)
output = gr.Audio(label=i18n("输出的语音"), scale=14) output = gr.Audio(label=i18n("输出的语音"), scale=14)
inference_button.click( inference_button.click(
@ -923,7 +924,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider], [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider],
[output], [output],
) )
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox]) SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox,inference_button])
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], []) GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))

View File

@ -41,12 +41,13 @@ gpt_path = os.environ.get("gpt_path", None)
sovits_path = os.environ.get("sovits_path", None) sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None) cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
bert_path = os.environ.get("bert_path", None) bert_path = os.environ.get("bert_path", None)
version=os.environ.get("version","v2") version=model_version=os.environ.get("version","v2")
import gradio as gr import gradio as gr
from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR
from TTS_infer_pack.text_segmentation_method import get_method from TTS_infer_pack.text_segmentation_method import get_method
from tools.i18n.i18n import I18nAuto, scan_language_list from tools.i18n.i18n import I18nAuto, scan_language_list
from inference_webui import DictToAttrRecursive
language=os.environ.get("language","Auto") language=os.environ.get("language","Auto")
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -221,19 +222,16 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root) SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
from process_ckpt import get_sovits_version_from_path_fast from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new
def change_sovits_weights(sovits_path,prompt_language=None,text_language=None): def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
global version, dict_language global version, model_version, dict_language,if_lora_v3
version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path) version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path)
# print(sovits_path,version, model_version, if_lora_v3)
if if_lora_v3 and not os.path.exists(path_sovits_v3): if if_lora_v3 and not os.path.exists(path_sovits_v3):
info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重") info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
gr.Warning(info) gr.Warning(info)
raise FileExistsError(info) raise FileExistsError(info)
dict_language = dict_language_v1 if version =='v1' else dict_language_v2
tts_pipeline.init_vits_weights(sovits_path)
dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2
if prompt_language is not None and text_language is not None: if prompt_language is not None and text_language is not None:
if prompt_language in list(dict_language.keys()): if prompt_language in list(dict_language.keys()):
prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language} prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
@ -251,8 +249,11 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
else: else:
visible_sample_steps=False visible_sample_steps=False
visible_inp_refs=True visible_inp_refs=True
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False} #prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
tts_pipeline.init_vits_weights(sovits_path)
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
with open("./weight.json")as f: with open("./weight.json")as f:
data=f.read() data=f.read()
data=json.loads(data) data=json.loads(data)
@ -279,14 +280,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(value=i18n("*请上传并填写参考信息")) gr.Markdown(value=i18n("*请上传并填写参考信息"))
with gr.Row(): with gr.Row():
inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频超过会报错)"), type="filepath") inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频超过会报错)"), type="filepath")
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple") inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple", visible=True if model_version!="v3"else False)
prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2) prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
with gr.Row(): with gr.Row():
prompt_language = gr.Dropdown( prompt_language = gr.Dropdown(
label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文") label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
) )
with gr.Column(): with gr.Column():
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True) ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True if model_version!="v3"else False, show_label=True)
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")) gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))
with gr.Column(): with gr.Column():
@ -355,7 +356,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
[output, seed], [output, seed],
) )
stop_infer.click(tts_pipeline.stop, [], []) stop_infer.click(tts_pipeline.stop, [], [])
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language]) SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,inference_button])#
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], []) GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
with gr.Group(): with gr.Group():

View File

@ -429,26 +429,25 @@ def train_and_evaluate(
# scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)}) # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
# scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)}) # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
# scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)}) # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
image_dict = { image_dict=None
"slice/mel_org": utils.plot_spectrogram_to_numpy( try:###Some people installed the wrong version of matplotlib.
y_mel[0].data.cpu().numpy() image_dict = {
), "slice/mel_org": utils.plot_spectrogram_to_numpy(
"slice/mel_gen": utils.plot_spectrogram_to_numpy( y_mel[0].data.cpu().numpy()
y_hat_mel[0].data.cpu().numpy() ),
), "slice/mel_gen": utils.plot_spectrogram_to_numpy(
"all/mel": utils.plot_spectrogram_to_numpy( y_hat_mel[0].data.cpu().numpy()
mel[0].data.cpu().numpy() ),
), "all/mel": utils.plot_spectrogram_to_numpy(
"all/stats_ssl": utils.plot_spectrogram_to_numpy( mel[0].data.cpu().numpy()
stats_ssl[0].data.cpu().numpy() ),
), "all/stats_ssl": utils.plot_spectrogram_to_numpy(
} stats_ssl[0].data.cpu().numpy()
utils.summarize( ),
writer=writer, }
global_step=global_step, except:pass
images=image_dict, if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,)
scalars=scalar_dict, else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,)
)
global_step += 1 global_step += 1
if epoch % hps.train.save_every_epoch == 0 and rank == 0: if epoch % hps.train.save_every_epoch == 0 and rank == 0:
if hps.train.if_save_latest == 0: if hps.train.if_save_latest == 0:

View File

@ -58,7 +58,7 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
extract_dir = os.path.join(parent_directory,"G2PWModel_1.1") extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory,"G2PWModel") extract_dir_new = os.path.join(parent_directory,"G2PWModel")
print("Downloading g2pw model...") print("Downloading g2pw model...")
modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip" modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r: with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status() r.raise_for_status()
with open(zip_dir, 'wb') as f: with open(zip_dir, 'wb') as f:

View File

@ -1,42 +1,37 @@
{ {
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
],
"metadata": { "metadata": {
"id": "himHYZmra7ix" "id": "himHYZmra7ix"
} },
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "e9b7iFV3dm1f" "id": "e9b7iFV3dm1f"
}, },
"outputs": [],
"source": [ "source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n", "%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n", "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r extra-req.txt --no-deps\n",
"!pip install -r requirements.txt" "!pip install -r requirements.txt"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [ "source": [
"# @title Download pretrained models 下载预训练模型\n", "# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -53,16 +48,16 @@
"!git clone https://huggingface.co/Delik/uvr5_weights\n", "!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n", "!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
], ]
"metadata": {
"id": "0NgxXg5sjv7z",
"cellView": "form"
},
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"outputs": [],
"source": [ "source": [
"#@title Create folder models 创建文件夹模型\n", "#@title Create folder models 创建文件夹模型\n",
"import os\n", "import os\n",
@ -77,16 +72,16 @@
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n", " print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
"\n", "\n",
"print(\"All folders have been created. (所有文件夹均已创建。)\")" "print(\"All folders have been created. (所有文件夹均已创建。)\")"
], ]
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"outputs": [],
"source": [ "source": [
"import requests\n", "import requests\n",
"import zipfile\n", "import zipfile\n",
@ -124,29 +119,35 @@
" shutil.move(source_path, destination_path)\n", " shutil.move(source_path, destination_path)\n",
"\n", "\n",
"print(f'Model downloaded. (模型已下载。)')" "print(f'Model downloaded. (模型已下载。)')"
], ]
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [ "source": [
"# @title launch WebUI 启动WebUI\n", "# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n", "!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n", "%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py" "!/usr/local/bin/python webui.py"
], ]
"metadata": {
"id": "4oRGUzkrk8C7",
"cellView": "form"
},
"execution_count": null,
"outputs": []
} }
] ],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
} }

View File

@ -1,6 +1,5 @@
<div align="center"> <div align="center">
<h1>GPT-SoVITS-WebUI</h1> <h1>GPT-SoVITS-WebUI</h1>
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br> A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
@ -77,6 +76,7 @@ bash install.sh
```bash ```bash
conda create -n GPTSoVits python=3.9 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only) Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
##### MacOS Users ##### MacOS Users
```bash ```bash
brew install ffmpeg brew install ffmpeg
``` ```
@ -112,6 +113,7 @@ brew install ffmpeg
#### Install Dependences #### Install Dependences
```bash ```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -150,9 +152,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`. 3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class. - If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair. - The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`. 4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
@ -200,6 +202,7 @@ if you want to switch to V1,then
```bash ```bash
python webui.py v1 <language(optional)> python webui.py v1 <language(optional)>
``` ```
Or maunally switch version in WebUI Or maunally switch version in WebUI
### Finetune ### Finetune
@ -217,18 +220,20 @@ Or maunally switch version in WebUI
#### Integrated Package Users #### Integrated Package Users
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference` Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
#### Others #### Others
```bash ```bash
python GPT_SoVITS/inference_webui.py <language(optional)> python GPT_SoVITS/inference_webui.py <language(optional)>
``` ```
OR OR
```bash ```bash
python webui.py python webui.py
``` ```
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference` then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
## V2 Release Notes ## V2 Release Notes
@ -243,7 +248,7 @@ New Features:
4. Improved synthesis quality for low-quality reference audio 4. Improved synthesis quality for low-quality reference audio
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v2 from v1 environment: Use v2 from v1 environment:
@ -253,7 +258,7 @@ Use v2 from v1 environment:
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`. 3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.
## V3 Release Notes ## V3 Release Notes
@ -263,7 +268,7 @@ New Features:
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression. 2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v3 from v2 environment: Use v3 from v2 environment:
@ -273,8 +278,7 @@ Use v3 from v2 environment:
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`. 3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt) additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
## Todo List ## Todo List
@ -297,15 +301,20 @@ Use v3 from v2 environment:
- [ ] model mix - [ ] model mix
## (Additional) Method for running from the command line ## (Additional) Method for running from the command line
Use the command line to open the WebUI for UVR5 Use the command line to open the WebUI for UVR5
``` ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing <!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
``` ```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` --> ``` -->
This is how the audio segmentation of the dataset is done using the command line This is how the audio segmentation of the dataset is done using the command line
``` ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \ --input_path "<path_to_original_audio_file_or_directory>" \
@ -315,16 +324,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> --min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve> --hop_size <step_size_for_computing_volume_curve>
``` ```
This is how dataset ASR processing is done using the command line(Only Chinese) This is how dataset ASR processing is done using the command line(Only Chinese)
``` ```
python tools/asr/funasr_asr.py -i <input> -o <output> python tools/asr/funasr_asr.py -i <input> -o <output>
``` ```
ASR processing is performed through Faster_Whisper(ASR marking except Chinese) ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
(No progress bars, GPU performance may cause time delays) (No progress bars, GPU performance may cause time delays)
``` ```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision> python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
``` ```
A custom list save path is enabled A custom list save path is enabled
## Credits ## Credits
@ -332,6 +346,7 @@ A custom list save path is enabled
Special thanks to the following projects and contributors: Special thanks to the following projects and contributors:
### Theoretical Research ### Theoretical Research
- [ar-vits](https://github.com/innnky/ar-vits) - [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits) - [vits](https://github.com/jaywalnut310/vits)
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Pretrained Models ### Pretrained Models
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Text Frontend for Inference ### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang) - [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW) - [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Tools ### WebUI Tools
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer) - [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix) - [SubFix](https://github.com/cronrpc/SubFix)

45
SECURITY.md Normal file
View File

@ -0,0 +1,45 @@
## Security Policy
### Supported Versions
We actively support the following versions:
| Version | Supported |
| ---------- | --------- |
| 20240821v2 | ✅ |
Please ensure you are using the latest version to receive security updates and fixes.
### Reporting a Vulnerability
If you discover a security vulnerability in GPT-SoVITS-WebUI, we encourage you to report it responsibly via GitHub Security Advisories. Here's how you can do it:
1. **Open a GitHub Security Advisory**:
- Navigate to the repository's [Security tab](https://github.com/RVC-Boss/GPT-SoVITS/security).
- Select "Report a vulnerability."
- Provide the following details:
- A detailed description of the vulnerability.
- Steps to reproduce the issue (if applicable).
- Any potential impact and severity level.
2. **Response Time**: We will acknowledge your report within 72 hours and provide an estimated timeline for resolution.
3. **Responsible Disclosure**: We request that you do not publicly disclose the vulnerability until it has been resolved. If necessary, we will work with you to determine an appropriate disclosure timeline.
### Best Practices for Users
To maintain security while using GPT-SoVITS-WebUI:
- **Update Regularly**: Always use the latest version to ensure you're benefiting from security updates.
- **Environment Isolation**: Run the application in isolated environments (e.g., Docker, Conda environments) to reduce potential risks.
- **Data Privacy**: Avoid using sensitive or private data unless necessary, as models are not encrypted by default.
### Security Practices
To ensure a secure codebase, we follow these practices:
- **Dependency Monitoring**: Regular updates and audits of third-party dependencies.
- **Code Reviews**: All new contributions undergo thorough reviews to ensure they meet our security standards.
- **Static Analysis**: Automated tools are used to identify common vulnerabilities in the code.
### Acknowledgments
We thank the community for reporting issues and helping us improve security. If your vulnerability report leads to a fix, we would be happy to acknowledge your contribution in the release notes (if desired).

View File

@ -1,23 +1,10 @@
{ {
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"id": "view-in-github", "colab_type": "text",
"colab_type": "text" "id": "view-in-github"
}, },
"source": [ "source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@ -25,18 +12,20 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [
"环境配置 environment"
],
"metadata": { "metadata": {
"id": "_o6a8GS2lWQM" "id": "_o6a8GS2lWQM"
} },
"source": [
"环境配置 environment"
]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "e9b7iFV3dm1f" "id": "e9b7iFV3dm1f"
}, },
"outputs": [],
"source": [ "source": [
"!pip install -q condacolab\n", "!pip install -q condacolab\n",
"# Setting up condacolab and installing packages\n", "# Setting up condacolab and installing packages\n",
@ -47,13 +36,17 @@
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n", "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
"%cd -q /content/GPT-SoVITS\n", "%cd -q /content/GPT-SoVITS\n",
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n", "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
"!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
"!/usr/local/bin/pip install -r requirements.txt" "!/usr/local/bin/pip install -r requirements.txt"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [ "source": [
"# @title Download pretrained models 下载预训练模型\n", "# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n", "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -71,27 +64,35 @@
"!git clone https://huggingface.co/Delik/uvr5_weights\n", "!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n", "!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/" "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
], ]
"metadata": {
"id": "0NgxXg5sjv7z"
},
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [ "source": [
"# @title launch WebUI 启动WebUI\n", "# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n", "!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n", "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n", "%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py" "!/usr/local/bin/python webui.py"
], ]
"metadata": {
"id": "4oRGUzkrk8C7"
},
"execution_count": null,
"outputs": []
} }
] ],
"metadata": {
"accelerator": "GPU",
"colab": {
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
} }

View File

@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122 修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
修复v3sovits未传参以支持调节语速 修复v3sovits未传参以支持调节语速
### 202503
修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
修复其他若干bug
重点更新:
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
2-整合包修复onnxruntime GPU推理的支持影响1g2pw有个onnx模型原先是CPU推理现在用GPU显著降低推理的CPU瓶颈 2foxjoy去混响模型现在可使用GPU推理

View File

@ -76,6 +76,7 @@ bash install.sh
```bash ```bash
conda create -n GPTSoVits python=3.9 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -101,9 +102,10 @@ conda install -c conda-forge 'ffmpeg<7'
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。 下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语TTS) 安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
##### MacOS 用户 ##### MacOS 用户
```bash ```bash
brew install ffmpeg brew install ffmpeg
``` ```
@ -111,6 +113,7 @@ brew install ffmpeg
#### 安装依赖 #### 安装依赖
```bash ```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -147,14 +150,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。 1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。仅限中文TTS 2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文 TTS
3. 对于 UVR5人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。 3. 对于 UVR5人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
- 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。 - 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer``bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml` 也是一对。
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer``bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml` 也是一对。
4. 对于中文 ASR额外功能从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。 4. 对于中文 ASR额外功能从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
@ -184,12 +186,12 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
## 微调与推理 ## 微调与推理
### 打开WebUI ### 打开 WebUI
#### 整合包用户 #### 整合包用户
双击`go-webui.bat`或者使用`go-webui.ps1` 双击`go-webui.bat`或者使用`go-webui.ps1`
若想使用V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1` 若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
#### 其他 #### 其他
@ -197,12 +199,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
python webui.py <language(optional)> python webui.py <language(optional)>
``` ```
若想使用V1,则 若想使用 V1,则
```bash ```bash
python webui.py v1 <language(optional)> python webui.py v1 <language(optional)>
``` ```
或者在webUI内动态切换
或者在 webUI 内动态切换
### 微调 ### 微调
@ -215,25 +218,27 @@ python webui.py v1 <language(optional)>
5. 校对标注 5. 校对标注
6. 前往下一个窗口,点击训练 6. 前往下一个窗口,点击训练
### 打开推理WebUI ### 打开推理 WebUI
#### 整合包用户 #### 整合包用户
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI 双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
#### 其他 #### 其他
```bash ```bash
python GPT_SoVITS/inference_webui.py <language(optional)> python GPT_SoVITS/inference_webui.py <language(optional)>
``` ```
或者 或者
```bash ```bash
python webui.py python webui.py
``` ```
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
## V2发布说明 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
## V2 发布说明
新特性: 新特性:
@ -241,42 +246,41 @@ python webui.py
2. 更好的文本前端 2. 更好的文本前端
3. 底模由2k小时扩展至5k小时 3. 底模由 2k 小时扩展至 5k 小时
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好 4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) 详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从v1环境迁移至v2 v1 环境迁移至 v2
1. 需要pip安装requirements.txt更新环境 1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆github上的最新代码 2. 需要克隆 github 上的最新代码
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下 3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下) 中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
## V3更新说明 ## V3 更新说明
新模型特点: 新模型特点:
1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大) 1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大)
2. GPT合成更稳定重复漏字更少也更容易跑出丰富情感 2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) 详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从v2环境迁移至v3 v2 环境迁移至 v3
1. 需要pip安装requirements.txt更新环境 1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆github上的最新代码 2. 需要克隆 github 上的最新代码
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些v3新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下 3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
如果想用音频超分功能缓解v3模型生成24k音频觉得闷的问题需要下载额外的模型参数参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
## 待办事项清单 ## 待办事项清单
@ -299,16 +303,21 @@ python webui.py
- [ ] 模型混合。 - [ ] 模型混合。
## (附加)命令行运行方式 ## (附加)命令行运行方式
使用命令行打开UVR5的WebUI
```` 使用命令行打开 UVR5 的 WebUI
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```` ```
<!-- 如果打不开浏览器请按照下面的格式进行UVR处理这是使用mdxnet进行音频处理的方式 <!-- 如果打不开浏览器请按照下面的格式进行UVR处理这是使用mdxnet进行音频处理的方式
```` ````
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
```` --> ```` -->
这是使用命令行完成数据集的音频切分的方式 这是使用命令行完成数据集的音频切分的方式
````
```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \ --input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \ --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -316,17 +325,22 @@ python audio_slicer.py \
--min_length <minimum_duration_of_each_subclip> \ --min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips> --min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve> --hop_size <step_size_for_computing_volume_curve>
```` ```
这是使用命令行完成数据集ASR处理的方式仅限中文
```` 这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
python tools/asr/funasr_asr.py -i <input> -o <output>
```` ```
通过Faster_Whisper进行ASR处理除中文之外的ASR标记 python tools/asr/funasr_asr.py -i <input> -o <output>
```
通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
没有进度条GPU 性能可能会导致时间延迟)
没有进度条GPU性能可能会导致时间延迟
``` ```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision> python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
``` ```
启用自定义列表保存路径 启用自定义列表保存路径
## 致谢 ## 致谢
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
特别感谢以下项目和贡献者: 特别感谢以下项目和贡献者:
### 理论研究 ### 理论研究
- [ar-vits](https://github.com/innnky/ar-vits) - [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits) - [vits](https://github.com/jaywalnut310/vits)
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 预训练模型 ### 预训练模型
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 推理用文本前端 ### 推理用文本前端
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang) - [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW) - [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI 工具 ### WebUI 工具
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer) - [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix) - [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -20,17 +20,17 @@
## 機能: ## 機能:
1. **Zero-Shot TTS:** たった5秒間の音声サンプルで、即座にテキストからその音声に変換できます。 1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。
2. **Few-Shot TTS:** わずか1分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。 2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。 3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
4. **WebUI ツール:** 統合されたツールは、音声と伴奏BGM等の分離、トレーニングセットの自動セグメンテーション、ASR中国語のみ、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成やGPT/SoVITSモデルのトレーニング等を非常に簡単に行えます。 4. **WebUI ツール:** 統合されたツールは、音声と伴奏BGM の分離、トレーニングセットの自動セグメンテーション、ASR中国語のみ、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!** **[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
声の事前学習無しかつFew-Shotでトレーニングされたモデルのデモ: 声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
@ -43,7 +43,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
- Python 3.9, PyTorch 2.0.1, CUDA 11 - Python 3.9, PyTorch 2.0.1, CUDA 11
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3 - Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon) - Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
- Python 3.9, PyTorch 2.2.2, CPUデバイス - Python 3.9, PyTorch 2.2.2, CPU デバイス
_注記: numba==0.56.4 は py<3.11 が必要です_ _注記: numba==0.56.4 は py<3.11 が必要です_
@ -61,22 +61,22 @@ bash install.sh
### macOS ### macOS
**注MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練することを強く推奨します。** **注Mac GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。**
1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。 1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。
2. `brew install ffmpeg` を実行してFFmpegをインストールします。 2. `brew install ffmpeg` を実行して FFmpeg をインストールします。
3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。 3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
```bash ```bash
conda create -n GPTSoVits python=3.9 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
### 手動インストール ### 手動インストール
#### FFmpegをインストールします。 #### FFmpeg をインストールします。
##### Conda ユーザー ##### Conda ユーザー
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
##### MacOS ユーザー ##### MacOS ユーザー
```bash ```bash
brew install ffmpeg brew install ffmpeg
``` ```
@ -104,6 +105,7 @@ brew install ffmpeg
#### 依存関係をインストールします #### 依存関係をインストールします
```bash ```bash
pip install -r extra-req.txt --no-deps
pip install -r requirementx.txt pip install -r requirementx.txt
``` ```
@ -138,17 +140,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。 1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。中国語TTSのみ 2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語 TTS のみ)
3. UVR5ボーカル/伴奏BGM等分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。 3. UVR5ボーカル/伴奏BGM 等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
- UVR5でbs_roformerまたはmel_band_roformerモデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformerクラスのモデルとして認識されます。 - UVR5 bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`もペアです。 - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`もペアです。
4. 中国語ASR追加機能の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。 4. 中国語 ASR追加機能の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
5. 英語または日本語のASR追加機能を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。 5. 英語または日本語の ASR追加機能を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
## データセット形式 ## データセット形式
@ -169,14 +171,15 @@ vocal_path|speaker_name|language|text
``` ```
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin. D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
``` ```
## 微調整と推論 ## 微調整と推論
### WebUIを開く ### WebUI を開く
#### 統合パッケージ利用者 #### 統合パッケージ利用者
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。 `go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。
V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。 V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
#### その他 #### その他
@ -184,12 +187,13 @@ V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
python webui.py <言語(オプション)> python webui.py <言語(オプション)>
``` ```
V1に切り替えたい場合は V1 に切り替えたい場合は
```bash ```bash
python webui.py v1 <言語(オプション)> python webui.py v1 <言語(オプション)>
``` ```
またはWebUIで手動でバージョンを切り替えてください。
または WebUI で手動でバージョンを切り替えてください。
### 微調整 ### 微調整
@ -202,25 +206,27 @@ python webui.py v1 <言語(オプション)>
5. ASR転写を校正する 5. ASR転写を校正する
6. 次のタブに移動し、モデルを微調整する 6. 次のタブに移動し、モデルを微調整する
### 推論WebUIを開く ### 推論 WebUI を開く
#### 統合パッケージ利用者 #### 統合パッケージ利用者
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。 `go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
#### その他 #### その他
```bash ```bash
python GPT_SoVITS/inference_webui.py <言語(オプション)> python GPT_SoVITS/inference_webui.py <言語(オプション)>
``` ```
または または
```bash ```bash
python webui.py python webui.py
``` ```
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
## V2リリースート その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
## V2 リリースノート
新機能: 新機能:
@ -228,21 +234,21 @@ python webui.py
2. 最適化されたテキストフロントエンド 2. 最適化されたテキストフロントエンド
3. 事前学習済みモデルが2千時間から5千時間に拡張 3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
4. 低品質の参照音声に対する合成品質の向上 4. 低品質の参照音声に対する合成品質の向上
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1環境からV2を使用するには: V1 環境から V2 を使用するには:
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新 1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
2. 最新のコードをgithubからクローン 2. 最新のコードを github からクローン
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)からV2の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します) 中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
## V3 リリースノート ## V3 リリースノート
@ -250,19 +256,19 @@ V1環境からV2を使用するには:
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。 1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。
2. GPTモデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。 2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
[詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 環境から v3 を使用する方法: v2 環境から v3 を使用する方法:
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。 1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
2. GitHubから最新のコードをクローンします。 2. GitHub から最新のコードをクローンします。
3. v3の事前学習済みモデルs1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。 3. v3 の事前学習済みモデルs1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。 追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
## Todo リスト ## Todo リスト
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
- [ ] モデルミックス - [ ] モデルミックス
## (追加の) コマンドラインから実行する方法 ## (追加の) コマンドラインから実行する方法
コマンド ラインを使用して UVR5 の WebUI を開きます コマンド ラインを使用して UVR5 の WebUI を開きます
``` ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。 <!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
``` ```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` --> ``` -->
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。 コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
``` ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \ --input_path "<path_to_original_audio_file_or_directory>" \
@ -303,16 +314,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> --min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve> --hop_size <step_size_for_computing_volume_curve>
``` ```
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ) コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
``` ```
python tools/asr/funasr_asr.py -i <input> -o <output> python tools/asr/funasr_asr.py -i <input> -o <output>
``` ```
ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASRマーキング)
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります) (進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
``` ```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision> python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
``` ```
カスタムリストの保存パスが有効になっています カスタムリストの保存パスが有効になっています
## クレジット ## クレジット
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
特に以下のプロジェクトと貢献者に感謝します: 特に以下のプロジェクトと貢献者に感謝します:
### 理論研究 ### 理論研究
- [ar-vits](https://github.com/innnky/ar-vits) - [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits) - [vits](https://github.com/jaywalnut310/vits)
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 事前学習モデル ### 事前学習モデル
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 推論用テキストフロントエンド ### 推論用テキストフロントエンド
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang) - [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW) - [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI ツール ### WebUI ツール
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer) - [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix) - [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -70,7 +70,7 @@ bash install.sh
```bash ```bash
conda create -n GPTSoVits python=3.9 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용) [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
##### MacOS 사용자 ##### MacOS 사용자
```bash ```bash
brew install ffmpeg brew install ffmpeg
``` ```
@ -106,6 +107,7 @@ brew install ffmpeg
#### 의존성 설치 #### 의존성 설치
```bash ```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -147,9 +149,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요. 3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다. - UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`도 한 쌍입니다. - 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`도 한 쌍입니다.
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. 4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
@ -195,6 +197,7 @@ V1으로 전환하려면,
```bash ```bash
python webui.py v1 <언어(옵션)> python webui.py v1 <언어(옵션)>
``` ```
또는 WebUI에서 수동으로 버전을 전환하십시오. 또는 WebUI에서 수동으로 버전을 전환하십시오.
### 미세 조정 ### 미세 조정
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
```bash ```bash
python GPT_SoVITS/inference_webui.py <언어(옵션)> python GPT_SoVITS/inference_webui.py <언어(옵션)>
``` ```
또는 또는
```bash ```bash
python webui.py python webui.py
``` ```
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다. 그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
## V2 릴리스 노트 ## V2 릴리스 노트
@ -238,7 +243,7 @@ python webui.py
4. 저품질 참조 오디오에 대한 합성 품질 향상 4. 저품질 참조 오디오에 대한 합성 품질 향상
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 환경에서 V2를 사용하려면: V1 환경에서 V2를 사용하려면:
@ -248,7 +253,7 @@ V1 환경에서 V2를 사용하려면:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오. 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.) 중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
## V3 릴리스 노트 ## V3 릴리스 노트
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다. 2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 환경에서 v3 사용하기: v2 환경에서 v3 사용하기:
@ -268,8 +273,7 @@ v2 환경에서 v3 사용하기:
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다. 3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요. 추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
## 할 일 목록 ## 할 일 목록
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
- [ ] 모델 블렌딩. - [ ] 모델 블렌딩.
## (추가적인) 명령줄에서 실행하는 방법 ## (추가적인) 명령줄에서 실행하는 방법
명령줄을 사용하여 UVR5용 WebUI 열기 명령줄을 사용하여 UVR5용 WebUI 열기
``` ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다. <!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
``` ```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` --> ``` -->
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다. 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
``` ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \ --input_path "<path_to_original_audio_file_or_directory>" \
@ -311,16 +320,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> --min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve> --hop_size <step_size_for_computing_volume_curve>
``` ```
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당). 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
``` ```
python tools/asr/funasr_asr.py -i <input> -o <output> python tools/asr/funasr_asr.py -i <input> -o <output>
``` ```
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다. ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음) (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
``` ```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision> python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
``` ```
사용자 정의 목록 저장 경로가 활성화되었습니다. 사용자 정의 목록 저장 경로가 활성화되었습니다.
## 감사의 말 ## 감사의 말
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
다음 프로젝트와 기여자들에게 특별히 감사드립니다: 다음 프로젝트와 기여자들에게 특별히 감사드립니다:
### 이론 연구 ### 이론 연구
- [ar-vits](https://github.com/innnky/ar-vits) - [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits) - [vits](https://github.com/jaywalnut310/vits)
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 사전 학습 모델 ### 사전 학습 모델
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 추론용 텍스트 프론트엔드 ### 추론용 텍스트 프론트엔드
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang) - [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW) - [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI 도구 ### WebUI 도구
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer) - [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix) - [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -72,7 +72,7 @@ bash install.sh
```bash ```bash
conda create -n GPTSoVits python=3.9 conda create -n GPTSoVits python=3.9
conda activate GPTSoVits conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin. [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
##### MacOS Kullanıcıları ##### MacOS Kullanıcıları
```bash ```bash
brew install ffmpeg brew install ffmpeg
``` ```
@ -106,6 +107,7 @@ brew install ffmpeg
#### Bağımlılıkları Yükleme #### Bağımlılıkları Yükleme
```bash ```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
``` ```
@ -142,9 +144,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin. 3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır. - UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir. - Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin. 4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
```bash ```bash
python webui.py v1 <dil(isteğe bağlı)> python webui.py v1 <dil(isteğe bağlı)>
``` ```
veya WebUI'de manuel olarak sürüm değiştirin. veya WebUI'de manuel olarak sürüm değiştirin.
### İnce Ayar ### İnce Ayar
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
```bash ```bash
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)> python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
``` ```
VEYA VEYA
```bash ```bash
python webui.py python webui.py
``` ```
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın. ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
## V2 Sürüm Notları ## V2 Sürüm Notları
@ -235,7 +240,7 @@ Yeni Özellikler:
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi 4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
[detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 ortamından V2'yi kullanmak için: V1 ortamından V2'yi kullanmak için:
@ -245,7 +250,7 @@ V1 ortamından V2'yi kullanmak için:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin. 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.) Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
## V3 Sürüm Notları ## V3 Sürüm Notları
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi. 2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
[daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) [daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
### v2 ortamında v3 kullanımı: ### v2 ortamında v3 kullanımı:
@ -265,7 +270,7 @@ V1 ortamından V2'yi kullanmak için:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin. 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz. ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
## Yapılacaklar Listesi ## Yapılacaklar Listesi
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
- [ ] model karışımı - [ ] model karışımı
## (Ekstra) Komut satırından çalıştırma yöntemi ## (Ekstra) Komut satırından çalıştırma yöntemi
UVR5 için Web Arayüzünü açmak için komut satırını kullanın UVR5 için Web Arayüzünü açmak için komut satırını kullanın
``` ```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5> python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
``` ```
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor <!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
``` ```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` --> ``` -->
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
``` ```
python audio_slicer.py \ python audio_slicer.py \
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \ --input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
@ -306,16 +316,21 @@ python audio_slicer.py \
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı> --min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu> --hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
``` ```
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince) Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
``` ```
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı> python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
``` ```
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme) ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir) (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
``` ```
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil> python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
``` ```
Özel bir liste kaydetme yolu etkinleştirildi Özel bir liste kaydetme yolu etkinleştirildi
## Katkı Verenler ## Katkı Verenler
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz: Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
### Teorik Araştırma ### Teorik Araştırma
- [ar-vits](https://github.com/innnky/ar-vits) - [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR) - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits) - [vits](https://github.com/jaywalnut310/vits)
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41) - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py) - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py) - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Önceden Eğitilmiş Modeller ### Önceden Eğitilmiş Modeller
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain) - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large) - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN) - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Tahmin İçin Metin Ön Ucu ### Tahmin İçin Metin Ön Ucu
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization) - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang) - [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW) - [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW) - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw) - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Araçları ### WebUI Araçları
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui) - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer) - [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix) - [SubFix](https://github.com/cronrpc/SubFix)

1
extra-req.txt Normal file
View File

@ -0,0 +1 @@
faster-whisper

View File

@ -27,7 +27,8 @@
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n", "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n", "%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n", "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r requirements.txt" "!pip install -r requirements.txt\n",
"!pip install -r extra-req.txt --no-deps"
] ]
}, },
{ {

View File

@ -1,15 +1,17 @@
#!/bin/bash #!/bin/bash
set -e
# 安装构建工具 # 安装构建工具
# Install build tools # Install build tools
echo "Installing GCC..." echo "Installing GCC..."
conda install -c conda-forge gcc=14 conda install -c conda-forge gcc=14 -y
echo "Installing G++..." echo "Installing G++..."
conda install -c conda-forge gxx conda install -c conda-forge gxx -y
echo "Installing ffmpeg and cmake..." echo "Installing ffmpeg and cmake..."
conda install ffmpeg cmake conda install ffmpeg cmake -y
# 设置编译环境 # 设置编译环境
# Set up build environment # Set up build environment
@ -18,7 +20,7 @@ export CC="$CONDA_PREFIX/bin/gcc"
export CXX="$CONDA_PREFIX/bin/g++" export CXX="$CONDA_PREFIX/bin/g++"
echo "Checking for CUDA installation..." echo "Checking for CUDA installation..."
if command -v nvidia-smi &> /dev/null; then if command -v nvidia-smi &>/dev/null; then
USE_CUDA=true USE_CUDA=true
echo "CUDA found." echo "CUDA found."
else else
@ -26,7 +28,6 @@ else
USE_CUDA=false USE_CUDA=false
fi fi
if [ "$USE_CUDA" = false ]; then if [ "$USE_CUDA" = false ]; then
echo "Checking for ROCm installation..." echo "Checking for ROCm installation..."
if [ -d "/opt/rocm" ]; then if [ -d "/opt/rocm" ]; then
@ -48,7 +49,7 @@ fi
if [ "$USE_CUDA" = true ]; then if [ "$USE_CUDA" = true ]; then
echo "Installing PyTorch with CUDA support..." echo "Installing PyTorch with CUDA support..."
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
elif [ "$USE_ROCM" = true ] ; then elif [ "$USE_ROCM" = true ]; then
echo "Installing PyTorch with ROCm support..." echo "Installing PyTorch with ROCm support..."
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2 pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
else else
@ -56,21 +57,53 @@ else
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
fi fi
echo "Installing Python dependencies from requirements.txt..." echo "Installing Python dependencies from requirements.txt..."
# 刷新环境 # 刷新环境
# Refresh environment # Refresh environment
hash -r hash -r
# pyopenjtalk Installation
conda install jq -y
OS_TYPE=$(uname)
PACKAGE_NAME="pyopenjtalk"
VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
DIR_NAME="${TAR_FILE%.tar.gz}"
tar -xzf "$TAR_FILE"
rm "$TAR_FILE"
CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
if [[ "$OS_TYPE" == "darwin"* ]]; then
sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
else
sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
fi
tar -czf "$TAR_FILE" "$DIR_NAME"
pip install "$TAR_FILE"
rm -rf "$TAR_FILE" "$DIR_NAME"
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt pip install -r requirements.txt
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
echo "Update to WSL compatible runtime lib..." echo "Update to WSL compatible runtime lib..."
location=`pip show torch | grep Location | awk -F ": " '{print $2}'` location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
cd ${location}/torch/lib/ cd "${location}"/torch/lib/ || exit
rm libhsa-runtime64.so* rm libhsa-runtime64.so*
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
fi fi
echo "Installation completed successfully!" echo "Installation completed successfully!"

View File

@ -3,7 +3,7 @@ scipy
tensorboard tensorboard
librosa==0.9.2 librosa==0.9.2
numba==0.56.4 numba==0.56.4
pytorch-lightning pytorch-lightning>2.0
gradio>=4.0,<=4.24.0 gradio>=4.0,<=4.24.0
ffmpeg-python ffmpeg-python
onnxruntime; sys_platform == 'darwin' onnxruntime; sys_platform == 'darwin'
@ -26,7 +26,6 @@ jieba_fast
jieba jieba
split-lang split-lang
fast_langdetect>=0.3.0 fast_langdetect>=0.3.0
Faster_Whisper
wordsegment wordsegment
rotary_embedding_torch rotary_embedding_torch
ToJyutping ToJyutping
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
fastapi<0.112.2 fastapi<0.112.2
x_transformers x_transformers
torchmetrics<=1.5 torchmetrics<=1.5
attrdict pydantic<=2.10.6
ctranslate2>=4.0,<5
huggingface_hub>=0.13
tokenizers>=0.13,<1
av>=11
tqdm

View File

@ -298,9 +298,9 @@ def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language) cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language)
else: else:
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
#####v3暂不支持加速推理 # #####v3暂不支持加速推理
if version=="v3": # if version=="v3":
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language) # cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
if p_tts_inference is None: if p_tts_inference is None:
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path) os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path) os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
@ -849,8 +849,8 @@ def switch_version(version_):
{'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \ {'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \
{'__type__': 'update', "visible": True if version!="v3"else False}, \ {'__type__': 'update', "visible": True if version!="v3"else False}, \
{'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \ {'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \
{'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ {'__type__': 'update', "interactive": True, "value": False}, \
{'__type__': 'update', "visible": True if version== "v3" else False} {'__type__': 'update', "visible": True if version== "v3" else False} # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer
if os.path.exists('GPT_SoVITS/text/G2PWModel'):... if os.path.exists('GPT_SoVITS/text/G2PWModel'):...
else: else: