Compare commits

...

14 Commits

Author SHA1 Message Date
Monophotic
f93eb3b866
Merge eed2095a42927ea0a9c6bd548eaf4926b6c99f8f into 9da7e17efe05041e31d3c3f42c8730ae890397f2 2025-04-02 04:19:18 +09:00
RVC-Boss
9da7e17efe
Add files via upload 2025-04-01 18:44:35 +08:00
RVC-Boss
b0de354c63
Update Changelog_CN.md 2025-04-01 17:21:48 +08:00
RVC-Boss
41090e5a7c
Update g2pw url 2025-04-01 17:15:52 +08:00
RVC-Boss
605b380114
修复模型加载异步逻辑
修复模型加载异步逻辑
2025-04-01 16:50:54 +08:00
RVC-Boss
9f8d455130
支持v3并行推理
support v3 models batch inference
2025-04-01 16:31:48 +08:00
RVC-Boss
7abae557fb
删除加载v3sovits模型缺少enc_q告警
删除加载v3sovits模型缺少enc_q告警
2025-04-01 16:31:15 +08:00
RVC-Boss
6a60e5edb1
v3解锁并行推理;修复模型加载异步逻辑
v3解锁并行推理;修复模型加载异步逻辑
2025-04-01 16:29:52 +08:00
RVC-Boss
28bdff356f
fix https://github.com/RVC-Boss/GPT-SoVITS/issues/2250
fix https://github.com/RVC-Boss/GPT-SoVITS/issues/2250
2025-04-01 10:34:02 +08:00
ChasonJiang
03b662a769
为sovits_v3 适配并行推理 (#2241)
* 为sovits_v3 适配并行推理

* 清理无用代码
2025-03-31 11:56:05 +08:00
XXXXRT666
6c468583c5
Fix dependency-related issues via requirements update (#2236)
* Update requirements.txt

* Create constraints.txt

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* pyopenjtalk and onnx fix

* Update requirements.txt

* Update requirements.txt

* Update install.sh

* update shell install.sh

* update docs

* Update Install.sh

* fix bugs

* Update .gitignore

* Update .gitignore

* Update install.sh

* Update install.sh

* Update extra-req.txt

* Update requirements.txt
2025-03-31 11:27:12 +08:00
Monophotic
eed2095a42
Delete dist/vite.svg,移除无用的图片 2024-10-10 11:02:57 +08:00
Monophotic
aad5afd5b0
Update requirements.txt,移除shutil 2024-10-10 11:01:27 +08:00
Svring
87fe6f2fd5 增设前端页面,并改进api_v2以进行适配。 2024-10-09 20:15:40 +08:00
23 changed files with 878 additions and 269 deletions

178
.gitignore vendored
View File

@ -18,5 +18,183 @@ TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios
tools/AP_BWE_main/24kto48k/*
!tools/AP_BWE_main/24kto48k/readme.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc

View File

@ -3,7 +3,7 @@ import math
import os, sys, gc
import random
import traceback
import time
import torchaudio
from tqdm import tqdm
now_dir = os.getcwd()
@ -462,8 +462,6 @@ class TTS:
n_speakers=self.configs.n_speakers,
**kwargs
)
if hasattr(vits_model, "enc_q"):
del vits_model.enc_q
self.configs.is_v3_synthesizer = False
else:
vits_model = SynthesizerTrnV3(
@ -474,7 +472,8 @@ class TTS:
)
self.configs.is_v3_synthesizer = True
self.init_bigvgan()
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
del vits_model.enc_q
if if_lora_v3==False:
print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}")
@ -908,11 +907,14 @@ class TTS:
split_bucket = False
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
if split_bucket and speed_factor==1.0:
if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
print(i18n("分桶处理模式已开启"))
elif speed_factor!=1.0:
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
split_bucket = False
elif self.configs.is_v3_synthesizer and parallel_infer:
print(i18n("当开启并行推理模式时SoVits V3模型不支持分桶处理已自动关闭分桶处理"))
split_bucket = False
else:
print(i18n("分桶处理模式已关闭"))
@ -936,7 +938,7 @@ class TTS:
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
###### setting reference audio and prompt text preprocessing ########
t0 = ttime()
t0 = time.perf_counter()
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
if not os.path.exists(ref_audio_path):
raise ValueError(f"{ref_audio_path} not exists")
@ -975,7 +977,7 @@ class TTS:
###### text preprocessing ########
t1 = ttime()
t1 = time.perf_counter()
data:list = None
if not return_fragment:
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
@ -1027,7 +1029,7 @@ class TTS:
return batch[0]
t2 = ttime()
t2 = time.perf_counter()
try:
print("############ 推理 ############")
###### inference ######
@ -1036,7 +1038,7 @@ class TTS:
audio = []
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
for item in data:
t3 = ttime()
t3 = time.perf_counter()
if return_fragment:
item = make_batch(item)
if item is None:
@ -1071,7 +1073,7 @@ class TTS:
max_len=max_len,
repetition_penalty=repetition_penalty,
)
t4 = ttime()
t4 = time.perf_counter()
t_34 += t4 - t3
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
@ -1094,6 +1096,7 @@ class TTS:
print(f"############ {i18n('合成音频')} ############")
if not self.configs.is_v3_synthesizer:
if speed_factor == 1.0:
print(f"{i18n('并行合成中')}...")
# ## vits并行推理 method 2
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
upsample_rate = math.prod(self.vits_model.upsample_rates)
@ -1118,17 +1121,28 @@ class TTS:
audio_fragment
) ###试试重建不带上prompt部分
else:
for i, idx in enumerate(tqdm(idx_list)):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
audio_fragment = self.v3_synthesis(
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
if parallel_infer:
print(f"{i18n('并行合成中')}...")
audio_fragments = self.v3_synthesis_batched_infer(
idx_list,
pred_semantic_list,
batch_phones,
speed=speed_factor,
sample_steps=sample_steps
)
batch_audio_fragment.extend(audio_fragments)
else:
for i, idx in enumerate(tqdm(idx_list)):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
audio_fragment = self.v3_synthesis(
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
)
batch_audio_fragment.append(
audio_fragment
)
batch_audio_fragment.append(
audio_fragment
)
t5 = ttime()
t5 = time.perf_counter()
t_45 += t5 - t4
if return_fragment:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
@ -1219,13 +1233,13 @@ class TTS:
if super_sampling:
print(f"############ {i18n('音频超采样')} ############")
t1 = ttime()
t1 = time.perf_counter()
self.init_sr_model()
if not self.sr_model_not_exist:
audio,sr=self.sr_model(audio.unsqueeze(0),sr)
max_audio=np.abs(audio).max()
if max_audio > 1: audio /= max_audio
t2 = ttime()
t2 = time.perf_counter()
print(f"超采样用时:{t2-t1:.3f}s")
else:
audio = audio.cpu().numpy()
@ -1260,7 +1274,7 @@ class TTS:
ref_audio = ref_audio.mean(0).unsqueeze(0)
if ref_sr!=24000:
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
# print("ref_audio",ref_audio.abs().mean())
mel2 = mel_fn(ref_audio)
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2])
@ -1285,15 +1299,156 @@ class TTS:
cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
cfm_res = cfm_res[:, :, mel2.shape[2]:]
mel2 = cfm_res[:, :, -T_min:]
mel2 = cfm_res[:, :, -T_min:]
fea_ref = fea_todo_chunk[:, :, -T_min:]
cfm_resss.append(cfm_res)
cmf_res = torch.cat(cfm_resss, 2)
cmf_res = denorm_spec(cmf_res)
cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res)
with torch.inference_mode():
wav_gen = self.bigvgan_model(cmf_res)
wav_gen = self.bigvgan_model(cfm_res)
audio=wav_gen[0][0]#.cpu().detach().numpy()
return audio
def v3_synthesis_batched_infer(self,
                               idx_list:List[int],
                               semantic_tokens_list:List[torch.Tensor],
                               batch_phones:List[torch.Tensor],
                               speed:float=1.0,
                               sample_steps:int=32
                               )->List[torch.Tensor]:
    """Synthesize every sentence of a batch with the V3 model in one batched pass.

    The per-sentence features are concatenated along the last axis, cut into
    overlapping fixed-length chunks, run through ``cfm.inference`` as one
    batch, vocoded with ``self.bigvgan_model``, stitched back together with
    ``self.sola_algorithm`` and finally split per sentence again.

    Args:
        idx_list: for each sentence, how many trailing semantic tokens to keep.
        semantic_tokens_list: predicted semantic tokens, one tensor per sentence.
        batch_phones: phoneme id tensor for each sentence.
        speed: forwarded to ``vits_model.decode_encp``.
        sample_steps: forwarded to ``vits_model.cfm.inference``.

    Returns:
        One audio tensor per entry of ``idx_list``, in the same order.
    """
    device = self.configs.device
    cache = self.prompt_cache

    # Encode the cached prompt; `ge` is reused for every sentence below.
    prompt_tokens = cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(device)
    prompt_phone_ids = torch.LongTensor(cache["phones"]).unsqueeze(0).to(device)
    refer_audio_spec = cache["refer_spec"][0].to(dtype=self.precision, device=device)
    fea_ref, ge = self.vits_model.decode_encp(prompt_tokens, prompt_phone_ids, refer_audio_spec)

    # Reference audio -> normalised mel; two-row input is averaged to one row
    # and anything not already at 24000 is resampled.
    ref_audio = cache["raw_audio"]
    ref_sr = cache["raw_sr"]
    ref_audio = ref_audio.to(device).float()
    if ref_audio.shape[0] == 2:
        ref_audio = ref_audio.mean(0).unsqueeze(0)
    if ref_sr != 24000:
        ref_audio = resample(ref_audio, ref_sr, device)
    mel2 = norm_spec(mel_fn(ref_audio))

    # Align mel and reference features on their common length, keeping at
    # most the last 468 frames.
    # NOTE(review): 468 / 934 look like model context limits -- confirm.
    T_min = min(mel2.shape[2], fea_ref.shape[2])
    mel2 = mel2[:, :, :T_min]
    fea_ref = fea_ref[:, :, :T_min]
    if T_min > 468:
        mel2 = mel2[:, :, -468:]
        fea_ref = fea_ref[:, :, -468:]
        T_min = 468
    chunk_len = 934 - T_min
    mel2 = mel2.to(self.precision)

    # ---- batched inference ----
    overlapped_len = 12
    feat_list = []
    for i, idx in enumerate(idx_list):
        phones = batch_phones[i].unsqueeze(0).to(device)
        # Two leading singleton axes are added to the kept tokens.
        tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0)
        feat, _ = self.vits_model.decode_encp(tokens, phones, refer_audio_spec, ge, speed)
        feat_list.append(feat)
    feat_lens = [feat.shape[2] for feat in feat_list]

    # Left-pad by the overlap so the first chunk also has a lead-in region.
    feats = torch.cat(feat_list, 2)
    feats_padded = F.pad(feats, (overlapped_len, 0), "constant", 0)

    # Slide a chunk_len window whose consecutive starts are
    # (chunk_len - overlapped_len) apart; a short trailing window is
    # right-padded with zeros and its pad length is remembered for trimming.
    stride = chunk_len - overlapped_len
    feat_chunks = []
    padding_len = 0
    start = 0
    while True:
        chunk = feats_padded[:, :, start:start + chunk_len]
        if chunk.shape[-1] == 0:
            break
        padding_len = chunk_len - chunk.shape[2]
        if padding_len != 0:
            chunk = F.pad(chunk, (0, padding_len), "constant", 0)
        feat_chunks.append(chunk)
        start += stride

    feat_chunks = torch.cat(feat_chunks, 0)
    bs = feat_chunks.shape[0]
    fea_ref = fea_ref.repeat(bs, 1, 1)
    fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
    pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
    # Keep only the frames generated for the chunk, then lay the batch out
    # as one long spectrogram for the vocoder.
    pred_spec = pred_spec[:, :, -chunk_len:]
    dd = pred_spec.shape[1]
    pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
    pred_spec = denorm_spec(pred_spec)

    with torch.no_grad():
        wav_gen = self.bigvgan_model(pred_spec)
        audio = wav_gen[0][0]

    # Cut the waveform back into per-chunk pieces and overlap-add them.
    # NOTE(review): 256 is used as samples-per-frame -- confirm against the vocoder.
    upsample_rate = 256
    hop = chunk_len * upsample_rate
    audio_fragments = [audio[p:p + hop] for p in range(0, audio.shape[-1], hop)]
    audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
    # Drop the leading overlap padding and the trailing zero padding.
    audio = audio[overlapped_len * upsample_rate:-padding_len * upsample_rate]

    # Hand each sentence its own span of the stitched waveform.
    per_sentence = []
    offset = 0
    for feat_len in feat_lens:
        n_samples = feat_len * upsample_rate
        per_sentence.append(audio[offset:offset + n_samples])
        offset += n_samples
    return per_sentence
def sola_algorithm(self,
                   audio_fragments:List[torch.Tensor],
                   overlap_len:int,
                   ):
    """Join 1-D audio fragments with a correlation-aligned cross-fade.

    For each neighbouring pair, the last ``overlap_len`` samples of the
    left fragment are cross-correlated with the first ``overlap_len``
    samples of the right fragment. The arg-max of that correlation picks
    the splice offset, and the two sides are blended with the two halves
    of a Hann window.

    The caller's list and tensors are left untouched: the work is done on
    a shallow copy of the list and on a cloned tail of each right-hand
    fragment.

    Args:
        audio_fragments: 1-D tensors in playback order.
        overlap_len: number of samples shared by neighbouring fragments.

    Returns:
        A single 1-D tensor containing the stitched audio.
    """
    fragments = list(audio_fragments)  # never rewrite the caller's list
    for i in range(len(fragments) - 1):
        f1 = fragments[i]
        f2 = fragments[i + 1]
        w1 = f1[-overlap_len:]
        w2 = f2[:overlap_len]
        assert w1.shape == w2.shape
        # conv1d is a cross-correlation; the last lag is dropped as before.
        corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1]
        idx = int(corr.argmax())
        fade_len = overlap_len - idx
        window = torch.hann_window(fade_len * 2, device=f1.device, dtype=f1.dtype)
        # Clone so the blend is not written through a view into the input.
        f2_ = f2[idx:].clone()
        f2_[:fade_len] = window[:fade_len] * f2_[:fade_len] + window[fade_len:] * f1[-fade_len:]
        fragments[i] = f1[:-fade_len]
        fragments[i + 1] = f2_
    return torch.cat(fragments, 0)

View File

@ -238,7 +238,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
else:
visible_sample_steps=False
visible_inp_refs=True
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
dict_s2 = load_sovits_new(sovits_path)
hps = dict_s2["config"]
@ -294,6 +294,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
# torch.save(vq_model.state_dict(),"merge_win.pth")
vq_model.eval()
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
with open("./weight.json")as f:
data=f.read()
data=json.loads(data)
@ -877,7 +878,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
with gr.Row():
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频超过会报错"), type="filepath", scale=13)
with gr.Column(scale=13):
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式使用了会报错。"), value=False, interactive=True if model_version!="v3"else False, show_label=True,scale=1)
gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")))
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
with gr.Column(scale=14):
@ -915,7 +916,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
with gr.Row():
inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size='lg', scale=25)
output = gr.Audio(label=i18n("输出的语音"), scale=14)
inference_button.click(
@ -923,7 +924,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider],
[output],
)
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox])
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox,inference_button])
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))

View File

@ -41,12 +41,13 @@ gpt_path = os.environ.get("gpt_path", None)
sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
bert_path = os.environ.get("bert_path", None)
version=os.environ.get("version","v2")
version=model_version=os.environ.get("version","v2")
import gradio as gr
from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR
from TTS_infer_pack.text_segmentation_method import get_method
from tools.i18n.i18n import I18nAuto, scan_language_list
from inference_webui import DictToAttrRecursive
language=os.environ.get("language","Auto")
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -221,19 +222,16 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
from process_ckpt import get_sovits_version_from_path_fast
from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new
def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
global version, dict_language
global version, model_version, dict_language,if_lora_v3
version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path)
# print(sovits_path,version, model_version, if_lora_v3)
if if_lora_v3 and not os.path.exists(path_sovits_v3):
info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
gr.Warning(info)
raise FileExistsError(info)
tts_pipeline.init_vits_weights(sovits_path)
dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2
dict_language = dict_language_v1 if version =='v1' else dict_language_v2
if prompt_language is not None and text_language is not None:
if prompt_language in list(dict_language.keys()):
prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
@ -251,8 +249,11 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
else:
visible_sample_steps=False
visible_inp_refs=True
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
#prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
tts_pipeline.init_vits_weights(sovits_path)
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
with open("./weight.json")as f:
data=f.read()
data=json.loads(data)
@ -279,14 +280,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(value=i18n("*请上传并填写参考信息"))
with gr.Row():
inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频超过会报错)"), type="filepath")
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple")
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple", visible=True if model_version!="v3"else False)
prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
with gr.Row():
prompt_language = gr.Dropdown(
label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
)
with gr.Column():
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True if model_version!="v3"else False, show_label=True)
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))
with gr.Column():
@ -355,7 +356,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
[output, seed],
)
stop_infer.click(tts_pipeline.stop, [], [])
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language])
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,inference_button])#
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
with gr.Group():

View File

@ -429,26 +429,25 @@ def train_and_evaluate(
# scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
# scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
# scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
"all/stats_ssl": utils.plot_spectrogram_to_numpy(
stats_ssl[0].data.cpu().numpy()
),
}
utils.summarize(
writer=writer,
global_step=global_step,
images=image_dict,
scalars=scalar_dict,
)
image_dict=None
try:###Some people installed the wrong version of matplotlib.
image_dict = {
"slice/mel_org": utils.plot_spectrogram_to_numpy(
y_mel[0].data.cpu().numpy()
),
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
y_hat_mel[0].data.cpu().numpy()
),
"all/mel": utils.plot_spectrogram_to_numpy(
mel[0].data.cpu().numpy()
),
"all/stats_ssl": utils.plot_spectrogram_to_numpy(
stats_ssl[0].data.cpu().numpy()
),
}
except:pass
if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,)
else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,)
global_step += 1
if epoch % hps.train.save_every_epoch == 0 and rank == 0:
if hps.train.if_save_latest == 0:

View File

@ -58,7 +58,7 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory,"G2PWModel")
print("Downloading g2pw model...")
modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status()
with open(zip_dir, 'wb') as f:

View File

@ -1,42 +1,37 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
],
"metadata": {
"id": "himHYZmra7ix"
}
},
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r extra-req.txt --no-deps\n",
"!pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -53,16 +48,16 @@
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"outputs": [],
"source": [
"#@title Create folder models 创建文件夹模型\n",
"import os\n",
@ -77,16 +72,16 @@
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
"\n",
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
],
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"outputs": [],
"source": [
"import requests\n",
"import zipfile\n",
@ -124,29 +119,35 @@
" shutil.move(source_path, destination_path)\n",
"\n",
"print(f'Model downloaded. (模型已下载。)')"
],
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
}
]
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -1,6 +1,5 @@
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
@ -77,6 +76,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
##### MacOS Users
```bash
brew install ffmpeg
```
@ -112,6 +113,7 @@ brew install ffmpeg
#### Install Dependencies
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -150,9 +152,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
@ -200,6 +202,7 @@ if you want to switch to V1,then
```bash
python webui.py v1 <language(optional)>
```
Or manually switch the version in the WebUI.
### Finetune
@ -217,18 +220,20 @@ Or maunally switch version in WebUI
#### Integrated Package Users
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
#### Others
```bash
python GPT_SoVITS/inference_webui.py <language(optional)>
```
OR
```bash
python webui.py
```
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
## V2 Release Notes
@ -243,7 +248,7 @@ New Features:
4. Improved synthesis quality for low-quality reference audio
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v2 from v1 environment:
@ -253,7 +258,7 @@ Use v2 from v1 environment:
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (download the G2PW models, unzip and rename the folder to `G2PWModel`, then place it in `GPT_SoVITS/text`).
## V3 Release Notes
@ -263,7 +268,7 @@ New Features:
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v3 from v2 environment:
@ -273,8 +278,7 @@ Use v3 from v2 environment:
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
## Todo List
@ -297,15 +301,20 @@ Use v3 from v2 environment:
- [ ] model mix
## (Additional) Method for running from the command line
Use the command line to open the WebUI for UVR5
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
This is how the audio segmentation of the dataset is done using the command line
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
@ -315,16 +324,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
```
This is how dataset ASR processing is done using the command line(Only Chinese)
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
(No progress bars, GPU performance may cause time delays)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
A custom list save path is enabled
## Credits
@ -332,6 +346,7 @@ A custom list save path is enabled
Special thanks to the following projects and contributors:
### Theoretical Research
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Pretrained Models
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Tools
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -112,9 +112,12 @@ import wave
import signal
import numpy as np
import soundfile as sf
import shutil
from fastapi import FastAPI, Request, HTTPException, Response
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import uvicorn
from io import BytesIO
from tools.i18n.i18n import I18nAuto
@ -141,6 +144,7 @@ if config_path in [None, ""]:
config_path = "GPT-SoVITS/configs/tts_infer.yaml"
# Build the TTS configuration object from config_path.
tts_config = TTS_Config(config_path)
# Echo the effective configuration at startup; the message points the user at
# /GPT_SoVITS/configs/tts_infer.yaml for changes.
print("以下为TTS_CONFIG配置, 如需修改请查看/GPT_SoVITS/configs/tts_infer.yaml")
print(tts_config)
# Construct the inference pipeline from that configuration.
tts_pipeline = TTS(tts_config)
@ -459,7 +463,84 @@ async def set_sovits_weights(weights_path: str = None):
return JSONResponse(status_code=400, content={"message": f"change sovits weight failed", "Exception": str(e)})
return JSONResponse(status_code=200, content={"message": "success"})
# Register CORS handling on the application so pages served from other origins
# can call this API from a browser.
# NOTE(review): wildcard origins together with allow_credentials=True is a very
# permissive combination; confirm whether credentialed cross-origin requests are
# actually needed, and if not consider allow_credentials=False or an explicit
# origin list.
APP.add_middleware(
CORSMiddleware,
allow_origins=["*"], # allow requests from any origin
allow_credentials=True,
allow_methods=["*"], # allow every HTTP method
allow_headers=["*"], # allow every request header
)
def _list_weight_files(directories):
    """Return ``"<directory>/<name>"`` for every regular file in ``directories``.

    Paths that are not existing directories are skipped. Names are sorted
    within each directory so the result order is stable across calls.
    """
    found = []
    for directory in directories:
        # isdir (rather than exists) keeps a stray file with the same name
        # from reaching os.listdir and raising.
        if not os.path.isdir(directory):
            continue
        found.extend(
            f"{directory}/{name}"
            for name in sorted(os.listdir(directory))
            if os.path.isfile(os.path.join(directory, name))
        )
    return found


@APP.get("/info")
async def get_info():
    """List the available GPT and SoVITS weight files and report the server port.

    Returns:
        JSONResponse: 200 with ``gpt_weights_files``, ``sovits_weights_files``
        and ``server_port``; 404 when either list is empty; 500 with the
        exception text when listing fails.
    """
    try:
        # v2 directories are listed first, then the legacy ones.
        gpt_filenames = _list_weight_files(["GPT_weights_v2", "GPT_weights"])
        sovits_filenames = _list_weight_files(["SoVITS_weights_v2", "SoVITS_weights"])

        if not gpt_filenames:
            return JSONResponse(status_code=404, content={"message": "No GPT weights files found"})
        if not sovits_filenames:
            return JSONResponse(status_code=404, content={"message": "No SoVITS weights files found"})

        return JSONResponse(status_code=200, content={
            "gpt_weights_files": gpt_filenames,
            "sovits_weights_files": sovits_filenames,
            "server_port": port
        })
    except Exception as e:
        return JSONResponse(status_code=500, content={"message": "Error retrieving weights info", "error": str(e)})
@APP.post("/tts")
async def tts_post_endpoint(request: TTS_Request):
    """Handle POST /tts: dump the request model to a dict, log it, and run TTS.

    The dumped request is printed field by field before being passed to
    ``tts_handle``, whose awaited result is returned unchanged.
    """
    payload = request.model_dump()

    # Echo the processed request to stdout for troubleshooting.
    print("\nProcessed request (req):")
    print(f"Type: {type(payload)}")
    print("Content:")
    for field_name in payload:
        print(f" {field_name}: {payload[field_name]}")

    return await tts_handle(payload)
@APP.post("/upload_file")
async def upload_file(file: UploadFile = File(...)):
    """Save an uploaded file under ``temp_files`` and return its path.

    Only the final path component of the client-supplied filename is used, so
    a crafted name such as ``../../x`` cannot write outside ``temp_files``.

    Returns:
        JSONResponse: 200 with ``file_path`` on success; 400 when the filename
        is missing or reduces to nothing usable; 500 with the exception text
        when saving fails.
    """
    try:
        # Create the upload directory if it doesn't exist.
        temp_dir = "temp_files"
        os.makedirs(temp_dir, exist_ok=True)

        # The filename is untrusted input: normalise Windows separators, then
        # drop every directory component.
        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            return JSONResponse(status_code=400, content={"message": "File upload failed", "error": "Invalid file name"})

        file_path = os.path.join(temp_dir, safe_name)

        # Stream the upload to disk instead of reading it fully into memory.
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        return JSONResponse(status_code=200, content={"message": "File uploaded successfully", "file_path": file_path})
    except Exception as e:
        return JSONResponse(status_code=500, content={"message": "File upload failed", "error": str(e)})
# Serve the bundled front end from ./dist at the site root (html=True serves index.html).
# NOTE(review): a mount at "/" presumably matches every path not handled earlier,
# so it likely has to stay after the API route registrations — confirm before moving.
APP.mount("/", StaticFiles(directory="dist", html=True), name="static")
# Startup banner (runs at import time): front-end URL, the default-port (9880)
# limitation of the bundled front end, where uploads are stored (./temp_files),
# the need to run webui.py once so the weight folders exist, and where to fix
# configuration problems (tts_infer.yaml; on macOS use device=cpu, is_half=false).
print("--------------------------------")
print(f"前端界面已在 http://{host}:{port} 开启。")
print("目前的前端版本只适配默认端口9880, 更改api端口会导致前端页面无法工作, 但不影响后端api运行。")
print("在前端界面中上传的音频文件将会保存在 ./temp_files 目录下,如有需要请手动删除。")
print("请至少运行一遍webui.py, 放好模型, 再运行本API, 以确保存放模型的文件夹SoVITS_weights和GPT_weights存在。")
print("如遇配置错误,请检查命令行上方输出的配置详情,并修改文件/GPT_SoVITS/configs/tts_infer.yaml")
print("如果运行环境是mac, 请将tts_infer.yaml内custom条目下的device改为cpu, is_half改为false")
print("--------------------------------")
if __name__ == "__main__":
try:

View File

@ -1,23 +1,10 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@ -25,18 +12,20 @@
},
{
"cell_type": "markdown",
"source": [
"环境配置 environment"
],
"metadata": {
"id": "_o6a8GS2lWQM"
}
},
"source": [
"环境配置 environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"!pip install -q condacolab\n",
"# Setting up condacolab and installing packages\n",
@ -47,13 +36,17 @@
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
"%cd -q /content/GPT-SoVITS\n",
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
"!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
"!/usr/local/bin/pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -71,27 +64,35 @@
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7"
},
"execution_count": null,
"outputs": []
]
}
]
],
"metadata": {
"accelerator": "GPU",
"colab": {
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

17
dist/assets/index-BXQvAA72.js vendored Normal file

File diff suppressed because one or more lines are too long

1
dist/assets/index-Dl43Gj3X.css vendored Normal file

File diff suppressed because one or more lines are too long

13
dist/index.html vendored Normal file
View File

@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Vite Project</title>
<script type="module" crossorigin src="/assets/index-BXQvAA72.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-Dl43Gj3X.css">
</head>
<body>
<div id="app"></div>
</body>
</html>

View File

@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
修复v3sovits未传参以支持调节语速
### 202503
修复一批由依赖的库版本不对导致的问题https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
修复模型加载异步逻辑https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
修复其他若干bug
重点更新:
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
2-整合包修复 onnxruntime GPU 推理的支持,影响:(1)g2pw 有个 onnx 模型原先是 CPU 推理,现在用 GPU,显著降低推理的 CPU 瓶颈;(2)foxjoy 去混响模型现在可使用 GPU 推理

View File

@ -76,6 +76,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -101,9 +102,10 @@ conda install -c conda-forge 'ffmpeg<7'
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语TTS)
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
##### MacOS 用户
```bash
brew install ffmpeg
```
@ -111,6 +113,7 @@ brew install ffmpeg
#### 安装依赖
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -147,14 +150,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。仅限中文TTS
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文 TTS)
3. 对于 UVR5人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
- 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer``bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml` 也是一对。
- 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如 `mel_band_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型 `bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件 `bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
4. 对于中文 ASR额外功能从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
@ -184,12 +186,12 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
## 微调与推理
### 打开WebUI
### 打开 WebUI
#### 整合包用户
双击`go-webui.bat`或者使用`go-webui.ps1`
若想使用V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
#### 其他
@ -197,12 +199,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
python webui.py <language(optional)>
```
若想使用V1,则
若想使用 V1,则
```bash
python webui.py v1 <language(optional)>
```
或者在webUI内动态切换
或者在 webUI 内动态切换
### 微调
@ -215,25 +218,27 @@ python webui.py v1 <language(optional)>
5. 校对标注
6. 前往下一个窗口,点击训练
### 打开推理WebUI
### 打开推理 WebUI
#### 整合包用户
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
#### 其他
```bash
python GPT_SoVITS/inference_webui.py <language(optional)>
```
或者
```bash
python webui.py
```
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
## V2发布说明
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
## V2 发布说明
新特性:
@ -241,42 +246,41 @@ python webui.py
2. 更好的文本前端
3. 底模由2k小时扩展至5k小时
3. 底模由 2k 小时扩展至 5k 小时
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从v1环境迁移至v2
v1 环境迁移至 v2
1. 需要pip安装requirements.txt更新环境
1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆github上的最新代码
2. 需要克隆 github 上的最新代码
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
## V3更新说明
## V3 更新说明
新模型特点:
1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大)
2. GPT合成更稳定重复漏字更少也更容易跑出丰富情感
2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从v2环境迁移至v3
v2 环境迁移至 v3
1. 需要pip安装requirements.txt更新环境
1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆github上的最新代码
2. 需要克隆 github 上的最新代码
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些v3新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
如果想用音频超分功能缓解v3模型生成24k音频觉得闷的问题需要下载额外的模型参数参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
## 待办事项清单
@ -299,16 +303,21 @@ python webui.py
- [ ] 模型混合。
## (附加)命令行运行方式
使用命令行打开UVR5的WebUI
````
使用命令行打开 UVR5 的 WebUI
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
````
```
<!-- 如果打不开浏览器请按照下面的格式进行UVR处理这是使用mdxnet进行音频处理的方式
````
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
```` -->
这是使用命令行完成数据集的音频切分的方式
````
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -316,17 +325,22 @@ python audio_slicer.py \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
````
这是使用命令行完成数据集ASR处理的方式仅限中文
````
python tools/asr/funasr_asr.py -i <input> -o <output>
````
通过Faster_Whisper进行ASR处理除中文之外的ASR标记
```
这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
没有进度条GPU 性能可能会导致时间延迟)
没有进度条GPU性能可能会导致时间延迟
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
启用自定义列表保存路径
## 致谢
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
特别感谢以下项目和贡献者:
### 理论研究
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 预训练模型
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 推理用文本前端
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI 工具
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -20,17 +20,17 @@
## 機能:
1. **Zero-Shot TTS:** たった5秒間の音声サンプルで、即座にテキストからその音声に変換できます。
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。
2. **Few-Shot TTS:** わずか1分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
4. **WebUI ツール:** 統合されたツールは、音声と伴奏BGM等の分離、トレーニングセットの自動セグメンテーション、ASR中国語のみ、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成やGPT/SoVITSモデルのトレーニング等を非常に簡単に行えます。
4. **WebUI ツール:** 統合されたツールは、音声と伴奏BGM の分離、トレーニングセットの自動セグメンテーション、ASR中国語のみ、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
声の事前学習無しかつFew-Shotでトレーニングされたモデルのデモ:
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
@ -43,7 +43,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
- Python 3.9, PyTorch 2.0.1, CUDA 11
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
- Python 3.9, PyTorch 2.2.2, CPUデバイス
- Python 3.9, PyTorch 2.2.2, CPU デバイス
_注記: numba==0.56.4 は py<3.11 が必要です_
@ -61,22 +61,22 @@ bash install.sh
### macOS
**注MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練することを強く推奨します。**
**注Mac GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。**
1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。
2. `brew install ffmpeg` を実行してFFmpegをインストールします。
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。
2. `brew install ffmpeg` を実行して FFmpeg をインストールします。
3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
### 手動インストール
#### FFmpegをインストールします。
#### FFmpeg をインストールします。
##### Conda ユーザー
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
##### MacOS ユーザー
```bash
brew install ffmpeg
```
@ -104,6 +105,7 @@ brew install ffmpeg
#### 依存関係をインストールします
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -138,17 +140,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。中国語TTSのみ
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語 TTS のみ)
3. UVR5ボーカル/伴奏BGM等分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
3. UVR5ボーカル/伴奏BGM 等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
- UVR5でbs_roformerまたはmel_band_roformerモデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformerクラスのモデルとして認識されます。
- UVR5 bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`もペアです。
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_band_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
4. 中国語ASR追加機能の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
4. 中国語 ASR追加機能の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
5. 英語または日本語のASR追加機能を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
5. 英語または日本語の ASR追加機能を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
## データセット形式
@ -169,14 +171,15 @@ vocal_path|speaker_name|language|text
```
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
## 微調整と推論
### WebUIを開く
### WebUI を開く
#### 統合パッケージ利用者
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。
V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
#### その他
@ -184,12 +187,13 @@ V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
python webui.py <言語(オプション)>
```
V1に切り替えたい場合は
V1 に切り替えたい場合は
```bash
python webui.py v1 <言語(オプション)>
```
またはWebUIで手動でバージョンを切り替えてください。
または WebUI で手動でバージョンを切り替えてください。
### 微調整
@ -202,25 +206,27 @@ python webui.py v1 <言語(オプション)>
5. ASR転写を校正する
6. 次のタブに移動し、モデルを微調整する
### 推論WebUIを開く
### 推論 WebUI を開く
#### 統合パッケージ利用者
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
#### その他
```bash
python GPT_SoVITS/inference_webui.py <言語(オプション)>
```
または
```bash
python webui.py
```
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
## V2リリースート
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
## V2 リリースノート
新機能:
@ -228,21 +234,21 @@ python webui.py
2. 最適化されたテキストフロントエンド
3. 事前学習済みモデルが2千時間から5千時間に拡張
3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
4. 低品質の参照音声に対する合成品質の向上
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1環境からV2を使用するには:
V1 環境から V2 を使用するには:
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
2. 最新のコードをgithubからクローン
2. 最新のコードを github からクローン
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)からV2の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
## V3 リリースノート
@ -250,19 +256,19 @@ V1環境からV2を使用するには:
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。
2. GPTモデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
[詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 環境から v3 を使用する方法:
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
2. GitHubから最新のコードをクローンします。
2. GitHub から最新のコードをクローンします。
3. v3の事前学習済みモデルs1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
3. v3 の事前学習済みモデルs1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
## Todo リスト
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
- [ ] モデルミックス
## (追加の) コマンドラインから実行する方法
コマンド ラインを使用して UVR5 の WebUI を開きます
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
@ -303,16 +314,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> \
--hop_size <step_size_for_computing_volume_curve>
```
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASRマーキング)
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
カスタムリストの保存パスが有効になっています
## クレジット
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
特に以下のプロジェクトと貢献者に感謝します:
### 理論研究
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 事前学習モデル
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 推論用テキストフロントエンド
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI ツール
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -70,7 +70,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
##### MacOS 사용자
```bash
brew install ffmpeg
```
@ -106,6 +107,7 @@ brew install ffmpeg
#### 의존성 설치
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -147,9 +149,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/uvr5/uvr5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml`도 한 쌍입니다.
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_band_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
@ -195,6 +197,7 @@ V1으로 전환하려면,
```bash
python webui.py v1 <언어(옵션)>
```
또는 WebUI에서 수동으로 버전을 전환하십시오.
### 미세 조정
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
```bash
python GPT_SoVITS/inference_webui.py <언어(옵션)>
```
또는
```bash
python webui.py
```
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
## V2 릴리스 노트
@ -238,7 +243,7 @@ python webui.py
4. 저품질 참조 오디오에 대한 합성 품질 향상
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 환경에서 V2를 사용하려면:
@ -248,7 +253,7 @@ V1 환경에서 V2를 사용하려면:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
## V3 릴리스 노트
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 환경에서 v3 사용하기:
@ -268,8 +273,7 @@ v2 환경에서 v3 사용하기:
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
## 할 일 목록
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
- [ ] 모델 블렌딩.
## (추가적인) 명령줄에서 실행하는 방법
명령줄을 사용하여 UVR5용 WebUI 열기
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
@ -311,16 +320,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> \
--hop_size <step_size_for_computing_volume_curve>
```
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
사용자 정의 목록 저장 경로가 활성화되었습니다.
## 감사의 말
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
### 이론 연구
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 사전 학습 모델
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 추론용 텍스트 프론트엔드
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI 도구
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

View File

@ -72,7 +72,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
##### MacOS Kullanıcıları
```bash
brew install ffmpeg
```
@ -106,6 +107,7 @@ brew install ffmpeg
#### Bağımlılıkları Yükleme
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -142,9 +144,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/uvr5/uvr5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_band_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
```bash
python webui.py v1 <dil(isteğe bağlı)>
```
veya WebUI'de manuel olarak sürüm değiştirin.
### İnce Ayar
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
```bash
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
```
VEYA
```bash
python webui.py
```
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
## V2 Sürüm Notları
@ -235,7 +240,7 @@ Yeni Özellikler:
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
[detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 ortamından V2'yi kullanmak için:
@ -245,7 +250,7 @@ V1 ortamından V2'yi kullanmak için:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
## V3 Sürüm Notları
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
[daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
### v2 ortamında v3 kullanımı:
@ -265,7 +270,7 @@ V1 ortamından V2'yi kullanmak için:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
## Yapılacaklar Listesi
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
- [ ] model karışımı
## (Ekstra) Komut satırından çalıştırma yöntemi
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin. Bu, ses işleme için mdxnet kullanır.
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
```
python audio_slicer.py \
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
@ -306,16 +316,21 @@ python audio_slicer.py \
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı> \
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
```
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
```
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
```
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
```
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
```
Özel bir liste kaydetme yolu etkinleştirildi
## Katkı Verenler
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
### Teorik Araştırma
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Önceden Eğitilmiş Modeller
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Tahmin İçin Metin Ön Ucu
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Araçları
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

1
extra-req.txt Normal file
View File

@ -0,0 +1 @@
faster-whisper

View File

@ -27,7 +27,8 @@
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r requirements.txt"
"!pip install -r requirements.txt\n",
"!pip install -r extra-req.txt --no-deps"
]
},
{

View File

@ -1,15 +1,17 @@
#!/bin/bash
set -e
# 安装构建工具
# Install build tools
echo "Installing GCC..."
conda install -c conda-forge gcc=14
conda install -c conda-forge gcc=14 -y
echo "Installing G++..."
conda install -c conda-forge gxx
conda install -c conda-forge gxx -y
echo "Installing ffmpeg and cmake..."
conda install ffmpeg cmake
conda install ffmpeg cmake -y
# 设置编译环境
# Set up build environment
@ -18,7 +20,7 @@ export CC="$CONDA_PREFIX/bin/gcc"
export CXX="$CONDA_PREFIX/bin/g++"
echo "Checking for CUDA installation..."
if command -v nvidia-smi &> /dev/null; then
if command -v nvidia-smi &>/dev/null; then
USE_CUDA=true
echo "CUDA found."
else
@ -26,7 +28,6 @@ else
USE_CUDA=false
fi
if [ "$USE_CUDA" = false ]; then
echo "Checking for ROCm installation..."
if [ -d "/opt/rocm" ]; then
@ -48,7 +49,7 @@ fi
if [ "$USE_CUDA" = true ]; then
echo "Installing PyTorch with CUDA support..."
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
elif [ "$USE_ROCM" = true ] ; then
elif [ "$USE_ROCM" = true ]; then
echo "Installing PyTorch with ROCm support..."
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
else
@ -56,21 +57,53 @@ else
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
fi
echo "Installing Python dependencies from requirements.txt..."
# 刷新环境
# Refresh environment
hash -r
# pyopenjtalk installation
#
# Download the latest pyopenjtalk source distribution from PyPI, rewrite the
# cmake_minimum_required() line of the bundled open_jtalk CMakeLists.txt to the
# version range 3.5...3.31, repack the archive and install the patched copy.
# NOTE(review): the rewrite is presumably needed so the bundled sources
# configure under newer CMake releases -- confirm against upstream.

# jq is used below to read the version field from the PyPI JSON response.
conda install jq -y

OS_TYPE=$(uname)

PACKAGE_NAME="pyopenjtalk"

# Latest released version as reported by the PyPI JSON API.
VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)

# Derive the archive name from the version instead of globbing with `ls`, so a
# stale pyopenjtalk-*.tar.gz left in the working directory cannot be picked up.
TAR_FILE="$PACKAGE_NAME-$VERSION.tar.gz"
DIR_NAME="${TAR_FILE%.tar.gz}"

wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$TAR_FILE"

tar -xzf "$TAR_FILE"
rm "$TAR_FILE"

CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"

# `uname` prints "Darwin" (capital D) on macOS and [[ == ]] matching is
# case-sensitive, so compare against the exact name. macOS ships BSD sed,
# whose -i option requires an explicit (here empty) backup suffix.
if [[ "$OS_TYPE" == "Darwin" ]]; then
    sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
else
    sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
fi

# Repack the patched tree under the original archive name and install it.
tar -czf "$TAR_FILE" "$DIR_NAME"

pip install "$TAR_FILE"

# Remove the temporary archive and the extracted source tree.
rm -rf "$TAR_FILE" "$DIR_NAME"
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
echo "Update to WSL compatible runtime lib..."
location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
cd ${location}/torch/lib/
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
cd "${location}"/torch/lib/ || exit
rm libhsa-runtime64.so*
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
fi
echo "Installation completed successfully!"

View File

@ -3,7 +3,7 @@ scipy
tensorboard
librosa==0.9.2
numba==0.56.4
pytorch-lightning
pytorch-lightning>2.0
gradio>=4.0,<=4.24.0
ffmpeg-python
onnxruntime; sys_platform == 'darwin'
@ -26,7 +26,6 @@ jieba_fast
jieba
split-lang
fast_langdetect>=0.3.0
Faster_Whisper
wordsegment
rotary_embedding_torch
ToJyutping
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
fastapi<0.112.2
x_transformers
torchmetrics<=1.5
attrdict
pydantic<=2.10.6
ctranslate2>=4.0,<5
huggingface_hub>=0.13
tokenizers>=0.13,<1
av>=11
tqdm

View File

@ -298,9 +298,9 @@ def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language)
else:
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
#####v3暂不支持加速推理
if version=="v3":
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
# #####v3暂不支持加速推理
# if version=="v3":
# cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
if p_tts_inference is None:
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
@ -849,8 +849,8 @@ def switch_version(version_):
{'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \
{'__type__': 'update', "visible": True if version!="v3"else False}, \
{'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \
{'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \
{'__type__': 'update', "visible": True if version== "v3" else False}
{'__type__': 'update', "interactive": True, "value": False}, \
{'__type__': 'update', "visible": True if version== "v3" else False} # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer
if os.path.exists('GPT_SoVITS/text/G2PWModel'):...
else: