mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-05 19:41:56 +08:00
Compare commits
32 Commits
20250228v3
...
main
Author | SHA1 | Date | |
---|---|---|---|
|
9da7e17efe | ||
|
b0de354c63 | ||
|
41090e5a7c | ||
|
605b380114 | ||
|
9f8d455130 | ||
|
7abae557fb | ||
|
6a60e5edb1 | ||
|
28bdff356f | ||
|
03b662a769 | ||
|
6c468583c5 | ||
|
ee4a466f79 | ||
|
b65ea9181e | ||
|
c0ce55a132 | ||
|
13573a1b06 | ||
|
fef65d40fe | ||
|
b0e465eb72 | ||
|
f1332ff53a | ||
|
b88bd391fc | ||
|
4635cb4293 | ||
|
86e6dea694 | ||
|
d7c24e9ac9 | ||
|
6c1c1bb72a | ||
|
265586990c | ||
|
7394dc7b0c | ||
|
165882d64f | ||
|
271db6a4de | ||
|
053a356ffe | ||
|
fe2f04bdb8 | ||
|
6dd2f72090 | ||
|
959a2ddbeb | ||
|
bb8a8efeca | ||
|
df33574a26 |
182
.gitignore
vendored
182
.gitignore
vendored
@ -17,4 +17,184 @@ SoVITS_weights_v3
|
||||
TEMP
|
||||
weight.json
|
||||
ffmpeg*
|
||||
ffprobe*
|
||||
ffprobe*
|
||||
cfg.json
|
||||
speakers.json
|
||||
ref_audios
|
||||
tools/AP_BWE_main/24kto48k/*
|
||||
!tools/AP_BWE_main/24kto48k/readme.txt
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# UV
|
||||
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
#uv.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||
.pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
# Ruff stuff:
|
||||
.ruff_cache/
|
||||
|
||||
# PyPI configuration file
|
||||
.pypirc
|
||||
|
@ -5,7 +5,7 @@ from typing import List, Optional
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from AR.models.utils import make_pad_mask
|
||||
from AR.models.utils import make_pad_mask, make_pad_mask_left
|
||||
from AR.models.utils import (
|
||||
topk_sampling,
|
||||
sample,
|
||||
@ -162,7 +162,7 @@ class T2SBlock:
|
||||
)
|
||||
return x, k_cache, v_cache
|
||||
|
||||
def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor, attn_mask:Optional[torch.Tensor]=None, torch_sdpa:bool=True):
|
||||
def decode_next_token(self, x:torch.Tensor, k_cache:torch.Tensor, v_cache:torch.Tensor, attn_mask:torch.Tensor=None, torch_sdpa:bool=True):
|
||||
q, k, v = F.linear(x, self.qkv_w, self.qkv_b).chunk(3, dim=-1)
|
||||
|
||||
k_cache = torch.cat([k_cache, k], dim=1)
|
||||
@ -178,7 +178,7 @@ class T2SBlock:
|
||||
|
||||
|
||||
if torch_sdpa:
|
||||
attn = F.scaled_dot_product_attention(q, k, v)
|
||||
attn = F.scaled_dot_product_attention(q, k, v, (~attn_mask) if attn_mask is not None else None)
|
||||
else:
|
||||
attn = scaled_dot_product_attention(q, k, v, attn_mask)
|
||||
|
||||
@ -223,7 +223,7 @@ class T2STransformer:
|
||||
self, x:torch.Tensor,
|
||||
k_cache: List[torch.Tensor],
|
||||
v_cache: List[torch.Tensor],
|
||||
attn_mask : Optional[torch.Tensor]=None,
|
||||
attn_mask : torch.Tensor=None,
|
||||
torch_sdpa:bool=True
|
||||
):
|
||||
for i in range(self.num_blocks):
|
||||
@ -573,71 +573,88 @@ class Text2SemanticDecoder(nn.Module):
|
||||
x_item = self.ar_text_embedding(x_item.unsqueeze(0))
|
||||
x_item = x_item + self.bert_proj(bert_item.transpose(0, 1).unsqueeze(0))
|
||||
x_item = self.ar_text_position(x_item).squeeze(0)
|
||||
x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0]<max_len else x_item
|
||||
# x_item = F.pad(x_item,(0,0,0,max_len-x_item.shape[0]),value=0) if x_item.shape[0]<max_len else x_item ### padding right
|
||||
x_item = F.pad(x_item,(0,0,max_len-x_item.shape[0],0),value=0) if x_item.shape[0]<max_len else x_item ### padding left
|
||||
x_list.append(x_item)
|
||||
x = torch.stack(x_list, dim=0)
|
||||
x:torch.Tensor = torch.stack(x_list, dim=0)
|
||||
|
||||
|
||||
# AR Decoder
|
||||
y = prompts
|
||||
|
||||
x_len = x.shape[1]
|
||||
x_attn_mask = torch.zeros((x_len, x_len), dtype=torch.bool)
|
||||
stop = False
|
||||
|
||||
k_cache = None
|
||||
v_cache = None
|
||||
################### first step ##########################
|
||||
if y is not None:
|
||||
y_emb = self.ar_audio_embedding(y)
|
||||
y_len = y_emb.shape[1]
|
||||
prefix_len = y.shape[1]
|
||||
y_lens = torch.LongTensor([y_emb.shape[1]]*y_emb.shape[0]).to(x.device)
|
||||
y_pos = self.ar_audio_position(y_emb)
|
||||
xy_pos = torch.concat([x, y_pos], dim=1)
|
||||
ref_free = False
|
||||
else:
|
||||
y_emb = None
|
||||
y_len = 0
|
||||
prefix_len = 0
|
||||
y_lens = torch.LongTensor([y_len]*x.shape[0]).to(x.device)
|
||||
y_pos = None
|
||||
xy_pos = x
|
||||
y = torch.zeros(x.shape[0], 0, dtype=torch.int, device=x.device)
|
||||
ref_free = True
|
||||
assert y is not None, "Error: Prompt free is not supported batch_infer!"
|
||||
ref_free = False
|
||||
|
||||
y_emb = self.ar_audio_embedding(y)
|
||||
y_len = y_emb.shape[1]
|
||||
prefix_len = y.shape[1]
|
||||
y_lens = torch.LongTensor([y_emb.shape[1]]*y_emb.shape[0]).to(x.device)
|
||||
y_pos = self.ar_audio_position(y_emb)
|
||||
xy_pos = torch.concat([x, y_pos], dim=1)
|
||||
|
||||
|
||||
|
||||
##### create mask #####
|
||||
bsz = x.shape[0]
|
||||
src_len = x_len + y_len
|
||||
y_paddind_mask = make_pad_mask(y_lens, y_len)
|
||||
x_paddind_mask = make_pad_mask(x_lens, max_len)
|
||||
y_paddind_mask = make_pad_mask_left(y_lens, y_len)
|
||||
x_paddind_mask = make_pad_mask_left(x_lens, max_len)
|
||||
|
||||
# (bsz, x_len + y_len)
|
||||
xy_padding_mask = torch.concat([x_paddind_mask, y_paddind_mask], dim=1)
|
||||
padding_mask = torch.concat([x_paddind_mask, y_paddind_mask], dim=1)
|
||||
|
||||
x_mask = F.pad(
|
||||
torch.zeros(x_len, x_len, dtype=torch.bool, device=x.device),
|
||||
(0, y_len),
|
||||
value=True,
|
||||
)
|
||||
|
||||
x_mask = F.pad(
|
||||
x_attn_mask,
|
||||
(0, y_len), ###xx的纯0扩展到xx纯0+xy纯1,(x,x+y)
|
||||
value=True,
|
||||
)
|
||||
y_mask = F.pad( ###yy的右上1扩展到左边xy的0,(y,x+y)
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool), diagonal=1),
|
||||
torch.triu(torch.ones(y_len, y_len, dtype=torch.bool, device=x.device), diagonal=1),
|
||||
(x_len, 0),
|
||||
value=False,
|
||||
)
|
||||
|
||||
xy_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).repeat(bsz, 1, 1).to(x.device)
|
||||
_xy_padding_mask = xy_padding_mask.view(bsz, 1, src_len).repeat(1, src_len, 1)
|
||||
|
||||
for i in range(bsz):
|
||||
l = x_lens[i]
|
||||
_xy_padding_mask[i,l:max_len,:]=True
|
||||
|
||||
xy_attn_mask = xy_mask.logical_or(_xy_padding_mask)
|
||||
xy_attn_mask = xy_attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1)
|
||||
xy_attn_mask = xy_attn_mask.bool()
|
||||
xy_padding_mask = xy_padding_mask.view(bsz, src_len, 1)
|
||||
causal_mask = torch.concat([x_mask, y_mask], dim=0).view(1 , src_len, src_len).repeat(bsz, 1, 1).to(x.device)
|
||||
# padding_mask = padding_mask.unsqueeze(1) * padding_mask.unsqueeze(2) ### [b, x+y, x+y]
|
||||
### 上面是错误的,会导致padding的token被"看见"
|
||||
|
||||
# 正确的padding_mask应该是:
|
||||
# | pad_len | x_len | y_len |
|
||||
# [[PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6], 前3行按理说也应该被mask掉,但是为了防止计算attention时不出现nan,还是保留了,不影响结果
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6]]
|
||||
|
||||
padding_mask = padding_mask.view(bsz, 1, src_len).repeat(1, src_len, 1)
|
||||
|
||||
attn_mask:torch.Tensor = causal_mask.logical_or(padding_mask)
|
||||
attn_mask = attn_mask.unsqueeze(1).expand(-1, self.num_head, -1, -1).bool()
|
||||
|
||||
|
||||
# 正确的attn_mask应该是这样的:
|
||||
# | pad_len | x_len | y_len |
|
||||
# [[PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS], 前3行按理说也应该被mask掉,但是为了防止计算attention时不出现nan,还是保留了,不影响结果
|
||||
# [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, EOS, EOS, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, EOS, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, EOS],
|
||||
# [PAD, PAD, PAD, 1, 2, 3, 4, 5, 6]]
|
||||
|
||||
|
||||
###### decode #####
|
||||
y_list = [None]*y.shape[0]
|
||||
@ -645,18 +662,18 @@ class Text2SemanticDecoder(nn.Module):
|
||||
idx_list = [None]*y.shape[0]
|
||||
for idx in tqdm(range(1500)):
|
||||
if idx == 0:
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, xy_padding_mask, False)
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, attn_mask, None)
|
||||
else:
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache, xy_attn_mask, False)
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.decode_next_token(xy_pos, k_cache, v_cache, attn_mask)
|
||||
logits = self.ar_predict_layer(
|
||||
xy_dec[:, -1]
|
||||
)
|
||||
|
||||
if idx == 0:
|
||||
xy_attn_mask = F.pad(xy_attn_mask[:,:,-1].unsqueeze(-2),(0,1),value=False)
|
||||
attn_mask = F.pad(attn_mask[:,:,-1].unsqueeze(-2),(0,1),value=False)
|
||||
logits = logits[:, :-1]
|
||||
else:
|
||||
xy_attn_mask = F.pad(xy_attn_mask,(0,1),value=False)
|
||||
attn_mask = F.pad(attn_mask,(0,1),value=False)
|
||||
|
||||
samples = sample(
|
||||
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
|
||||
@ -686,7 +703,7 @@ class Text2SemanticDecoder(nn.Module):
|
||||
if reserved_idx_of_batch_for_y is not None:
|
||||
# index = torch.LongTensor(batch_idx_map).to(y.device)
|
||||
y = torch.index_select(y, dim=0, index=reserved_idx_of_batch_for_y)
|
||||
xy_attn_mask = torch.index_select(xy_attn_mask, dim=0, index=reserved_idx_of_batch_for_y)
|
||||
attn_mask = torch.index_select(attn_mask, dim=0, index=reserved_idx_of_batch_for_y)
|
||||
if k_cache is not None :
|
||||
for i in range(len(k_cache)):
|
||||
k_cache[i] = torch.index_select(k_cache[i], dim=0, index=reserved_idx_of_batch_for_y)
|
||||
|
@ -39,6 +39,39 @@ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
|
||||
return expaned_lengths >= lengths.unsqueeze(-1)
|
||||
|
||||
|
||||
def make_pad_mask_left(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Build a padding mask for sequences that are padded on the LEFT.

    Args:
      lengths:
        A 1-D tensor containing sentence lengths.
      max_len:
        The minimum width of the mask. The actual width is
        ``max(max_len, lengths.max())``.
    Returns:
      Return a 2-D bool tensor of shape ``(len(lengths), width)``, where
      masked (padding) positions are filled with `True` and non-masked
      positions are filled with `False`. Row ``i`` starts with
      ``width - lengths[i]`` `True` values.

    #>>> lengths = torch.tensor([1, 3, 2, 5])
    #>>> make_pad_mask_left(lengths)
    tensor([[ True,  True,  True,  True, False],
            [ True,  True, False, False, False],
            [ True,  True,  True, False, False],
            [False, False, False, False, False]])
    """
    assert lengths.ndim == 1, lengths.ndim
    # An empty batch has no maximum; keep the caller's width in that case.
    if lengths.numel() > 0:
        max_len = max(max_len, int(lengths.max()))
    seq_range = torch.arange(max_len, device=lengths.device)
    # Number of leading padding slots per row, shaped (n, 1) for broadcasting.
    pad_lens = (max_len - lengths).unsqueeze(-1)

    # Position j of row i is padding iff j < pad_lens[i].
    return seq_range.unsqueeze(0) < pad_lens
|
||||
|
||||
|
||||
# https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
|
||||
def top_k_top_p_filtering(
|
||||
logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
|
||||
|
@ -12,33 +12,33 @@ import torch
|
||||
|
||||
|
||||
def multi_head_attention_forward_patched(
|
||||
query: Tensor,
|
||||
key: Tensor,
|
||||
value: Tensor,
|
||||
embed_dim_to_check: int,
|
||||
num_heads: int,
|
||||
in_proj_weight: Optional[Tensor],
|
||||
in_proj_bias: Optional[Tensor],
|
||||
bias_k: Optional[Tensor],
|
||||
bias_v: Optional[Tensor],
|
||||
add_zero_attn: bool,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
embed_dim_to_check,
|
||||
num_heads,
|
||||
in_proj_weight,
|
||||
in_proj_bias,
|
||||
bias_k,
|
||||
bias_v,
|
||||
add_zero_attn,
|
||||
dropout_p: float,
|
||||
out_proj_weight: Tensor,
|
||||
out_proj_bias: Optional[Tensor],
|
||||
training: bool = True,
|
||||
key_padding_mask: Optional[Tensor] = None,
|
||||
need_weights: bool = True,
|
||||
attn_mask: Optional[Tensor] = None,
|
||||
use_separate_proj_weight: bool = False,
|
||||
q_proj_weight: Optional[Tensor] = None,
|
||||
k_proj_weight: Optional[Tensor] = None,
|
||||
v_proj_weight: Optional[Tensor] = None,
|
||||
static_k: Optional[Tensor] = None,
|
||||
static_v: Optional[Tensor] = None,
|
||||
average_attn_weights: bool = True,
|
||||
is_causal: bool = False,
|
||||
out_proj_weight,
|
||||
out_proj_bias,
|
||||
training = True,
|
||||
key_padding_mask = None,
|
||||
need_weights = True,
|
||||
attn_mask = None,
|
||||
use_separate_proj_weight = False,
|
||||
q_proj_weight = None,
|
||||
k_proj_weight = None,
|
||||
v_proj_weight = None,
|
||||
static_k = None,
|
||||
static_v = None,
|
||||
average_attn_weights = True,
|
||||
is_causal = False,
|
||||
cache=None,
|
||||
) -> Tuple[Tensor, Optional[Tensor]]:
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
query, key, value: map a query and a set of key-value pairs to an output.
|
||||
|
@ -2,7 +2,7 @@
|
||||
# LICENSE is in incl_licenses directory.
|
||||
|
||||
import torch.nn as nn
|
||||
from alias_free_activation.torch.resample import UpSample1d, DownSample1d
|
||||
from .resample import UpSample1d, DownSample1d
|
||||
|
||||
|
||||
class Activation1d(nn.Module):
|
||||
|
@ -3,8 +3,8 @@
|
||||
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
from alias_free_activation.torch.filter import LowPassFilter1d
|
||||
from alias_free_activation.torch.filter import kaiser_sinc_filter1d
|
||||
from .filter import LowPassFilter1d
|
||||
from .filter import kaiser_sinc_filter1d
|
||||
|
||||
|
||||
class UpSample1d(nn.Module):
|
||||
|
@ -14,10 +14,10 @@ import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||
|
||||
import activations
|
||||
from utils0 import init_weights, get_padding
|
||||
from alias_free_activation.torch.act import Activation1d as TorchActivation1d
|
||||
from env import AttrDict
|
||||
from . import activations
|
||||
from .utils0 import init_weights, get_padding
|
||||
from .alias_free_activation.torch.act import Activation1d as TorchActivation1d
|
||||
from .env import AttrDict
|
||||
|
||||
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
|
||||
|
||||
@ -93,7 +93,7 @@ class AMPBlock1(torch.nn.Module):
|
||||
|
||||
# Select which Activation1d, lazy-load cuda version to ensure backward compatibility
|
||||
if self.h.get("use_cuda_kernel", False):
|
||||
from alias_free_activation.cuda.activation1d import (
|
||||
from .alias_free_activation.cuda.activation1d import (
|
||||
Activation1d as CudaActivation1d,
|
||||
)
|
||||
|
||||
@ -193,7 +193,7 @@ class AMPBlock2(torch.nn.Module):
|
||||
|
||||
# Select which Activation1d, lazy-load cuda version to ensure backward compatibility
|
||||
if self.h.get("use_cuda_kernel", False):
|
||||
from alias_free_activation.cuda.activation1d import (
|
||||
from .alias_free_activation.cuda.activation1d import (
|
||||
Activation1d as CudaActivation1d,
|
||||
)
|
||||
|
||||
@ -271,7 +271,7 @@ class BigVGAN(
|
||||
|
||||
# Select which Activation1d, lazy-load cuda version to ensure backward compatibility
|
||||
if self.h.get("use_cuda_kernel", False):
|
||||
from alias_free_activation.cuda.activation1d import (
|
||||
from .alias_free_activation.cuda.activation1d import (
|
||||
Activation1d as CudaActivation1d,
|
||||
)
|
||||
|
||||
|
@ -15,7 +15,7 @@ from librosa.filters import mel as librosa_mel_fn
|
||||
import pathlib
|
||||
from tqdm import tqdm
|
||||
from typing import List, Tuple, Optional
|
||||
from env import AttrDict
|
||||
from .env import AttrDict
|
||||
|
||||
MAX_WAV_VALUE = 32767.0 # NOTE: 32768.0 -1 to prevent int16 overflow (results in popping sound in corner cases)
|
||||
|
||||
|
@ -9,7 +9,7 @@ from torch.nn.utils import weight_norm
|
||||
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
from meldataset import MAX_WAV_VALUE
|
||||
from .meldataset import MAX_WAV_VALUE
|
||||
from scipy.io.wavfile import write
|
||||
|
||||
|
||||
|
@ -3,7 +3,8 @@ import math
|
||||
import os, sys, gc
|
||||
import random
|
||||
import traceback
|
||||
|
||||
import time
|
||||
import torchaudio
|
||||
from tqdm import tqdm
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
@ -15,10 +16,11 @@ import torch
|
||||
import torch.nn.functional as F
|
||||
import yaml
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
|
||||
from tools.audio_sr import AP_BWE
|
||||
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||
from feature_extractor.cnhubert import CNHubert
|
||||
from module.models import SynthesizerTrn
|
||||
from module.models import SynthesizerTrn, SynthesizerTrnV3
|
||||
from peft import LoraConfig, get_peft_model
|
||||
import librosa
|
||||
from time import time as ttime
|
||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||
@ -26,10 +28,98 @@ from tools.my_utils import load_audio
|
||||
from module.mel_processing import spectrogram_torch
|
||||
from TTS_infer_pack.text_segmentation_method import splits
|
||||
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
|
||||
from BigVGAN.bigvgan import BigVGAN
|
||||
from module.mel_processing import spectrogram_torch,mel_spectrogram_torch
|
||||
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
||||
language=os.environ.get("language","Auto")
|
||||
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||
i18n = I18nAuto(language=language)
|
||||
|
||||
|
||||
|
||||
spec_min = -12
spec_max = 2


def norm_spec(x):
    """Linearly map ``x`` from ``[spec_min, spec_max]`` onto ``[-1, 1]``."""
    span = spec_max - spec_min
    return (x - spec_min) / span * 2 - 1


def denorm_spec(x):
    """Inverse of ``norm_spec``: map ``x`` from ``[-1, 1]`` back onto ``[spec_min, spec_max]``."""
    span = spec_max - spec_min
    return (x + 1) / 2 * span + spec_min
|
||||
def mel_fn(x):
    """Return ``mel_spectrogram_torch(x, ...)`` using the fixed settings below.

    The settings are 1024-point FFT and window, hop of 256, 100 mel bins,
    a 24000 Hz sampling rate, ``fmin=0``, ``fmax=None`` and ``center=False``.
    """
    return mel_spectrogram_torch(
        x,
        n_fft=1024,
        win_size=1024,
        hop_size=256,
        num_mels=100,
        sampling_rate=24000,
        fmin=0,
        fmax=None,
        center=False,
    )
|
||||
|
||||
|
||||
def speed_change(input_audio:np.ndarray, speed:float, sr:int):
    """Change the tempo of mono PCM audio by piping it through ffmpeg's ``atempo`` filter.

    Args:
        input_audio: Audio samples; cast to int16 before being sent to ffmpeg.
        speed: Factor passed to the ``atempo`` filter.
        sr: Sample rate declared for the input stream.

    Returns:
        An int16 NumPy array decoded from ffmpeg's stdout.
    """
    # Serialise the samples as a raw 16-bit PCM byte stream.
    pcm_bytes = input_audio.astype(np.int16).tobytes()

    # stdin (s16le, one channel) -> atempo -> stdout (s16le)
    pipeline = (
        ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1)
        .filter('atempo', speed)
        .output('pipe:', format='s16le', acodec='pcm_s16le')
    )
    stdout_bytes, _ = pipeline.run(input=pcm_bytes, capture_stdout=True, capture_stderr=True)

    # Decode the piped output back into a NumPy array.
    return np.frombuffer(stdout_bytes, np.int16)
|
||||
|
||||
|
||||
|
||||
resample_transform_dict={}
def resample(audio_tensor, sr0, device):
    """Resample ``audio_tensor`` from ``sr0`` Hz to 24000 Hz with a cached transform.

    Args:
        audio_tensor: Audio to resample; passed straight to the transform.
        sr0: Source sample rate.
        device: Device the ``Resample`` transform is moved to.

    Returns:
        The output of ``torchaudio.transforms.Resample(sr0, 24000)`` applied
        to ``audio_tensor``.
    """
    # The cache key includes the device: a transform built for one device must
    # not be reused after the caller switches to another (e.g. cpu -> cuda),
    # otherwise the transform and the input live on different devices.
    cache_key = (sr0, str(device))
    transform = resample_transform_dict.get(cache_key)
    if transform is None:
        transform = torchaudio.transforms.Resample(
            sr0, 24000
        ).to(device)
        resample_transform_dict[cache_key] = transform
    return transform(audio_tensor)
|
||||
|
||||
|
||||
class DictToAttrRecursive(dict):
    """A ``dict`` whose keys can also be read, written and deleted as attributes.

    Nested plain dicts are converted to ``DictToAttrRecursive`` as well, so
    ``cfg.a.b`` works for ``{"a": {"b": ...}}``. Every value assigned through
    attribute syntax is stored both as a dict item and as an instance
    attribute.
    """

    def __init__(self, input_dict):
        super().__init__(input_dict)
        for key, value in input_dict.items():
            # __setattr__ wraps nested dicts and writes both the item and the
            # attribute, so one call per key is enough.
            setattr(self, key, value)

    def __getattr__(self, item):
        # Only reached when normal attribute lookup fails; fall back to the mapping.
        try:
            return self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None

    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
        super(DictToAttrRecursive, self).__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
        try:
            del self[item]
        except KeyError:
            raise AttributeError(f"Attribute {item} not found") from None
        # Also drop the mirrored instance attribute; otherwise ``obj.item``
        # would still resolve after ``del obj.item``.
        self.__dict__.pop(item, None)
|
||||
|
||||
|
||||
class NO_PROMPT_ERROR(Exception):
    """Exception type with no extra behaviour.

    NOTE(review): presumably raised when a required prompt is missing -
    confirm against the raise sites.
    """
|
||||
|
||||
|
||||
# configs/tts_infer.yaml
|
||||
"""
|
||||
custom:
|
||||
@ -56,11 +146,19 @@ default_v2:
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
|
||||
version: v2
|
||||
default_v3:
|
||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
device: cpu
|
||||
is_half: false
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
|
||||
version: v3
|
||||
"""
|
||||
|
||||
def set_seed(seed:int):
|
||||
seed = int(seed)
|
||||
seed = seed if seed != -1 else random.randrange(1 << 32)
|
||||
seed = seed if seed != -1 else random.randint(0, 2**32 - 1)
|
||||
print(f"Set seed to {seed}")
|
||||
os.environ['PYTHONHASHSEED'] = str(seed)
|
||||
random.seed(seed)
|
||||
@ -82,7 +180,7 @@ def set_seed(seed:int):
|
||||
|
||||
class TTS_Config:
|
||||
default_configs={
|
||||
"default":{
|
||||
"v1":{
|
||||
"device": "cpu",
|
||||
"is_half": False,
|
||||
"version": "v1",
|
||||
@ -91,7 +189,7 @@ class TTS_Config:
|
||||
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
|
||||
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
|
||||
},
|
||||
"default_v2":{
|
||||
"v2":{
|
||||
"device": "cpu",
|
||||
"is_half": False,
|
||||
"version": "v2",
|
||||
@ -100,6 +198,15 @@ class TTS_Config:
|
||||
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
|
||||
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
|
||||
},
|
||||
"v3":{
|
||||
"device": "cpu",
|
||||
"is_half": False,
|
||||
"version": "v3",
|
||||
"t2s_weights_path": "GPT_SoVITS/pretrained_models/s1v3.ckpt",
|
||||
"vits_weights_path": "GPT_SoVITS/pretrained_models/s2Gv3.pth",
|
||||
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
|
||||
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
|
||||
},
|
||||
}
|
||||
configs:dict = None
|
||||
v1_languages:list = ["auto", "en", "zh", "ja", "all_zh", "all_ja"]
|
||||
@ -136,35 +243,42 @@ class TTS_Config:
|
||||
|
||||
assert isinstance(configs, dict)
|
||||
version = configs.get("version", "v2").lower()
|
||||
assert version in ["v1", "v2"]
|
||||
self.default_configs["default"] = configs.get("default", self.default_configs["default"])
|
||||
self.default_configs["default_v2"] = configs.get("default_v2", self.default_configs["default_v2"])
|
||||
|
||||
default_config_key = "default"if version=="v1" else "default_v2"
|
||||
self.configs:dict = configs.get("custom", deepcopy(self.default_configs[default_config_key]))
|
||||
assert version in ["v1", "v2", "v3"]
|
||||
self.default_configs[version] = configs.get(version, self.default_configs[version])
|
||||
self.configs:dict = configs.get("custom", deepcopy(self.default_configs[version]))
|
||||
|
||||
|
||||
self.device = self.configs.get("device", torch.device("cpu"))
|
||||
if "cuda" in str(self.device) and not torch.cuda.is_available():
|
||||
print(f"Warning: CUDA is not available, set device to CPU.")
|
||||
self.device = torch.device("cpu")
|
||||
|
||||
self.is_half = self.configs.get("is_half", False)
|
||||
# if str(self.device) == "cpu" and self.is_half:
|
||||
# print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
|
||||
# self.is_half = False
|
||||
|
||||
self.version = version
|
||||
self.t2s_weights_path = self.configs.get("t2s_weights_path", None)
|
||||
self.vits_weights_path = self.configs.get("vits_weights_path", None)
|
||||
self.bert_base_path = self.configs.get("bert_base_path", None)
|
||||
self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None)
|
||||
self.languages = self.v2_languages if self.version=="v2" else self.v1_languages
|
||||
self.languages = self.v1_languages if self.version=="v1" else self.v2_languages
|
||||
|
||||
self.is_v3_synthesizer:bool = False
|
||||
|
||||
|
||||
if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)):
|
||||
self.t2s_weights_path = self.default_configs[default_config_key]['t2s_weights_path']
|
||||
self.t2s_weights_path = self.default_configs[version]['t2s_weights_path']
|
||||
print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}")
|
||||
if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)):
|
||||
self.vits_weights_path = self.default_configs[default_config_key]['vits_weights_path']
|
||||
self.vits_weights_path = self.default_configs[version]['vits_weights_path']
|
||||
print(f"fall back to default vits_weights_path: {self.vits_weights_path}")
|
||||
if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)):
|
||||
self.bert_base_path = self.default_configs[default_config_key]['bert_base_path']
|
||||
self.bert_base_path = self.default_configs[version]['bert_base_path']
|
||||
print(f"fall back to default bert_base_path: {self.bert_base_path}")
|
||||
if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)):
|
||||
self.cnhuhbert_base_path = self.default_configs[default_config_key]['cnhuhbert_base_path']
|
||||
self.cnhuhbert_base_path = self.default_configs[version]['cnhuhbert_base_path']
|
||||
print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}")
|
||||
self.update_configs()
|
||||
|
||||
@ -187,7 +301,7 @@ class TTS_Config:
|
||||
else:
|
||||
print(i18n("路径不存在,使用默认配置"))
|
||||
self.save_configs(configs_path)
|
||||
with open(configs_path, 'r') as f:
|
||||
with open(configs_path, 'r', encoding='utf-8') as f:
|
||||
configs = yaml.load(f, Loader=yaml.FullLoader)
|
||||
|
||||
return configs
|
||||
@ -216,7 +330,7 @@ class TTS_Config:
|
||||
|
||||
def update_version(self, version:str)->None:
    """Record ``version`` and select the language list that goes with it.

    Only ``"v1"`` uses ``v1_languages``; every other version (v2, v3) uses
    ``v2_languages``.
    """
    self.version = version
    self.languages = self.v1_languages if self.version=="v1" else self.v2_languages
|
||||
|
||||
def __str__(self):
|
||||
self.configs = self.update_configs()
|
||||
@ -244,10 +358,13 @@ class TTS:
|
||||
self.configs:TTS_Config = TTS_Config(configs)
|
||||
|
||||
self.t2s_model:Text2SemanticLightningModule = None
|
||||
self.vits_model:SynthesizerTrn = None
|
||||
self.vits_model:Union[SynthesizerTrn, SynthesizerTrnV3] = None
|
||||
self.bert_tokenizer:AutoTokenizer = None
|
||||
self.bert_model:AutoModelForMaskedLM = None
|
||||
self.cnhuhbert_model:CNHubert = None
|
||||
self.bigvgan_model:BigVGAN = None
|
||||
self.sr_model:AP_BWE = None
|
||||
self.sr_model_not_exist:bool = False
|
||||
|
||||
self._init_models()
|
||||
|
||||
@ -302,38 +419,82 @@ class TTS:
|
||||
self.bert_model = self.bert_model.half()
|
||||
|
||||
def init_vits_weights(self, weights_path: str):
|
||||
print(f"Loading VITS weights from {weights_path}")
|
||||
|
||||
self.configs.vits_weights_path = weights_path
|
||||
dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
||||
hps = dict_s2["config"]
|
||||
if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322:
|
||||
self.configs.update_version("v1")
|
||||
else:
|
||||
self.configs.update_version("v2")
|
||||
self.configs.save_configs()
|
||||
version, model_version, if_lora_v3=get_sovits_version_from_path_fast(weights_path)
|
||||
path_sovits_v3=self.configs.default_configs["v3"]["vits_weights_path"]
|
||||
|
||||
if if_lora_v3==True and os.path.exists(path_sovits_v3)==False:
|
||||
info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
|
||||
raise FileExistsError(info)
|
||||
|
||||
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
|
||||
dict_s2 = load_sovits_new(weights_path)
|
||||
hps = dict_s2["config"]
|
||||
|
||||
hps["model"]["semantic_frame_rate"] = "25hz"
|
||||
if 'enc_p.text_embedding.weight'not in dict_s2['weight']:
|
||||
hps["model"]["version"] = "v2"#v3model,v2sybomls
|
||||
elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322:
|
||||
hps["model"]["version"] = "v1"
|
||||
else:
|
||||
hps["model"]["version"] = "v2"
|
||||
# version = hps["model"]["version"]
|
||||
|
||||
hps["model"]["version"] = self.configs.version
|
||||
self.configs.filter_length = hps["data"]["filter_length"]
|
||||
self.configs.segment_size = hps["train"]["segment_size"]
|
||||
self.configs.sampling_rate = hps["data"]["sampling_rate"]
|
||||
self.configs.hop_length = hps["data"]["hop_length"]
|
||||
self.configs.win_length = hps["data"]["win_length"]
|
||||
self.configs.n_speakers = hps["data"]["n_speakers"]
|
||||
self.configs.semantic_frame_rate = "25hz"
|
||||
self.configs.semantic_frame_rate = hps["model"]["semantic_frame_rate"]
|
||||
kwargs = hps["model"]
|
||||
vits_model = SynthesizerTrn(
|
||||
self.configs.filter_length // 2 + 1,
|
||||
self.configs.segment_size // self.configs.hop_length,
|
||||
n_speakers=self.configs.n_speakers,
|
||||
**kwargs
|
||||
)
|
||||
# print(f"self.configs.sampling_rate:{self.configs.sampling_rate}")
|
||||
|
||||
self.configs.update_version(model_version)
|
||||
|
||||
# print(f"model_version:{model_version}")
|
||||
# print(f'hps["model"]["version"]:{hps["model"]["version"]}')
|
||||
if model_version!="v3":
|
||||
vits_model = SynthesizerTrn(
|
||||
self.configs.filter_length // 2 + 1,
|
||||
self.configs.segment_size // self.configs.hop_length,
|
||||
n_speakers=self.configs.n_speakers,
|
||||
**kwargs
|
||||
)
|
||||
self.configs.is_v3_synthesizer = False
|
||||
else:
|
||||
vits_model = SynthesizerTrnV3(
|
||||
self.configs.filter_length // 2 + 1,
|
||||
self.configs.segment_size // self.configs.hop_length,
|
||||
n_speakers=self.configs.n_speakers,
|
||||
**kwargs
|
||||
)
|
||||
self.configs.is_v3_synthesizer = True
|
||||
self.init_bigvgan()
|
||||
if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
|
||||
del vits_model.enc_q
|
||||
|
||||
if if_lora_v3==False:
|
||||
print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}")
|
||||
else:
|
||||
print(f"Loading VITS pretrained weights from {weights_path}. {vits_model.load_state_dict(load_sovits_new(path_sovits_v3)['weight'], strict=False)}")
|
||||
lora_rank=dict_s2["lora_rank"]
|
||||
lora_config = LoraConfig(
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
r=lora_rank,
|
||||
lora_alpha=lora_rank,
|
||||
init_lora_weights=True,
|
||||
)
|
||||
vits_model.cfm = get_peft_model(vits_model.cfm, lora_config)
|
||||
print(f"Loading LoRA weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}")
|
||||
|
||||
vits_model.cfm = vits_model.cfm.merge_and_unload()
|
||||
|
||||
if hasattr(vits_model, "enc_q"):
|
||||
del vits_model.enc_q
|
||||
|
||||
vits_model = vits_model.to(self.configs.device)
|
||||
vits_model = vits_model.eval()
|
||||
vits_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
|
||||
self.vits_model = vits_model
|
||||
if self.configs.is_half and str(self.configs.device)!="cpu":
|
||||
self.vits_model = self.vits_model.half()
|
||||
@ -355,6 +516,30 @@ class TTS:
|
||||
if self.configs.is_half and str(self.configs.device)!="cpu":
|
||||
self.t2s_model = self.t2s_model.half()
|
||||
|
||||
|
||||
def init_bigvgan(self):
    """Load the BigVGAN model into ``self.bigvgan_model`` if it is not loaded yet.

    Does nothing when ``self.bigvgan_model`` is already set. Otherwise the
    pretrained model is loaded from the directory under ``now_dir``, weight
    norm is removed, the model is switched to eval mode, converted to half
    precision when ``self.configs.is_half`` is True, and moved to
    ``self.configs.device``.
    """
    if self.bigvgan_model is None:
        pretrained_dir = "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,)
        # use_cuda_kernel=True fails with "RuntimeError: Ninja is required to load C++ extensions".
        self.bigvgan_model = BigVGAN.from_pretrained(pretrained_dir, use_cuda_kernel=False)
        # Remove weight norm in the model and set it to eval mode.
        self.bigvgan_model.remove_weight_norm()
        self.bigvgan_model = self.bigvgan_model.eval()
        if self.configs.is_half == True:
            self.bigvgan_model = self.bigvgan_model.half()
        self.bigvgan_model = self.bigvgan_model.to(self.configs.device)
|
||||
|
||||
def init_sr_model(self):
    """Create the AP_BWE model in ``self.sr_model`` on first use.

    Does nothing when ``self.sr_model`` is already set. If constructing
    ``AP_BWE`` raises ``FileNotFoundError``, a notice is printed and
    ``self.sr_model_not_exist`` is set to True; on success it is set to False.
    """
    if self.sr_model is None:
        try:
            self.sr_model:AP_BWE = AP_BWE(self.configs.device, DictToAttrRecursive)
        except FileNotFoundError:
            print(i18n("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载好"))
            self.sr_model_not_exist = True
        else:
            self.sr_model_not_exist = False
|
||||
|
||||
|
||||
def enable_half_precision(self, enable: bool = True, save: bool = True):
|
||||
'''
|
||||
To enable half precision for the TTS model.
|
||||
@ -379,6 +564,8 @@ class TTS:
|
||||
self.bert_model =self.bert_model.half()
|
||||
if self.cnhuhbert_model is not None:
|
||||
self.cnhuhbert_model = self.cnhuhbert_model.half()
|
||||
if self.bigvgan_model is not None:
|
||||
self.bigvgan_model = self.bigvgan_model.half()
|
||||
else:
|
||||
if self.t2s_model is not None:
|
||||
self.t2s_model = self.t2s_model.float()
|
||||
@ -388,6 +575,8 @@ class TTS:
|
||||
self.bert_model = self.bert_model.float()
|
||||
if self.cnhuhbert_model is not None:
|
||||
self.cnhuhbert_model = self.cnhuhbert_model.float()
|
||||
if self.bigvgan_model is not None:
|
||||
self.bigvgan_model = self.bigvgan_model.float()
|
||||
|
||||
def set_device(self, device: torch.device, save: bool = True):
|
||||
'''
|
||||
@ -406,6 +595,11 @@ class TTS:
|
||||
self.bert_model = self.bert_model.to(device)
|
||||
if self.cnhuhbert_model is not None:
|
||||
self.cnhuhbert_model = self.cnhuhbert_model.to(device)
|
||||
if self.bigvgan_model is not None:
|
||||
self.bigvgan_model = self.bigvgan_model.to(device)
|
||||
if self.sr_model is not None:
|
||||
self.sr_model = self.sr_model.to(device)
|
||||
|
||||
|
||||
def set_ref_audio(self, ref_audio_path:str):
|
||||
'''
|
||||
@ -429,6 +623,11 @@ class TTS:
|
||||
self.prompt_cache["refer_spec"][0] = spec
|
||||
|
||||
def _get_ref_spec(self, ref_audio_path):
|
||||
raw_audio, raw_sr = torchaudio.load(ref_audio_path)
|
||||
raw_audio=raw_audio.to(self.configs.device).float()
|
||||
self.prompt_cache["raw_audio"] = raw_audio
|
||||
self.prompt_cache["raw_sr"] = raw_sr
|
||||
|
||||
audio = load_audio(ref_audio_path, int(self.configs.sampling_rate))
|
||||
audio = torch.FloatTensor(audio)
|
||||
maxx=audio.abs().max()
|
||||
@ -617,11 +816,11 @@ class TTS:
|
||||
Recovery the order of the audio according to the batch_index_list.
|
||||
|
||||
Args:
|
||||
data (List[list(np.ndarray)]): the out of order audio .
|
||||
data (List[list(torch.Tensor)]): the out of order audio .
|
||||
batch_index_list (List[list[int]]): the batch index list.
|
||||
|
||||
Returns:
|
||||
list (List[np.ndarray]): the data in the original order.
|
||||
list (List[torch.Tensor]): the data in the original order.
|
||||
'''
|
||||
length = len(sum(batch_index_list, []))
|
||||
_data = [None]*length
|
||||
@ -663,6 +862,8 @@ class TTS:
|
||||
"seed": -1, # int. random seed for reproducibility.
|
||||
"parallel_infer": True, # bool. whether to use parallel inference.
|
||||
"repetition_penalty": 1.35 # float. repetition penalty for T2S model.
|
||||
"sample_steps": 32, # int. number of sampling steps for VITS model V3.
|
||||
"super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
|
||||
}
|
||||
returns:
|
||||
Tuple[int, np.ndarray]: sampling rate and audio data.
|
||||
@ -690,6 +891,8 @@ class TTS:
|
||||
actual_seed = set_seed(seed)
|
||||
parallel_infer = inputs.get("parallel_infer", True)
|
||||
repetition_penalty = inputs.get("repetition_penalty", 1.35)
|
||||
sample_steps = inputs.get("sample_steps", 32)
|
||||
super_sampling = inputs.get("super_sampling", False)
|
||||
|
||||
if parallel_infer:
|
||||
print(i18n("并行推理模式已开启"))
|
||||
@ -704,11 +907,14 @@ class TTS:
|
||||
split_bucket = False
|
||||
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
|
||||
|
||||
if split_bucket and speed_factor==1.0:
|
||||
if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
|
||||
print(i18n("分桶处理模式已开启"))
|
||||
elif speed_factor!=1.0:
|
||||
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
|
||||
split_bucket = False
|
||||
elif self.configs.is_v3_synthesizer and parallel_infer:
|
||||
print(i18n("当开启并行推理模式时,SoVits V3模型不支持分桶处理,已自动关闭分桶处理"))
|
||||
split_bucket = False
|
||||
else:
|
||||
print(i18n("分桶处理模式已关闭"))
|
||||
|
||||
@ -724,12 +930,15 @@ class TTS:
|
||||
if not no_prompt_text:
|
||||
assert prompt_lang in self.configs.languages
|
||||
|
||||
if no_prompt_text and self.configs.is_v3_synthesizer:
|
||||
raise NO_PROMPT_ERROR("prompt_text cannot be empty when using SoVITS_V3")
|
||||
|
||||
if ref_audio_path in [None, ""] and \
|
||||
((self.prompt_cache["prompt_semantic"] is None) or (self.prompt_cache["refer_spec"] in [None, []])):
|
||||
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
|
||||
|
||||
###### setting reference audio and prompt text preprocessing ########
|
||||
t0 = ttime()
|
||||
t0 = time.perf_counter()
|
||||
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
|
||||
if not os.path.exists(ref_audio_path):
|
||||
raise ValueError(f"{ref_audio_path} not exists")
|
||||
@ -753,13 +962,13 @@ class TTS:
|
||||
if (prompt_text[-1] not in splits): prompt_text += "。" if prompt_lang != "en" else "."
|
||||
print(i18n("实际输入的参考文本:"), prompt_text)
|
||||
if self.prompt_cache["prompt_text"] != prompt_text:
|
||||
self.prompt_cache["prompt_text"] = prompt_text
|
||||
self.prompt_cache["prompt_lang"] = prompt_lang
|
||||
phones, bert_features, norm_text = \
|
||||
self.text_preprocessor.segment_and_extract_feature_for_text(
|
||||
prompt_text,
|
||||
prompt_lang,
|
||||
self.configs.version)
|
||||
self.prompt_cache["prompt_text"] = prompt_text
|
||||
self.prompt_cache["prompt_lang"] = prompt_lang
|
||||
self.prompt_cache["phones"] = phones
|
||||
self.prompt_cache["bert_features"] = bert_features
|
||||
self.prompt_cache["norm_text"] = norm_text
|
||||
@ -768,13 +977,12 @@ class TTS:
|
||||
|
||||
|
||||
###### text preprocessing ########
|
||||
t1 = ttime()
|
||||
t1 = time.perf_counter()
|
||||
data:list = None
|
||||
if not return_fragment:
|
||||
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
|
||||
if len(data) == 0:
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
|
||||
dtype=np.int16)
|
||||
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
||||
return
|
||||
|
||||
batch_index_list:list = None
|
||||
@ -821,15 +1029,16 @@ class TTS:
|
||||
return batch[0]
|
||||
|
||||
|
||||
t2 = ttime()
|
||||
t2 = time.perf_counter()
|
||||
try:
|
||||
print("############ 推理 ############")
|
||||
###### inference ######
|
||||
t_34 = 0.0
|
||||
t_45 = 0.0
|
||||
audio = []
|
||||
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
|
||||
for item in data:
|
||||
t3 = ttime()
|
||||
t3 = time.perf_counter()
|
||||
if return_fragment:
|
||||
item = make_batch(item)
|
||||
if item is None:
|
||||
@ -850,7 +1059,7 @@ class TTS:
|
||||
else:
|
||||
prompt = self.prompt_cache["prompt_semantic"].expand(len(all_phoneme_ids), -1).to(self.configs.device)
|
||||
|
||||
|
||||
print(f"############ {i18n('预测语义Token')} ############")
|
||||
pred_semantic_list, idx_list = self.t2s_model.model.infer_panel(
|
||||
all_phoneme_ids,
|
||||
all_phoneme_lens,
|
||||
@ -864,7 +1073,7 @@ class TTS:
|
||||
max_len=max_len,
|
||||
repetition_penalty=repetition_penalty,
|
||||
)
|
||||
t4 = ttime()
|
||||
t4 = time.perf_counter()
|
||||
t_34 += t4 - t3
|
||||
|
||||
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
|
||||
@ -884,70 +1093,92 @@ class TTS:
|
||||
# batch_audio_fragment = (self.vits_model.batched_decode(
|
||||
# pred_semantic, pred_semantic_len, batch_phones, batch_phones_len,refer_audio_spec
|
||||
# ))
|
||||
|
||||
if speed_factor == 1.0:
|
||||
# ## vits并行推理 method 2
|
||||
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
|
||||
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
||||
audio_frag_idx = [pred_semantic_list[i].shape[0]*2*upsample_rate for i in range(0, len(pred_semantic_list))]
|
||||
audio_frag_end_idx = [ sum(audio_frag_idx[:i+1]) for i in range(0, len(audio_frag_idx))]
|
||||
all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
|
||||
_batch_audio_fragment = (self.vits_model.decode(
|
||||
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
|
||||
).detach()[0, 0, :])
|
||||
audio_frag_end_idx.insert(0, 0)
|
||||
batch_audio_fragment= [_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))]
|
||||
else:
|
||||
# ## vits串行推理
|
||||
for i, idx in enumerate(idx_list):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
audio_fragment =(self.vits_model.decode(
|
||||
_pred_semantic, phones, refer_audio_spec, speed=speed_factor
|
||||
print(f"############ {i18n('合成音频')} ############")
|
||||
if not self.configs.is_v3_synthesizer:
|
||||
if speed_factor == 1.0:
|
||||
print(f"{i18n('并行合成中')}...")
|
||||
# ## vits并行推理 method 2
|
||||
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
|
||||
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
||||
audio_frag_idx = [pred_semantic_list[i].shape[0]*2*upsample_rate for i in range(0, len(pred_semantic_list))]
|
||||
audio_frag_end_idx = [ sum(audio_frag_idx[:i+1]) for i in range(0, len(audio_frag_idx))]
|
||||
all_pred_semantic = torch.cat(pred_semantic_list).unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||
_batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
|
||||
_batch_audio_fragment = (self.vits_model.decode(
|
||||
all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
|
||||
).detach()[0, 0, :])
|
||||
batch_audio_fragment.append(
|
||||
audio_fragment
|
||||
) ###试试重建不带上prompt部分
|
||||
audio_frag_end_idx.insert(0, 0)
|
||||
batch_audio_fragment= [_batch_audio_fragment[audio_frag_end_idx[i-1]:audio_frag_end_idx[i]] for i in range(1, len(audio_frag_end_idx))]
|
||||
else:
|
||||
# ## vits串行推理
|
||||
for i, idx in enumerate(tqdm(idx_list)):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
audio_fragment =(self.vits_model.decode(
|
||||
_pred_semantic, phones, refer_audio_spec, speed=speed_factor
|
||||
).detach()[0, 0, :])
|
||||
batch_audio_fragment.append(
|
||||
audio_fragment
|
||||
) ###试试重建不带上prompt部分
|
||||
else:
|
||||
if parallel_infer:
|
||||
print(f"{i18n('并行合成中')}...")
|
||||
audio_fragments = self.v3_synthesis_batched_infer(
|
||||
idx_list,
|
||||
pred_semantic_list,
|
||||
batch_phones,
|
||||
speed=speed_factor,
|
||||
sample_steps=sample_steps
|
||||
)
|
||||
batch_audio_fragment.extend(audio_fragments)
|
||||
else:
|
||||
for i, idx in enumerate(tqdm(idx_list)):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
audio_fragment = self.v3_synthesis(
|
||||
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
|
||||
)
|
||||
batch_audio_fragment.append(
|
||||
audio_fragment
|
||||
)
|
||||
|
||||
t5 = ttime()
|
||||
t5 = time.perf_counter()
|
||||
t_45 += t5 - t4
|
||||
if return_fragment:
|
||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
|
||||
yield self.audio_postprocess([batch_audio_fragment],
|
||||
self.configs.sampling_rate,
|
||||
output_sr,
|
||||
None,
|
||||
speed_factor,
|
||||
False,
|
||||
fragment_interval
|
||||
fragment_interval,
|
||||
super_sampling if self.configs.is_v3_synthesizer else False
|
||||
)
|
||||
else:
|
||||
audio.append(batch_audio_fragment)
|
||||
|
||||
if self.stop_flag:
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
|
||||
dtype=np.int16)
|
||||
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
||||
return
|
||||
|
||||
if not return_fragment:
|
||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45))
|
||||
if len(audio) == 0:
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
|
||||
dtype=np.int16)
|
||||
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
||||
return
|
||||
yield self.audio_postprocess(audio,
|
||||
self.configs.sampling_rate,
|
||||
output_sr,
|
||||
batch_index_list,
|
||||
speed_factor,
|
||||
split_bucket,
|
||||
fragment_interval
|
||||
fragment_interval,
|
||||
super_sampling if self.configs.is_v3_synthesizer else False
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
# 必须返回一个空音频, 否则会导致显存不释放。
|
||||
yield self.configs.sampling_rate, np.zeros(int(self.configs.sampling_rate),
|
||||
dtype=np.int16)
|
||||
yield 16000, np.zeros(int(16000), dtype=np.int16)
|
||||
# 重置模型, 否则会导致显存释放不完全。
|
||||
del self.t2s_model
|
||||
del self.vits_model
|
||||
@ -975,7 +1206,8 @@ class TTS:
|
||||
batch_index_list:list=None,
|
||||
speed_factor:float=1.0,
|
||||
split_bucket:bool=True,
|
||||
fragment_interval:float=0.3
|
||||
fragment_interval:float=0.3,
|
||||
super_sampling:bool=False,
|
||||
)->Tuple[int, np.ndarray]:
|
||||
zero_wav = torch.zeros(
|
||||
int(self.configs.sampling_rate * fragment_interval),
|
||||
@ -988,7 +1220,7 @@ class TTS:
|
||||
max_audio=torch.abs(audio_fragment).max()#简单防止16bit爆音
|
||||
if max_audio>1: audio_fragment/=max_audio
|
||||
audio_fragment:torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0)
|
||||
audio[i][j] = audio_fragment.cpu().numpy()
|
||||
audio[i][j] = audio_fragment
|
||||
|
||||
|
||||
if split_bucket:
|
||||
@ -997,8 +1229,21 @@ class TTS:
|
||||
# audio = [item for batch in audio for item in batch]
|
||||
audio = sum(audio, [])
|
||||
|
||||
audio = torch.cat(audio, dim=0)
|
||||
|
||||
if super_sampling:
|
||||
print(f"############ {i18n('音频超采样')} ############")
|
||||
t1 = time.perf_counter()
|
||||
self.init_sr_model()
|
||||
if not self.sr_model_not_exist:
|
||||
audio,sr=self.sr_model(audio.unsqueeze(0),sr)
|
||||
max_audio=np.abs(audio).max()
|
||||
if max_audio > 1: audio /= max_audio
|
||||
t2 = time.perf_counter()
|
||||
print(f"超采样用时:{t2-t1:.3f}s")
|
||||
else:
|
||||
audio = audio.cpu().numpy()
|
||||
|
||||
audio = np.concatenate(audio, 0)
|
||||
audio = (audio * 32768).astype(np.int16)
|
||||
|
||||
# try:
|
||||
@ -1010,25 +1255,200 @@ class TTS:
|
||||
return sr, audio
|
||||
|
||||
|
||||
def v3_synthesis(self,
                 semantic_tokens:torch.Tensor,
                 phones:torch.Tensor,
                 speed:float=1.0,
                 sample_steps:int=32
                 ):
    """Synthesize one audio fragment with the V3 synthesizer and BigVGAN.

    Reads the prompt from ``self.prompt_cache`` ("prompt_semantic", "phones",
    "refer_spec", "raw_audio", "raw_sr"), encodes prompt and target with
    ``self.vits_model.decode_encp``, generates the mel spectrogram chunk by
    chunk with ``self.vits_model.cfm.inference`` and converts it to a waveform
    with ``self.bigvgan_model``.

    Args:
        semantic_tokens: Semantic tokens of the text to synthesize.
        phones: Phoneme ids of the text to synthesize.
        speed: Speed value forwarded to ``decode_encp`` for the target text.
        sample_steps: Step count forwarded to ``cfm.inference``.

    Returns:
        ``wav_gen[0][0]`` of the BigVGAN output (a tensor, not moved to CPU).
    """
    device = self.configs.device
    cache = self.prompt_cache

    prompt_semantic_tokens = cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(device)
    prompt_phones = torch.LongTensor(cache["phones"]).unsqueeze(0).to(device)
    refer_audio_spec = cache["refer_spec"][0].to(dtype=self.precision, device=device)

    fea_ref, ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)

    ref_sr = cache["raw_sr"]
    ref_audio:torch.Tensor = cache["raw_audio"].to(device).float()
    if ref_audio.shape[0] == 2:
        # Two rows (presumably stereo channels) are averaged into one.
        ref_audio = ref_audio.mean(0).unsqueeze(0)
    if ref_sr != 24000:
        ref_audio = resample(ref_audio, ref_sr, device)

    mel2 = norm_spec(mel_fn(ref_audio))

    # Trim reference mel and reference features to their common length,
    # then keep at most the last 468 frames of both.
    # NOTE(review): 468 and 934 look like frame budgets of the CFM model - confirm.
    T_min = min(mel2.shape[2], fea_ref.shape[2])
    mel2 = mel2[:, :, :T_min]
    fea_ref = fea_ref[:, :, :T_min]
    if T_min > 468:
        mel2 = mel2[:, :, -468:]
        fea_ref = fea_ref[:, :, -468:]
        T_min = 468
    chunk_len = 934 - T_min

    mel2 = mel2.to(self.precision)
    fea_todo, ge = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)

    mel_chunks = []
    for start in range(0, fea_todo.shape[-1], chunk_len):
        fea_todo_chunk = fea_todo[:, :, start:start + chunk_len]
        fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
        fea_len = torch.LongTensor([fea.size(1)]).to(fea.device)

        cfm_res = self.vits_model.cfm.inference(fea, fea_len, mel2, sample_steps, inference_cfg_rate=0)
        # Drop the leading frames that correspond to the reference mel.
        cfm_res = cfm_res[:, :, mel2.shape[2]:]

        # The tail of this chunk becomes the reference for the next chunk.
        mel2 = cfm_res[:, :, -T_min:]
        fea_ref = fea_todo_chunk[:, :, -T_min:]

        mel_chunks.append(cfm_res)

    cfm_res = denorm_spec(torch.cat(mel_chunks, 2))

    with torch.inference_mode():
        wav_gen = self.bigvgan_model(cfm_res)
        audio = wav_gen[0][0]

    return audio
|
||||
|
||||
|
||||
|
||||
def v3_synthesis_batched_infer(self,
|
||||
idx_list:List[int],
|
||||
semantic_tokens_list:List[torch.Tensor],
|
||||
batch_phones:List[torch.Tensor],
|
||||
speed:float=1.0,
|
||||
sample_steps:int=32
|
||||
)->List[torch.Tensor]:
|
||||
|
||||
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
|
||||
refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
|
||||
|
||||
fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
||||
ref_audio:torch.Tensor = self.prompt_cache["raw_audio"]
|
||||
ref_sr = self.prompt_cache["raw_sr"]
|
||||
ref_audio=ref_audio.to(self.configs.device).float()
|
||||
if (ref_audio.shape[0] == 2):
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
if ref_sr!=24000:
|
||||
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
|
||||
|
||||
mel2 = mel_fn(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
mel2 = mel2[:, :, :T_min]
|
||||
fea_ref = fea_ref[:, :, :T_min]
|
||||
if (T_min > 468):
|
||||
mel2 = mel2[:, :, -468:]
|
||||
fea_ref = fea_ref[:, :, -468:]
|
||||
T_min = 468
|
||||
chunk_len = 934 - T_min
|
||||
|
||||
mel2=mel2.to(self.precision)
|
||||
|
||||
|
||||
def speed_change(input_audio:np.ndarray, speed:float, sr:int):
|
||||
# 将 NumPy 数组转换为原始 PCM 流
|
||||
raw_audio = input_audio.astype(np.int16).tobytes()
|
||||
# #### batched inference
|
||||
overlapped_len = 12
|
||||
feat_chunks = []
|
||||
feat_lens = []
|
||||
feat_list = []
|
||||
|
||||
# 设置 ffmpeg 输入流
|
||||
input_stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1)
|
||||
for i, idx in enumerate(idx_list):
|
||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||
semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||
feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
|
||||
feat_list.append(feat)
|
||||
feat_lens.append(feat.shape[2])
|
||||
|
||||
# 变速处理
|
||||
output_stream = input_stream.filter('atempo', speed)
|
||||
feats = torch.cat(feat_list, 2)
|
||||
feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0)
|
||||
pos = 0
|
||||
padding_len = 0
|
||||
while True:
|
||||
if pos ==0:
|
||||
chunk = feats_padded[:, :, pos:pos + chunk_len]
|
||||
else:
|
||||
pos = pos - overlapped_len
|
||||
chunk = feats_padded[:, :, pos:pos + chunk_len]
|
||||
pos += chunk_len
|
||||
if (chunk.shape[-1] == 0): break
|
||||
|
||||
# 输出流到管道
|
||||
out, _ = (
|
||||
output_stream.output('pipe:', format='s16le', acodec='pcm_s16le')
|
||||
.run(input=raw_audio, capture_stdout=True, capture_stderr=True)
|
||||
)
|
||||
# padding for the last chunk
|
||||
padding_len = chunk_len - chunk.shape[2]
|
||||
if padding_len != 0:
|
||||
chunk = F.pad(chunk, (0,padding_len), "constant", 0)
|
||||
feat_chunks.append(chunk)
|
||||
|
||||
|
||||
# 将管道输出解码为 NumPy 数组
|
||||
processed_audio = np.frombuffer(out, np.int16)
|
||||
|
||||
return processed_audio
|
||||
feat_chunks = torch.cat(feat_chunks, 0)
|
||||
bs = feat_chunks.shape[0]
|
||||
fea_ref = fea_ref.repeat(bs,1,1)
|
||||
fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
|
||||
pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
||||
pred_spec = pred_spec[:, :, -chunk_len:]
|
||||
dd = pred_spec.shape[1]
|
||||
pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
|
||||
# pred_spec = pred_spec[..., :-padding_len]
|
||||
|
||||
|
||||
pred_spec = denorm_spec(pred_spec)
|
||||
|
||||
with torch.no_grad():
|
||||
wav_gen = self.bigvgan_model(pred_spec)
|
||||
audio = wav_gen[0][0]#.cpu().detach().numpy()
|
||||
|
||||
|
||||
audio_fragments = []
|
||||
upsample_rate = 256
|
||||
pos = 0
|
||||
|
||||
while pos < audio.shape[-1]:
|
||||
audio_fragment = audio[pos:pos+chunk_len*upsample_rate]
|
||||
audio_fragments.append(audio_fragment)
|
||||
pos += chunk_len*upsample_rate
|
||||
|
||||
audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate)
|
||||
audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate]
|
||||
|
||||
audio_fragments = []
|
||||
for feat_len in feat_lens:
|
||||
audio_fragment = audio[:feat_len*upsample_rate]
|
||||
audio_fragments.append(audio_fragment)
|
||||
audio = audio[feat_len*upsample_rate:]
|
||||
|
||||
|
||||
return audio_fragments
|
||||
|
||||
|
||||
|
||||
def sola_algorithm(self,
                   audio_fragments:List[torch.Tensor],
                   overlap_len:int,
                   ):
    """Join consecutive audio fragments with an aligned Hann cross-fade.

    For each neighbouring pair, the last ``overlap_len`` samples of the first
    fragment are correlated (via ``F.conv1d``) with the first ``overlap_len``
    samples of the second one. The argmax ``idx`` of that correlation decides
    how many samples are dropped from the head of the second fragment and how
    long the cross-fade is (``overlap_len - idx`` samples).

    The caller's list and tensors are left unmodified: the list is copied and
    the cross-fade is written into a clone of the second fragment's tail.

    Args:
        audio_fragments: 1-D tensors in playback order; every fragment that
            takes part in a join must hold at least ``overlap_len`` samples.
        overlap_len: Number of samples shared by neighbouring fragments.

    Returns:
        A single 1-D tensor; its length is the sum of the fragment lengths
        minus ``overlap_len`` for every join.
    """
    fragments = list(audio_fragments)

    for i in range(len(fragments) - 1):
        f1 = fragments[i]
        f2 = fragments[i + 1]
        w1 = f1[-overlap_len:]
        w2 = f2[:overlap_len]
        assert w1.shape == w2.shape
        corr = F.conv1d(w1.view(1, 1, -1), w2.view(1, 1, -1), padding=w2.shape[-1] // 2).view(-1)[:-1]
        # Plain int: used for slicing and as the window length below.
        idx = int(corr.argmax())
        fade_len = overlap_len - idx

        fragments[i] = f1[:-fade_len]

        # Clone before the in-place cross-fade so the input tensor is not altered.
        f2_ = f2[idx:].clone()
        window = torch.hann_window(fade_len * 2, device=f1.device, dtype=f1.dtype)
        f2_[:fade_len] = window[:fade_len] * f2_[:fade_len] + window[fade_len:] * f1[-fade_len:]
        fragments[i + 1] = f2_

    return torch.cat(fragments, 0)
|
||||
|
||||
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
|
||||
import os, sys
|
||||
import threading
|
||||
|
||||
from tqdm import tqdm
|
||||
now_dir = os.getcwd()
|
||||
@ -54,6 +55,7 @@ class TextPreprocessor:
|
||||
self.bert_model = bert_model
|
||||
self.tokenizer = tokenizer
|
||||
self.device = device
|
||||
self.bert_lock = threading.RLock()
|
||||
|
||||
def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v2")->List[Dict]:
|
||||
print(f'############ {i18n("切分文本")} ############')
|
||||
@ -117,70 +119,71 @@ class TextPreprocessor:
|
||||
return self.get_phones_and_bert(text, language, version)
|
||||
|
||||
def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False):
    """Turn text into phoneme ids, BERT features and normalized text.

    The whole body runs while holding ``self.bert_lock``; the recursive calls
    below re-enter it from the same thread.

    Args:
        text: Input text.
        language: One of "en", "all_zh", "all_ja", "all_ko", "all_yue"
            (whole text is treated as one language) or "zh", "ja", "ko",
            "yue", "auto", "auto_yue" (text is split with ``LangSegmenter``).
        version: Version string forwarded to ``clean_text_inf``.
        final: True on the single retry that prepends "." to very short text.

    Returns:
        Tuple ``(phones, bert, norm_text)``. In the single-language branch
        without Chinese BERT, ``bert`` is a zero tensor of shape
        ``(1024, len(phones))``; otherwise it comes from
        ``get_bert_feature`` / ``get_bert_inf`` (concatenated along dim 1
        for segmented text).
    """
    with self.bert_lock:
        if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
            formattext = text
            # Collapse runs of spaces into one; the loop ends once no double space is left.
            while "  " in formattext:
                formattext = formattext.replace("  ", " ")
            if language == "all_zh":
                if re.search(r'[A-Za-z]', formattext):
                    # Latin letters inside Chinese text: upper-case, normalize and
                    # reprocess through the segmented "zh" path.
                    formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
                    formattext = chinese.mix_text_normalize(formattext)
                    return self.get_phones_and_bert(formattext,"zh",version)
                else:
                    phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
                    bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
            elif language == "all_yue" and re.search(r'[A-Za-z]', formattext):
                formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
                formattext = chinese.mix_text_normalize(formattext)
                return self.get_phones_and_bert(formattext,"yue",version)
            else:
                phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
                bert = torch.zeros(
                    (1024, len(phones)),
                    dtype=torch.float32,
                ).to(self.device)
        elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
            textlist = []
            langlist = []
            for tmp in LangSegmenter.getTexts(text):
                if language == "auto":
                    seg_lang = tmp["lang"]
                elif language == "auto_yue":
                    if tmp["lang"] == "zh":
                        tmp["lang"] = "yue"
                    seg_lang = tmp["lang"]
                elif tmp["lang"] == "en":
                    seg_lang = tmp["lang"]
                else:
                    # Chinese/Japanese/Korean Han characters cannot be told apart,
                    # so the language given by the user is used.
                    seg_lang = language
                langlist.append(seg_lang)
                textlist.append(tmp["text"])

            phones_list = []
            bert_list = []
            norm_text_list = []
            for seg_text, lang in zip(textlist, langlist):
                phones, word2ph, norm_text = self.clean_text_inf(seg_text, lang, version)
                bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
                phones_list.append(phones)
                norm_text_list.append(norm_text)
                bert_list.append(bert)
            bert = torch.cat(bert_list, dim=1)
            phones = sum(phones_list, [])
            norm_text = ''.join(norm_text_list)

        # Fewer than 6 phones: retry once with a leading "." (final=True stops further retries).
        if not final and len(phones) < 6:
            return self.get_phones_and_bert("." + text,language,version,final=True)

        return phones, bert, norm_text
|
||||
|
||||
|
||||
def get_bert_feature(self, text:str, word2ph:list)->torch.Tensor:
|
||||
@ -199,6 +202,7 @@ class TextPreprocessor:
|
||||
return phone_level_feature.T
|
||||
|
||||
def clean_text_inf(self, text: str, language: str, version: str = "v2"):
    """Clean ``text`` and turn its phonemes into a symbol sequence.

    Every ``"all_"`` substring is removed from ``language`` before it is
    handed to ``clean_text`` (so ``"all_zh"`` is processed as ``"zh"``).

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple, where ``phones`` is the
        output of ``cleaned_text_to_sequence`` and the other two items are
        passed through unchanged from ``clean_text``.
    """
    lang_code = language.replace("all_", "")
    raw_phones, word2ph, norm_text = clean_text(text, lang_code, version)
    return cleaned_text_to_sequence(raw_phones, version), word2ph, norm_text
|
||||
|
@ -6,7 +6,7 @@ custom:
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
|
||||
version: v2
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
|
||||
default:
|
||||
v1:
|
||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
device: cpu
|
||||
@ -14,7 +14,7 @@ default:
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
|
||||
version: v1
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
|
||||
default_v2:
|
||||
v2:
|
||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
device: cpu
|
||||
@ -22,3 +22,11 @@ default_v2:
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
|
||||
version: v2
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
|
||||
v3:
|
||||
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
|
||||
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
|
||||
device: cpu
|
||||
is_half: false
|
||||
t2s_weights_path: GPT_SoVITS/pretrained_models/s1v3.ckpt
|
||||
version: v3
|
||||
vits_weights_path: GPT_SoVITS/pretrained_models/s2Gv3.pth
|
||||
|
@ -427,7 +427,7 @@ class T2SModel(nn.Module):
|
||||
self.top_k = int(raw_t2s.config["inference"]["top_k"])
|
||||
self.early_stop_num = torch.LongTensor([self.hz * self.max_sec])
|
||||
|
||||
def forward(self,prompts:LongTensor, ref_seq:LongTensor, text_seq:LongTensor, ref_bert:torch.Tensor, text_bert:torch.Tensor):
|
||||
def forward(self,prompts:LongTensor, ref_seq:LongTensor, text_seq:LongTensor, ref_bert:torch.Tensor, text_bert:torch.Tensor,top_k:LongTensor):
|
||||
bert = torch.cat([ref_bert.T, text_bert.T], 1)
|
||||
all_phoneme_ids = torch.cat([ref_seq, text_seq], 1)
|
||||
bert = bert.unsqueeze(0)
|
||||
@ -472,12 +472,13 @@ class T2SModel(nn.Module):
|
||||
.to(device=x.device, dtype=torch.bool)
|
||||
|
||||
idx = 0
|
||||
top_k = int(top_k)
|
||||
|
||||
xy_dec, k_cache, v_cache = self.t2s_transformer.process_prompt(xy_pos, xy_attn_mask, None)
|
||||
|
||||
logits = self.ar_predict_layer(xy_dec[:, -1])
|
||||
logits = logits[:, :-1]
|
||||
samples = sample(logits, y, top_k=self.top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0]
|
||||
samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0]
|
||||
y = torch.concat([y, samples], dim=1)
|
||||
y_emb = self.ar_audio_embedding(y[:, -1:])
|
||||
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device)
|
||||
@ -493,7 +494,7 @@ class T2SModel(nn.Module):
|
||||
if(idx<11):###至少预测出10个token不然不给停止(0.4s)
|
||||
logits = logits[:, :-1]
|
||||
|
||||
samples = sample(logits, y, top_k=self.top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0]
|
||||
samples = sample(logits, y, top_k=top_k, top_p=1, repetition_penalty=1.35, temperature=1.0)[0]
|
||||
|
||||
y = torch.concat([y, samples], dim=1)
|
||||
|
||||
@ -653,6 +654,8 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
|
||||
torch._dynamo.mark_dynamic(ref_bert, 0)
|
||||
torch._dynamo.mark_dynamic(text_bert, 0)
|
||||
|
||||
top_k = torch.LongTensor([5]).to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
gpt_sovits_export = torch.jit.trace(
|
||||
gpt_sovits,
|
||||
@ -662,7 +665,8 @@ def export(gpt_path, vits_path, ref_audio_path, ref_text, output_path, export_be
|
||||
ref_seq,
|
||||
text_seq,
|
||||
ref_bert,
|
||||
text_bert))
|
||||
text_bert,
|
||||
top_k))
|
||||
|
||||
gpt_sovits_path = os.path.join(output_path, "gpt_sovits_model.pt")
|
||||
gpt_sovits_export.save(gpt_sovits_path)
|
||||
@ -684,15 +688,26 @@ class GPT_SoVITS(nn.Module):
|
||||
self.t2s = t2s
|
||||
self.vits = vits
|
||||
|
||||
def forward(self, ssl_content:torch.Tensor, ref_audio_sr:torch.Tensor, ref_seq:Tensor, text_seq:Tensor, ref_bert:Tensor, text_bert:Tensor, speed=1.0):
|
||||
def forward(
|
||||
self,
|
||||
ssl_content: torch.Tensor,
|
||||
ref_audio_sr: torch.Tensor,
|
||||
ref_seq: Tensor,
|
||||
text_seq: Tensor,
|
||||
ref_bert: Tensor,
|
||||
text_bert: Tensor,
|
||||
top_k: LongTensor,
|
||||
speed=1.0,
|
||||
):
|
||||
codes = self.vits.vq_model.extract_latent(ssl_content)
|
||||
prompt_semantic = codes[0, 0]
|
||||
prompts = prompt_semantic.unsqueeze(0)
|
||||
|
||||
pred_semantic = self.t2s(prompts, ref_seq, text_seq, ref_bert, text_bert)
|
||||
pred_semantic = self.t2s(prompts, ref_seq, text_seq, ref_bert, text_bert, top_k)
|
||||
audio = self.vits(text_seq, pred_semantic, ref_audio_sr, speed)
|
||||
return audio
|
||||
|
||||
|
||||
def test():
|
||||
parser = argparse.ArgumentParser(description="GPT-SoVITS Command Line Tool")
|
||||
parser.add_argument('--gpt_model', required=True, help="Path to the GPT model file")
|
||||
@ -784,8 +799,10 @@ def test():
|
||||
print('text_bert:',text_bert.shape)
|
||||
text_bert=text_bert.to('cuda')
|
||||
|
||||
top_k = torch.LongTensor([5]).to('cuda')
|
||||
|
||||
with torch.no_grad():
|
||||
audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert)
|
||||
audio = gpt_sovits(ssl_content, ref_audio_sr, ref_seq, text_seq, ref_bert, test_bert, top_k)
|
||||
print('start write wav')
|
||||
soundfile.write("out.wav", audio.detach().cpu().numpy(), 32000)
|
||||
|
||||
|
1045
GPT_SoVITS/export_torch_script_v3.py
Normal file
1045
GPT_SoVITS/export_torch_script_v3.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -138,7 +138,7 @@ class DiT(nn.Module):
|
||||
time: float["b"] | float[""], # time step # noqa: F821 F722
|
||||
dt_base_bootstrap,
|
||||
text0, # : int["b nt"] # noqa: F722#####condition feature
|
||||
use_grad_ckpt, # bool
|
||||
use_grad_ckpt=False, # bool
|
||||
###no-use
|
||||
drop_audio_cond=False, # cfg for cond audio
|
||||
drop_text=False, # cfg for text
|
||||
|
@ -238,7 +238,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
else:
|
||||
visible_sample_steps=False
|
||||
visible_inp_refs=True
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
|
||||
|
||||
dict_s2 = load_sovits_new(sovits_path)
|
||||
hps = dict_s2["config"]
|
||||
@ -294,6 +294,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
# torch.save(vq_model.state_dict(),"merge_win.pth")
|
||||
vq_model.eval()
|
||||
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
|
||||
with open("./weight.json")as f:
|
||||
data=f.read()
|
||||
data=json.loads(data)
|
||||
@ -691,7 +692,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
wav_gen = bigvgan_model(cmf_res)
|
||||
audio=wav_gen[0][0]#.cpu().detach().numpy()
|
||||
max_audio=torch.abs(audio).max()#简单防止16bit爆音
|
||||
if max_audio>1:audio/=max_audio
|
||||
if max_audio>1:audio=audio/max_audio
|
||||
audio_opt.append(audio)
|
||||
audio_opt.append(zero_wav_torch)#zero_wav
|
||||
t4 = ttime()
|
||||
@ -704,7 +705,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
print(i18n("音频超分中"))
|
||||
audio_opt,sr=audio_sr(audio_opt.unsqueeze(0),sr)
|
||||
max_audio=np.abs(audio_opt).max()
|
||||
if max_audio > 1: audio /= max_audio
|
||||
if max_audio > 1: audio_opt /= max_audio
|
||||
else:
|
||||
audio_opt=audio_opt.cpu().detach().numpy()
|
||||
yield sr, (audio_opt * 32767).astype(np.int16)
|
||||
@ -877,7 +878,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
with gr.Row():
|
||||
inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath", scale=13)
|
||||
with gr.Column(scale=13):
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式,使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式,使用了会报错。"), value=False, interactive=True if model_version!="v3"else False, show_label=True,scale=1)
|
||||
gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")))
|
||||
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
|
||||
with gr.Column(scale=14):
|
||||
@ -915,7 +916,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
# phoneme=gr.Textbox(label=i18n("音素框"), value="")
|
||||
# get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
|
||||
with gr.Row():
|
||||
inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
|
||||
inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size='lg', scale=25)
|
||||
output = gr.Audio(label=i18n("输出的语音"), scale=14)
|
||||
|
||||
inference_button.click(
|
||||
@ -923,7 +924,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
[inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider],
|
||||
[output],
|
||||
)
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox])
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox,inference_button])
|
||||
GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
|
||||
|
||||
# gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
|
||||
|
@ -7,7 +7,7 @@
|
||||
全部按日文识别
|
||||
'''
|
||||
import random
|
||||
import os, re, logging
|
||||
import os, re, logging, json
|
||||
import sys
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
@ -41,12 +41,13 @@ gpt_path = os.environ.get("gpt_path", None)
|
||||
sovits_path = os.environ.get("sovits_path", None)
|
||||
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
|
||||
bert_path = os.environ.get("bert_path", None)
|
||||
version=os.environ.get("version","v2")
|
||||
version=model_version=os.environ.get("version","v2")
|
||||
|
||||
import gradio as gr
|
||||
from TTS_infer_pack.TTS import TTS, TTS_Config
|
||||
from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR
|
||||
from TTS_infer_pack.text_segmentation_method import get_method
|
||||
from tools.i18n.i18n import I18nAuto, scan_language_list
|
||||
from inference_webui import DictToAttrRecursive
|
||||
|
||||
language=os.environ.get("language","Auto")
|
||||
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
||||
@ -62,6 +63,9 @@ if torch.cuda.is_available():
|
||||
else:
|
||||
device = "cpu"
|
||||
|
||||
# is_half = False
|
||||
# device = "cpu"
|
||||
|
||||
dict_language_v1 = {
|
||||
i18n("中文"): "all_zh",#全部按中文识别
|
||||
i18n("英文"): "en",#全部按英文识别#######不变
|
||||
@ -123,11 +127,11 @@ def inference(text, text_lang,
|
||||
speed_factor, ref_text_free,
|
||||
split_bucket,fragment_interval,
|
||||
seed, keep_random, parallel_infer,
|
||||
repetition_penalty
|
||||
repetition_penalty, sample_steps, super_sampling,
|
||||
):
|
||||
|
||||
seed = -1 if keep_random else seed
|
||||
actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
|
||||
actual_seed = seed if seed not in [-1, "", None] else random.randint(0, 2**32 - 1)
|
||||
inputs={
|
||||
"text": text,
|
||||
"text_lang": dict_language[text_lang],
|
||||
@ -147,9 +151,14 @@ def inference(text, text_lang,
|
||||
"seed":actual_seed,
|
||||
"parallel_infer": parallel_infer,
|
||||
"repetition_penalty": repetition_penalty,
|
||||
"sample_steps": int(sample_steps),
|
||||
"super_sampling": super_sampling,
|
||||
}
|
||||
for item in tts_pipeline.run(inputs):
|
||||
yield item, actual_seed
|
||||
try:
|
||||
for item in tts_pipeline.run(inputs):
|
||||
yield item, actual_seed
|
||||
except NO_PROMPT_ERROR:
|
||||
gr.Warning(i18n('V3不支持无参考文本模式,请填写参考文本!'))
|
||||
|
||||
def custom_sort_key(s):
|
||||
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
||||
@ -163,19 +172,38 @@ def change_choices():
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
|
||||
|
||||
path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||
pretrained_sovits_name=["GPT_SoVITS/pretrained_models/s2G488k.pth", "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",path_sovits_v3]
|
||||
pretrained_gpt_name=["GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt","GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1v3.ckpt"]
|
||||
|
||||
pretrained_sovits_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth", "GPT_SoVITS/pretrained_models/s2G488k.pth"]
|
||||
pretrained_gpt_name=["GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt", "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"]
|
||||
_ =[[],[]]
|
||||
for i in range(2):
|
||||
if os.path.exists(pretrained_gpt_name[i]):
|
||||
_[0].append(pretrained_gpt_name[i])
|
||||
if os.path.exists(pretrained_sovits_name[i]):
|
||||
_[-1].append(pretrained_sovits_name[i])
|
||||
for i in range(3):
|
||||
if os.path.exists(pretrained_gpt_name[i]):_[0].append(pretrained_gpt_name[i])
|
||||
if os.path.exists(pretrained_sovits_name[i]):_[-1].append(pretrained_sovits_name[i])
|
||||
pretrained_gpt_name,pretrained_sovits_name = _
|
||||
|
||||
SoVITS_weight_root=["SoVITS_weights_v2","SoVITS_weights"]
|
||||
GPT_weight_root=["GPT_weights_v2","GPT_weights"]
|
||||
|
||||
if os.path.exists(f"./weight.json"):
|
||||
pass
|
||||
else:
|
||||
with open(f"./weight.json", 'w', encoding="utf-8") as file:json.dump({'GPT':{},'SoVITS':{}},file)
|
||||
|
||||
with open(f"./weight.json", 'r', encoding="utf-8") as file:
|
||||
weight_data = file.read()
|
||||
weight_data=json.loads(weight_data)
|
||||
gpt_path = os.environ.get(
|
||||
"gpt_path", weight_data.get('GPT',{}).get(version,pretrained_gpt_name))
|
||||
sovits_path = os.environ.get(
|
||||
"sovits_path", weight_data.get('SoVITS',{}).get(version,pretrained_sovits_name))
|
||||
if isinstance(gpt_path,list):
|
||||
gpt_path = gpt_path[0]
|
||||
if isinstance(sovits_path,list):
|
||||
sovits_path = sovits_path[0]
|
||||
|
||||
|
||||
|
||||
SoVITS_weight_root=["SoVITS_weights","SoVITS_weights_v2","SoVITS_weights_v3"]
|
||||
GPT_weight_root=["GPT_weights","GPT_weights_v2","GPT_weights_v3"]
|
||||
for path in SoVITS_weight_root+GPT_weight_root:
|
||||
os.makedirs(path,exist_ok=True)
|
||||
|
||||
@ -194,11 +222,16 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
|
||||
SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)
|
||||
|
||||
|
||||
|
||||
from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new
|
||||
def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
tts_pipeline.init_vits_weights(sovits_path)
|
||||
global version, dict_language
|
||||
dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2
|
||||
global version, model_version, dict_language,if_lora_v3
|
||||
version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path)
|
||||
# print(sovits_path,version, model_version, if_lora_v3)
|
||||
if if_lora_v3 and not os.path.exists(path_sovits_v3):
|
||||
info= path_sovits_v3 + i18n("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
|
||||
gr.Warning(info)
|
||||
raise FileExistsError(info)
|
||||
dict_language = dict_language_v1 if version =='v1' else dict_language_v2
|
||||
if prompt_language is not None and text_language is not None:
|
||||
if prompt_language in list(dict_language.keys()):
|
||||
prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
|
||||
@ -210,9 +243,22 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
|
||||
else:
|
||||
text_update = {'__type__':'update', 'value':''}
|
||||
text_language_update = {'__type__':'update', 'value':i18n("中文")}
|
||||
return {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update
|
||||
|
||||
if model_version=="v3":
|
||||
visible_sample_steps=True
|
||||
visible_inp_refs=False
|
||||
else:
|
||||
visible_sample_steps=False
|
||||
visible_inp_refs=True
|
||||
#prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("模型加载中,请等待"),"interactive":False}
|
||||
|
||||
tts_pipeline.init_vits_weights(sovits_path)
|
||||
yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
|
||||
with open("./weight.json")as f:
|
||||
data=f.read()
|
||||
data=json.loads(data)
|
||||
data["SoVITS"][version]=sovits_path
|
||||
with open("./weight.json","w")as f:f.write(json.dumps(data))
|
||||
|
||||
with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
gr.Markdown(
|
||||
@ -234,14 +280,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
gr.Markdown(value=i18n("*请上传并填写参考信息"))
|
||||
with gr.Row():
|
||||
inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频,超过会报错!)"), type="filepath")
|
||||
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple")
|
||||
inp_refs = gr.File(label=i18n("辅参考音频(可选多个,或不选)"),file_count="multiple", visible=True if model_version!="v3"else False)
|
||||
prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
|
||||
with gr.Row():
|
||||
prompt_language = gr.Dropdown(
|
||||
label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
|
||||
)
|
||||
with gr.Column():
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
|
||||
ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True if model_version!="v3"else False, show_label=True)
|
||||
gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))
|
||||
|
||||
with gr.Column():
|
||||
@ -257,13 +303,19 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
with gr.Row():
|
||||
|
||||
with gr.Column():
|
||||
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True)
|
||||
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True)
|
||||
speed_factor = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label="speed_factor",value=1.0,interactive=True)
|
||||
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
|
||||
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
|
||||
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
|
||||
repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True)
|
||||
with gr.Row():
|
||||
batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True)
|
||||
sample_steps = gr.Radio(label=i18n("采样步数(仅对V3生效)"),value=32,choices=[4,8,16,32],visible=True)
|
||||
with gr.Row():
|
||||
fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True)
|
||||
speed_factor = gr.Slider(minimum=0.6,maximum=1.65,step=0.05,label="语速",value=1.0,interactive=True)
|
||||
with gr.Row():
|
||||
top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
|
||||
top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
|
||||
with gr.Row():
|
||||
temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
|
||||
repetition_penalty = gr.Slider(minimum=0,maximum=2,step=0.05,label=i18n("重复惩罚"),value=1.35,interactive=True)
|
||||
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
how_to_cut = gr.Dropdown(
|
||||
@ -272,10 +324,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
value=i18n("凑四句一切"),
|
||||
interactive=True, scale=1
|
||||
)
|
||||
super_sampling = gr.Checkbox(label=i18n("音频超采样(仅对V3生效))"), value=False, interactive=True, show_label=True)
|
||||
|
||||
with gr.Row():
|
||||
parallel_infer = gr.Checkbox(label=i18n("并行推理"), value=True, interactive=True, show_label=True)
|
||||
split_bucket = gr.Checkbox(label=i18n("数据分桶(并行推理时会降低一点计算量)"), value=True, interactive=True, show_label=True)
|
||||
|
||||
with gr.Row():
|
||||
|
||||
seed = gr.Number(label=i18n("随机种子"),value=-1)
|
||||
keep_random = gr.Checkbox(label=i18n("保持随机"), value=True, interactive=True, show_label=True)
|
||||
|
||||
@ -295,12 +351,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
||||
speed_factor, ref_text_free,
|
||||
split_bucket,fragment_interval,
|
||||
seed, keep_random, parallel_infer,
|
||||
repetition_penalty
|
||||
repetition_penalty, sample_steps, super_sampling,
|
||||
],
|
||||
[output, seed],
|
||||
)
|
||||
stop_infer.click(tts_pipeline.stop, [], [])
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language])
|
||||
SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,inference_button])#
|
||||
GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])
|
||||
|
||||
with gr.Group():
|
||||
|
@ -1162,6 +1162,7 @@ class SynthesizerTrnV3(nn.Module):
|
||||
use_sdp=True,
|
||||
semantic_frame_rate=None,
|
||||
freeze_quantizer=None,
|
||||
version="v3",
|
||||
**kwargs):
|
||||
|
||||
super().__init__()
|
||||
@ -1182,6 +1183,7 @@ class SynthesizerTrnV3(nn.Module):
|
||||
self.segment_size = segment_size
|
||||
self.n_speakers = n_speakers
|
||||
self.gin_channels = gin_channels
|
||||
self.version = version
|
||||
|
||||
self.model_dim=512
|
||||
self.use_sdp = use_sdp
|
||||
|
@ -9,6 +9,8 @@ from module import commons
|
||||
from module import modules
|
||||
from module import attentions_onnx as attentions
|
||||
|
||||
from f5_tts.model import DiT
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from module.commons import init_weights, get_padding
|
||||
@ -342,6 +344,37 @@ class PosteriorEncoder(nn.Module):
|
||||
return z, m, logs, x_mask
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Masked encoder: 1x1 conv in, a ``modules.WN`` stack, 1x1 conv out.

    Args:
        in_channels: channels of the input fed to the first 1x1 convolution.
        out_channels: channels produced by the final 1x1 convolution.
        hidden_channels: channel width used inside the ``WN`` stack.
        kernel_size: kernel size forwarded to ``modules.WN``.
        dilation_rate: dilation rate forwarded to ``modules.WN``.
        n_layers: number of layers forwarded to ``modules.WN``.
        gin_channels: conditioning channels forwarded to ``modules.WN``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, x, x_lengths, g=None):
        """Encode ``x`` and return ``(stats, x_mask)``.

        Args:
            x: input tensor; ``x.size(2)`` is used as the mask length.
            x_lengths: per-item valid lengths passed to
                ``commons.sequence_mask``.
            g: optional conditioning tensor; it is detached before use, so
                no gradient flows back into it from this encoder.

        Returns:
            ``stats``: the projected features multiplied by the mask.
            ``x_mask``: the mask, unsqueezed at dim 1 and cast to ``x.dtype``.
        """
        # Identity check: `g != None` would go through Tensor.__ne__.
        if g is not None:
            g = g.detach()
        x_mask = torch.unsqueeze(
            commons.sequence_mask(x_lengths, x.size(2)), 1
        ).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        return stats, x_mask
|
||||
|
||||
class WNEncoder(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
@ -916,4 +949,175 @@ class SynthesizerTrn(nn.Module):
|
||||
def extract_latent(self, x):
|
||||
ssl = self.ssl_proj(x)
|
||||
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
|
||||
return codes.transpose(0, 1)
|
||||
return codes.transpose(0, 1)
|
||||
|
||||
class CFM(torch.nn.Module):
    """Fixed-step sampler driven by the velocity estimator ``dit``.

    ``forward`` starts from Gaussian noise and takes ``n_timesteps`` equal
    steps of size ``1 / n_timesteps`` starting at ``t = 0``; each step adds
    ``step * estimator(...)`` to the current sample.
    """

    def __init__(self, in_channels, dit):
        super().__init__()
        self.in_channels = in_channels
        self.estimator = dit

    def forward(
        self,
        mu: torch.Tensor,
        x_lens: torch.LongTensor,
        prompt: torch.Tensor,
        n_timesteps: torch.LongTensor,
        temperature: float = 1.0,
    ):
        """Generate a sample conditioned on ``mu`` and ``prompt``.

        Args:
            mu: conditioning features; dims 0 and 1 are read as batch size
                and sequence length, and dims 1/2 are swapped before the
                tensor is handed to the estimator.
            x_lens: forwarded unchanged to the estimator.
            prompt: prompt features; its last-dim size is the prompt length.
            n_timesteps: number of integration steps (converted with ``int``).
            temperature: accepted for interface compatibility; not used.

        Returns:
            Tensor of shape ``(batch, in_channels, length)`` whose first
            ``prompt_len`` positions along the last dim are zero.
        """
        batch, frames = mu.size(0), mu.size(1)
        sample = torch.randn(
            [batch, self.in_channels, frames], device=mu.device, dtype=mu.dtype
        )

        step_count = int(n_timesteps)

        prompt_len = prompt.size(-1)
        prompt_x = torch.zeros_like(sample, dtype=mu.dtype)
        prompt_x[..., :prompt_len] = prompt[..., :prompt_len]
        # The prompt region carries no noise; it is supplied through prompt_x.
        sample[..., :prompt_len] = 0.0

        cond = mu.transpose(2, 1)
        t = torch.tensor(0.0, dtype=sample.dtype, device=sample.device)
        step = torch.tensor(
            1.0 / step_count, dtype=sample.dtype, device=sample.device
        )
        d_tensor = (
            torch.ones(sample.shape[0], device=sample.device, dtype=mu.dtype) * step
        )

        for _ in range(step_count):
            t_tensor = (
                torch.ones(sample.shape[0], device=sample.device, dtype=mu.dtype) * t
            )
            velocity = self.estimator(
                sample, prompt_x, x_lens, t_tensor, d_tensor, cond
            ).transpose(2, 1)
            sample = sample + step * velocity
            t = t + step
            # Re-zero the prompt region after every update.
            sample[:, :, :prompt_len] = 0.0
        return sample
|
||||
|
||||
|
||||
def set_no_grad(net_g):
    """Turn off ``requires_grad`` on every parameter of ``net_g`` in place."""
    for param in net_g.parameters():
        param.requires_grad_(False)
|
||||
|
||||
@torch.jit.script_if_tracing
def compile_codes_length(codes):
    """Return ``codes.size(2) * 2.5 * 1.5`` as a one-element tensor on ``codes.device``."""
    code_frames = torch.LongTensor([codes.size(2)]).to(codes.device)
    return code_frames * 2.5 * 1.5
|
||||
|
||||
@torch.jit.script_if_tracing
def compile_ref_length(refer):
    """Return ``refer.size(2)`` as a one-element ``LongTensor`` on ``refer.device``."""
    ref_frames = torch.LongTensor([refer.size(2)]).to(refer.device)
    return ref_frames
|
||||
|
||||
class SynthesizerTrnV3(nn.Module):
|
||||
"""
|
||||
Synthesizer for Training
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
spec_channels,
|
||||
segment_size,
|
||||
inter_channels,
|
||||
hidden_channels,
|
||||
filter_channels,
|
||||
n_heads,
|
||||
n_layers,
|
||||
kernel_size,
|
||||
p_dropout,
|
||||
resblock,
|
||||
resblock_kernel_sizes,
|
||||
resblock_dilation_sizes,
|
||||
upsample_rates,
|
||||
upsample_initial_channel,
|
||||
upsample_kernel_sizes,
|
||||
n_speakers=0,
|
||||
gin_channels=0,
|
||||
use_sdp=True,
|
||||
semantic_frame_rate=None,
|
||||
freeze_quantizer=None,
|
||||
version="v3",
|
||||
**kwargs):
|
||||
|
||||
super().__init__()
|
||||
self.spec_channels = spec_channels
|
||||
self.inter_channels = inter_channels
|
||||
self.hidden_channels = hidden_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.kernel_size = kernel_size
|
||||
self.p_dropout = p_dropout
|
||||
self.resblock = resblock
|
||||
self.resblock_kernel_sizes = resblock_kernel_sizes
|
||||
self.resblock_dilation_sizes = resblock_dilation_sizes
|
||||
self.upsample_rates = upsample_rates
|
||||
self.upsample_initial_channel = upsample_initial_channel
|
||||
self.upsample_kernel_sizes = upsample_kernel_sizes
|
||||
self.segment_size = segment_size
|
||||
self.n_speakers = n_speakers
|
||||
self.gin_channels = gin_channels
|
||||
self.version = version
|
||||
|
||||
self.model_dim=512
|
||||
self.use_sdp = use_sdp
|
||||
self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout)
|
||||
# self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback
|
||||
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback
|
||||
# self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
|
||||
# upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
|
||||
# self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
|
||||
# gin_channels=gin_channels)
|
||||
# self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
|
||||
|
||||
|
||||
ssl_dim = 768
|
||||
assert semantic_frame_rate in ['25hz', "50hz"]
|
||||
self.semantic_frame_rate = semantic_frame_rate
|
||||
if semantic_frame_rate == '25hz':
|
||||
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
|
||||
else:
|
||||
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
|
||||
|
||||
self.quantizer = ResidualVectorQuantizer(
|
||||
dimension=ssl_dim,
|
||||
n_q=1,
|
||||
bins=1024
|
||||
)
|
||||
freeze_quantizer
|
||||
inter_channels2=512
|
||||
self.bridge=nn.Sequential(
|
||||
nn.Conv1d(inter_channels, inter_channels2, 1, stride=1),
|
||||
nn.LeakyReLU()
|
||||
)
|
||||
self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels)
|
||||
self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1)
|
||||
self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim
|
||||
if freeze_quantizer==True:
|
||||
set_no_grad(self.ssl_proj)
|
||||
set_no_grad(self.quantizer)
|
||||
set_no_grad(self.enc_p)
|
||||
|
||||
def create_ge(self, refer):
    """Encode a reference spectrogram into the conditioning vector ``ge``.

    A length mask is derived from ``refer`` and applied to its first 704
    channels (the channel count ``self.ref_enc`` is built with) before the
    masked input and the mask are handed to the reference encoder.
    """
    lengths = compile_ref_length(refer)
    # Mask over the last axis of `refer`, with a singleton axis inserted at
    # dim 1 and cast to the input dtype so it can be multiplied in.
    valid = commons.sequence_mask(lengths, refer.size(2))
    mask = valid.unsqueeze(1).to(refer.dtype)
    return self.ref_enc(refer[:, :704] * mask, mask)
|
||||
|
||||
def forward(self, codes, text, ge, speed=1):
    """Turn semantic codes plus text into the conditioning features.

    Steps: decode ``codes`` with the quantizer, upsample x2 when the model
    runs at the '25hz' semantic frame rate, run the text encoder, project
    through ``self.bridge``, stretch the time axis by 1.875 and refine the
    result with ``self.wns1``. Only the refined features are returned.
    """
    target_lengths = compile_codes_length(codes)

    semantic = self.quantizer.decode(codes)
    if self.semantic_frame_rate == '25hz':
        # NOTE(review): original tag "BCT" presumably means a
        # (batch, channel, time) layout -- confirm.
        semantic = F.interpolate(semantic, scale_factor=2, mode="nearest")

    # The text encoder returns four values; only the first is used here.
    hidden, _, _, _ = self.enc_p(semantic, text, ge, speed)

    features = self.bridge(hidden)
    # NOTE(review): 1.875 presumably converts between two frame rates -- confirm.
    features = F.interpolate(features, scale_factor=1.875, mode="nearest")

    # Extra WN-style encoder parameters to learn the mel representation.
    features, _ = self.wns1(features, target_lengths, ge)
    return features
|
||||
|
||||
def extract_latent(self, x):
    """Project ``x`` with ``self.ssl_proj``, quantize it, and return the codes.

    The quantizer returns four values; only the codes are kept, with their
    first two axes swapped.
    """
    projected = self.ssl_proj(x)
    _, codes, _, _ = self.quantizer(projected)
    return codes.transpose(0, 1)
|
@ -81,7 +81,7 @@ if os.path.exists(semantic_path) == False:
|
||||
# utils.load_checkpoint(pretrained_s2G, vq_model, None, True)
|
||||
print(
|
||||
vq_model.load_state_dict(
|
||||
torch.load(pretrained_s2G, map_location="cpu")["weight"], strict=False
|
||||
torch.load(pretrained_s2G, map_location="cpu", weights_only=False)["weight"], strict=False
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -429,26 +429,25 @@ def train_and_evaluate(
|
||||
# scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
|
||||
# scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
|
||||
# scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
|
||||
image_dict = {
|
||||
"slice/mel_org": utils.plot_spectrogram_to_numpy(
|
||||
y_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
|
||||
y_hat_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/mel": utils.plot_spectrogram_to_numpy(
|
||||
mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/stats_ssl": utils.plot_spectrogram_to_numpy(
|
||||
stats_ssl[0].data.cpu().numpy()
|
||||
),
|
||||
}
|
||||
utils.summarize(
|
||||
writer=writer,
|
||||
global_step=global_step,
|
||||
images=image_dict,
|
||||
scalars=scalar_dict,
|
||||
)
|
||||
image_dict=None
|
||||
try:###Some people installed the wrong version of matplotlib.
|
||||
image_dict = {
|
||||
"slice/mel_org": utils.plot_spectrogram_to_numpy(
|
||||
y_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"slice/mel_gen": utils.plot_spectrogram_to_numpy(
|
||||
y_hat_mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/mel": utils.plot_spectrogram_to_numpy(
|
||||
mel[0].data.cpu().numpy()
|
||||
),
|
||||
"all/stats_ssl": utils.plot_spectrogram_to_numpy(
|
||||
stats_ssl[0].data.cpu().numpy()
|
||||
),
|
||||
}
|
||||
except:pass
|
||||
if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,)
|
||||
else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,)
|
||||
global_step += 1
|
||||
if epoch % hps.train.save_every_epoch == 0 and rank == 0:
|
||||
if hps.train.if_save_latest == 0:
|
||||
|
@ -8,66 +8,7 @@ jieba.setLogLevel(logging.CRITICAL)
|
||||
# 更改fast_langdetect大模型位置
|
||||
from pathlib import Path
|
||||
import fast_langdetect
|
||||
fast_langdetect.ft_detect.infer.CACHE_DIRECTORY = Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
|
||||
|
||||
# 防止win下无法读取模型
|
||||
import os
|
||||
from typing import Optional
|
||||
def load_fasttext_model(
    model_path: Path,
    download_url: Optional[str] = None,
    proxy: Optional[str] = None,
):
    """
    Load a FastText model, downloading it if necessary.

    Workaround for model paths the FastText loader cannot be given directly:
    when the path contains characters other than ASCII letters, digits,
    ``_``, ``/``, backslash, ``:`` and ``.``, the model is loaded through a
    path relative to the current working directory if it lies below it, and
    otherwise through a temporary copy that is removed afterwards.

    :param model_path: Path to the FastText model file
    :param download_url: URL to download the model from
    :param proxy: Proxy URL for downloading the model
    :return: FastText model
    :raises DetectError: If the model file is missing or loading fails
    """
    infer = fast_langdetect.ft_detect.infer

    # Optional integrity check; a mismatch only produces a warning.
    if all([
        infer.VERIFY_FASTTEXT_LARGE_MODEL,
        model_path.exists(),
        model_path.name == infer.FASTTEXT_LARGE_MODEL_NAME,
    ]):
        if not infer.verify_md5(model_path, infer.VERIFY_FASTTEXT_LARGE_MODEL):
            infer.logger.warning(
                f"fast-langdetect: MD5 hash verification failed for {model_path}, "
                f"please check the integrity of the downloaded file from {infer.FASTTEXT_LARGE_MODEL_URL}. "
                "\n This may seriously reduce the prediction accuracy. "
                "If you want to ignore this, please set `fast_langdetect.ft_detect.infer.VERIFY_FASTTEXT_LARGE_MODEL = None` "
            )

    if not model_path.exists():
        if download_url:
            infer.download_model(download_url, model_path, proxy)
        if not model_path.exists():
            raise infer.DetectError(f"FastText model file not found at {model_path}")

    try:
        # Load FastText model
        if re.match(r'^[A-Za-z0-9_/\\:.]*$', str(model_path)):
            model = infer.fasttext.load_model(str(model_path))
        else:
            python_dir = os.getcwd()
            if str(model_path)[:len(python_dir)].upper() == python_dir.upper():
                # Below the working directory: a relative path avoids the
                # problematic characters in the absolute prefix.
                model = infer.fasttext.load_model(os.path.relpath(model_path, python_dir))
            else:
                import shutil
                import tempfile

                tmpfile = tempfile.NamedTemporaryFile(delete=False)
                try:
                    with tmpfile:
                        shutil.copyfile(model_path, tmpfile.name)
                    model = infer.fasttext.load_model(tmpfile.name)
                finally:
                    # delete=False means nobody else cleans this up; remove the
                    # copy even when copying or loading fails.
                    os.unlink(tmpfile.name)
        return model

    except Exception as e:
        infer.logger.warning(f"fast-langdetect:Failed to load FastText model from {model_path}: {e}")
        raise infer.DetectError(f"Failed to load FastText model: {e}") from e
|
||||
|
||||
if os.name == 'nt':
|
||||
fast_langdetect.ft_detect.infer.load_fasttext_model = load_fasttext_model
|
||||
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
|
||||
|
||||
|
||||
from split_lang import LangSplitter
|
||||
|
@ -17,6 +17,8 @@ pinyin_to_symbol_map = {
|
||||
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
|
||||
}
|
||||
|
||||
import jieba_fast, logging
|
||||
jieba_fast.setLogLevel(logging.CRITICAL)
|
||||
import jieba_fast.posseg as psg
|
||||
|
||||
|
||||
|
@ -18,13 +18,15 @@ pinyin_to_symbol_map = {
|
||||
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
|
||||
}
|
||||
|
||||
import jieba_fast, logging
|
||||
jieba_fast.setLogLevel(logging.CRITICAL)
|
||||
import jieba_fast.posseg as psg
|
||||
|
||||
# is_g2pw_str = os.environ.get("is_g2pw", "True")##默认开启
|
||||
# is_g2pw = False#True if is_g2pw_str.lower() == 'true' else False
|
||||
is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False
|
||||
if is_g2pw:
|
||||
print("当前使用g2pw进行拼音推理")
|
||||
# print("当前使用g2pw进行拼音推理")
|
||||
from text.g2pw import G2PWPinyin, correct_pronunciation
|
||||
parent_directory = os.path.dirname(current_file_path)
|
||||
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)
|
||||
|
@ -58,7 +58,7 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
|
||||
extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
|
||||
extract_dir_new = os.path.join(parent_directory,"G2PWModel")
|
||||
print("Downloading g2pw model...")
|
||||
modelscope_url = "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
||||
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
||||
with requests.get(modelscope_url, stream=True) as r:
|
||||
r.raise_for_status()
|
||||
with open(zip_dir, 'wb') as f:
|
||||
|
@ -10,7 +10,7 @@ try:
|
||||
if os.name == 'nt':
|
||||
python_dir = os.getcwd()
|
||||
OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8")
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.]*$', OPEN_JTALK_DICT_DIR)):
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', OPEN_JTALK_DICT_DIR)):
|
||||
if (OPEN_JTALK_DICT_DIR[:len(python_dir)].upper() == python_dir.upper()):
|
||||
OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR,python_dir))
|
||||
else:
|
||||
@ -25,7 +25,7 @@ try:
|
||||
OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic")
|
||||
pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8")
|
||||
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.]*$', current_file_path)):
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', current_file_path)):
|
||||
if (current_file_path[:len(python_dir)].upper() == python_dir.upper()):
|
||||
current_file_path = os.path.join(os.path.relpath(current_file_path,python_dir))
|
||||
else:
|
||||
|
@ -19,13 +19,13 @@ if os.name == 'nt':
|
||||
print(f'you have to install eunjeon. install it...')
|
||||
else:
|
||||
installpath = spam_spec.submodule_search_locations[0]
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.]*$', installpath)):
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
|
||||
|
||||
import sys
|
||||
from eunjeon import Mecab as _Mecab
|
||||
class Mecab(_Mecab):
|
||||
def get_dicpath(installpath):
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.]*$', installpath)):
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
|
||||
import shutil
|
||||
python_dir = os.getcwd()
|
||||
if (installpath[:len(python_dir)].upper() == python_dir.upper()):
|
||||
|
@ -1,42 +1,37 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "himHYZmra7ix"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "e9b7iFV3dm1f"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||
"%cd GPT-SoVITS\n",
|
||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||
"!pip install -r extra-req.txt --no-deps\n",
|
||||
"!pip install -r requirements.txt"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "0NgxXg5sjv7z"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title Download pretrained models 下载预训练模型\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||
@ -53,16 +48,16 @@
|
||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||
"!git config core.sparseCheckout true\n",
|
||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "cPDEH-9czOJF"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#@title Create folder models 创建文件夹模型\n",
|
||||
"import os\n",
|
||||
@ -77,16 +72,16 @@
|
||||
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
|
||||
"\n",
|
||||
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "cPDEH-9czOJF"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "vbZY-LnM0tzq"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"import zipfile\n",
|
||||
@ -124,29 +119,35 @@
|
||||
" shutil.move(source_path, destination_path)\n",
|
||||
"\n",
|
||||
"print(f'Model downloaded. (模型已下载。)')"
|
||||
],
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "vbZY-LnM0tzq"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"cellView": "form",
|
||||
"id": "4oRGUzkrk8C7"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title launch WebUI 启动WebUI\n",
|
||||
"!/usr/local/bin/pip install ipykernel\n",
|
||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||
"%cd /content/GPT-SoVITS/\n",
|
||||
"!/usr/local/bin/python webui.py"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7",
|
||||
"cellView": "form"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
|
43
README.md
43
README.md
@ -1,6 +1,5 @@
|
||||
<div align="center">
|
||||
|
||||
|
||||
<h1>GPT-SoVITS-WebUI</h1>
|
||||
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
||||
|
||||
@ -54,7 +53,7 @@ _Note: numba==0.56.4 requires py<3.11_
|
||||
|
||||
### Windows
|
||||
|
||||
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
|
||||
If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
|
||||
|
||||
**Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**
|
||||
|
||||
@ -77,6 +76,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
|
||||
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
|
||||
|
||||
##### MacOS Users
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -112,6 +113,7 @@ brew install ffmpeg
|
||||
#### Install Dependencies
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -146,13 +148,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
|
||||
|
||||
2. Download G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only)
|
||||
2. Download G2PW models from [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only)
|
||||
|
||||
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
|
||||
|
||||
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
|
||||
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
|
||||
|
||||
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
|
||||
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
|
||||
|
||||
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
|
||||
|
||||
@ -200,6 +202,7 @@ if you want to switch to V1,then
|
||||
```bash
|
||||
python webui.py v1 <language(optional)>
|
||||
```
|
||||
|
||||
Or manually switch version in WebUI
|
||||
|
||||
### Finetune
|
||||
@ -217,18 +220,20 @@ Or maunally switch version in WebUI
|
||||
|
||||
#### Integrated Package Users
|
||||
|
||||
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||
|
||||
#### Others
|
||||
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <language(optional)>
|
||||
```
|
||||
|
||||
OR
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
|
||||
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||
|
||||
## V2 Release Notes
|
||||
@ -243,7 +248,7 @@ New Features:
|
||||
|
||||
4. Improved synthesis quality for low-quality reference audio
|
||||
|
||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
Use v2 from v1 environment:
|
||||
|
||||
@ -253,7 +258,7 @@ Use v2 from v1 environment:
|
||||
|
||||
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
|
||||
|
||||
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
|
||||
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
|
||||
|
||||
## V3 Release Notes
|
||||
|
||||
@ -263,7 +268,7 @@ New Features:
|
||||
|
||||
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
|
||||
|
||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
Use v3 from v2 environment:
|
||||
|
||||
@ -273,8 +278,7 @@ Use v3 from v2 environment:
|
||||
|
||||
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
|
||||
|
||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
## Todo List
|
||||
|
||||
@ -297,15 +301,20 @@ Use v3 from v2 environment:
|
||||
- [ ] model mix
|
||||
|
||||
## (Additional) Method for running from the command line
|
||||
|
||||
Use the command line to open the WebUI for UVR5
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
This is how the audio segmentation of the dataset is done using the command line
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
@ -315,16 +324,21 @@ python audio_slicer.py \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
```
|
||||
|
||||
This is how dataset ASR processing is done using the command line (Chinese only)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
ASR processing is performed through Faster_Whisper (ASR labeling for languages other than Chinese)
|
||||
|
||||
(No progress bars, GPU performance may cause time delays)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
A custom list save path is enabled
|
||||
|
||||
## Credits
|
||||
@ -332,6 +346,7 @@ A custom list save path is enabled
|
||||
Special thanks to the following projects and contributors:
|
||||
|
||||
### Theoretical Research
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### Pretrained Models
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### Text Frontend for Inference
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI Tools
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
273
api.py
273
api.py
@ -150,9 +150,9 @@ sys.path.append(now_dir)
|
||||
sys.path.append("%s/GPT_SoVITS" % (now_dir))
|
||||
|
||||
import signal
|
||||
import LangSegment
|
||||
from text.LangSegmenter import LangSegmenter
|
||||
from time import time as ttime
|
||||
import torch
|
||||
import torch, torchaudio
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from fastapi import FastAPI, Request, Query, HTTPException
|
||||
@ -162,7 +162,8 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
import numpy as np
|
||||
from feature_extractor import cnhubert
|
||||
from io import BytesIO
|
||||
from module.models import SynthesizerTrn
|
||||
from module.models import SynthesizerTrn, SynthesizerTrnV3
|
||||
from peft import LoraConfig, PeftModel, get_peft_model
|
||||
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
|
||||
from text import cleaned_text_to_sequence
|
||||
from text.cleaner import clean_text
|
||||
@ -197,6 +198,61 @@ def is_full(*items): # 任意一项为空返回False
|
||||
return True
|
||||
|
||||
|
||||
def init_bigvgan():
    """Load the pretrained BigVGAN model into the global ``bigvgan_model``.

    The checkpoint is read from the
    ``models--nvidia--bigvgan_v2_24khz_100band_256x`` directory under
    ``now_dir``; weight norm is removed, the model is put in eval mode, cast
    to half precision when ``is_half`` is set, and moved to ``device``.
    """
    global bigvgan_model
    from BigVGAN import bigvgan

    pretrained_dir = "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,)
    # use_cuda_kernel=True fails with
    # "RuntimeError: Ninja is required to load C++ extensions".
    bigvgan_model = bigvgan.BigVGAN.from_pretrained(pretrained_dir, use_cuda_kernel=False)

    # remove weight norm in the model and set to eval mode
    bigvgan_model.remove_weight_norm()
    bigvgan_model = bigvgan_model.eval()

    prepared = bigvgan_model.half() if is_half == True else bigvgan_model
    bigvgan_model = prepared.to(device)
|
||||
|
||||
|
||||
resample_transform_dict={}
|
||||
def resample(audio_tensor, sr0):
    """Resample ``audio_tensor`` from ``sr0`` to 24000 Hz.

    One ``torchaudio.transforms.Resample`` instance per source rate is created
    on ``device`` and cached in ``resample_transform_dict`` for reuse.
    """
    transform = resample_transform_dict.get(sr0)
    if transform is None:
        transform = torchaudio.transforms.Resample(sr0, 24000).to(device)
        resample_transform_dict[sr0] = transform
    return transform(audio_tensor)
|
||||
|
||||
|
||||
from module.mel_processing import spectrogram_torch,mel_spectrogram_torch
|
||||
# Value range that norm_spec / denorm_spec map to and from [-1, 1].
spec_min = -12
spec_max = 2


def norm_spec(x):
    """Linearly map ``x`` from ``[spec_min, spec_max]`` onto ``[-1, 1]``."""
    span = spec_max - spec_min
    return (x - spec_min) / span * 2 - 1


def denorm_spec(x):
    """Inverse of ``norm_spec``: map ``x`` from ``[-1, 1]`` back to ``[spec_min, spec_max]``."""
    span = spec_max - spec_min
    return (x + 1) / 2 * span + spec_min
|
||||
# Mel-spectrogram helper with fixed settings: 1024-point FFT and window,
# hop of 256, 100 mel bins, 24000 Hz sampling rate, no upper frequency bound,
# and no centering.
mel_fn = lambda x: mel_spectrogram_torch(
    x,
    n_fft=1024,
    win_size=1024,
    hop_size=256,
    num_mels=100,
    sampling_rate=24000,
    fmin=0,
    fmax=None,
    center=False,
)
|
||||
|
||||
|
||||
sr_model=None
|
||||
def audio_sr(audio, sr):
    """Run the audio super-resolution model on ``audio``.

    The ``AP_BWE`` model is created lazily on first use and kept in the
    global ``sr_model``. If its weight files are missing
    (``FileNotFoundError``), super-resolution is skipped: the input is
    returned as a NumPy array together with the unchanged ``sr``.

    :param audio: audio tensor to process
    :param sr: sampling rate of ``audio``
    :return: whatever ``sr_model(audio, sr)`` returns, or ``(ndarray, sr)``
        when the model is unavailable
    """
    global sr_model
    # Identity test: `== None` would dispatch to a custom __eq__ if the model
    # object ever defined one.
    if sr_model is None:
        from tools.audio_sr import AP_BWE
        try:
            sr_model = AP_BWE(device, DictToAttrRecursive)
        except FileNotFoundError:
            # Log message (Chinese): the super-resolution weights were not
            # downloaded, so super-resolution is skipped; see the tutorial.
            logger.info("你没有下载超分模型的参数,因此不进行超分。如想超分请先参照教程把文件下载")
            return audio.cpu().detach().numpy(), sr
    return sr_model(audio, sr)
|
||||
|
||||
|
||||
class Speaker:
|
||||
def __init__(self, name, gpt, sovits, phones = None, bert = None, prompt = None):
|
||||
self.name = name
|
||||
@ -214,31 +270,72 @@ class Sovits:
|
||||
self.vq_model = vq_model
|
||||
self.hps = hps
|
||||
|
||||
from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new
|
||||
def get_sovits_weights(sovits_path):
|
||||
dict_s2 = torch.load(sovits_path, map_location="cpu")
|
||||
path_sovits_v3="GPT_SoVITS/pretrained_models/s2Gv3.pth"
|
||||
is_exist_s2gv3=os.path.exists(path_sovits_v3)
|
||||
|
||||
version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path)
|
||||
if if_lora_v3==True and is_exist_s2gv3==False:
|
||||
logger.info("SoVITS V3 底模缺失,无法加载相应 LoRA 权重")
|
||||
|
||||
dict_s2 = load_sovits_new(sovits_path)
|
||||
hps = dict_s2["config"]
|
||||
hps = DictToAttrRecursive(hps)
|
||||
hps.model.semantic_frame_rate = "25hz"
|
||||
if dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322:
|
||||
if 'enc_p.text_embedding.weight' not in dict_s2['weight']:
|
||||
hps.model.version = "v2"#v3model,v2sybomls
|
||||
elif dict_s2['weight']['enc_p.text_embedding.weight'].shape[0] == 322:
|
||||
hps.model.version = "v1"
|
||||
else:
|
||||
hps.model.version = "v2"
|
||||
logger.info(f"模型版本: {hps.model.version}")
|
||||
|
||||
if model_version == "v3":
|
||||
hps.model.version = "v3"
|
||||
|
||||
model_params_dict = vars(hps.model)
|
||||
vq_model = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**model_params_dict
|
||||
)
|
||||
if model_version!="v3":
|
||||
vq_model = SynthesizerTrn(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**model_params_dict
|
||||
)
|
||||
else:
|
||||
vq_model = SynthesizerTrnV3(
|
||||
hps.data.filter_length // 2 + 1,
|
||||
hps.train.segment_size // hps.data.hop_length,
|
||||
n_speakers=hps.data.n_speakers,
|
||||
**model_params_dict
|
||||
)
|
||||
init_bigvgan()
|
||||
model_version=hps.model.version
|
||||
logger.info(f"模型版本: {model_version}")
|
||||
if ("pretrained" not in sovits_path):
|
||||
del vq_model.enc_q
|
||||
try:
|
||||
del vq_model.enc_q
|
||||
except:pass
|
||||
if is_half == True:
|
||||
vq_model = vq_model.half().to(device)
|
||||
else:
|
||||
vq_model = vq_model.to(device)
|
||||
vq_model.eval()
|
||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
if if_lora_v3 == False:
|
||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
else:
|
||||
vq_model.load_state_dict(load_sovits_new(path_sovits_v3)["weight"], strict=False)
|
||||
lora_rank=dict_s2["lora_rank"]
|
||||
lora_config = LoraConfig(
|
||||
target_modules=["to_k", "to_q", "to_v", "to_out.0"],
|
||||
r=lora_rank,
|
||||
lora_alpha=lora_rank,
|
||||
init_lora_weights=True,
|
||||
)
|
||||
vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
|
||||
vq_model.load_state_dict(dict_s2["weight"], strict=False)
|
||||
vq_model.cfm = vq_model.cfm.merge_and_unload()
|
||||
# torch.save(vq_model.state_dict(),"merge_win.pth")
|
||||
vq_model.eval()
|
||||
|
||||
sovits = Sovits(vq_model, hps)
|
||||
return sovits
|
||||
@ -260,8 +357,8 @@ def get_gpt_weights(gpt_path):
|
||||
t2s_model = t2s_model.half()
|
||||
t2s_model = t2s_model.to(device)
|
||||
t2s_model.eval()
|
||||
total = sum([param.nelement() for param in t2s_model.parameters()])
|
||||
logger.info("Number of parameter: %.2fM" % (total / 1e6))
|
||||
# total = sum([param.nelement() for param in t2s_model.parameters()])
|
||||
# logger.info("Number of parameter: %.2fM" % (total / 1e6))
|
||||
|
||||
gpt = Gpt(max_sec, t2s_model)
|
||||
return gpt
|
||||
@ -295,6 +392,7 @@ def get_bert_feature(text, word2ph):
|
||||
|
||||
|
||||
def clean_text_inf(text, language, version):
|
||||
language = language.replace("all_","")
|
||||
phones, word2ph, norm_text = clean_text(text, language, version)
|
||||
phones = cleaned_text_to_sequence(phones, version)
|
||||
return phones, word2ph, norm_text
|
||||
@ -315,16 +413,10 @@ def get_bert_inf(phones, word2ph, norm_text, language):
|
||||
from text import chinese
|
||||
def get_phones_and_bert(text,language,version,final=False):
|
||||
if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
|
||||
language = language.replace("all_","")
|
||||
if language == "en":
|
||||
LangSegment.setfilters(["en"])
|
||||
formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
|
||||
else:
|
||||
# 因无法区别中日韩文汉字,以用户输入为准
|
||||
formattext = text
|
||||
formattext = text
|
||||
while " " in formattext:
|
||||
formattext = formattext.replace(" ", " ")
|
||||
if language == "zh":
|
||||
if language == "all_zh":
|
||||
if re.search(r'[A-Za-z]', formattext):
|
||||
formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
|
||||
formattext = chinese.mix_text_normalize(formattext)
|
||||
@ -332,7 +424,7 @@ def get_phones_and_bert(text,language,version,final=False):
|
||||
else:
|
||||
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
|
||||
bert = get_bert_feature(norm_text, word2ph).to(device)
|
||||
elif language == "yue" and re.search(r'[A-Za-z]', formattext):
|
||||
elif language == "all_yue" and re.search(r'[A-Za-z]', formattext):
|
||||
formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
|
||||
formattext = chinese.mix_text_normalize(formattext)
|
||||
return get_phones_and_bert(formattext,"yue",version)
|
||||
@ -345,19 +437,18 @@ def get_phones_and_bert(text,language,version,final=False):
|
||||
elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
|
||||
textlist=[]
|
||||
langlist=[]
|
||||
LangSegment.setfilters(["zh","ja","en","ko"])
|
||||
if language == "auto":
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
for tmp in LangSegmenter.getTexts(text):
|
||||
langlist.append(tmp["lang"])
|
||||
textlist.append(tmp["text"])
|
||||
elif language == "auto_yue":
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
for tmp in LangSegmenter.getTexts(text):
|
||||
if tmp["lang"] == "zh":
|
||||
tmp["lang"] = "yue"
|
||||
langlist.append(tmp["lang"])
|
||||
textlist.append(tmp["text"])
|
||||
else:
|
||||
for tmp in LangSegment.getTexts(text):
|
||||
for tmp in LangSegmenter.getTexts(text):
|
||||
if tmp["lang"] == "en":
|
||||
langlist.append(tmp["lang"])
|
||||
else:
|
||||
@ -556,10 +647,11 @@ def only_punc(text):
|
||||
|
||||
|
||||
splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
|
||||
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, top_k= 15, top_p = 0.6, temperature = 0.6, speed = 1, inp_refs = None, spk = "default"):
|
||||
def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, top_k= 15, top_p = 0.6, temperature = 0.6, speed = 1, inp_refs = None, sample_steps = 32, if_sr = False, spk = "default"):
|
||||
infer_sovits = speaker_list[spk].sovits
|
||||
vq_model = infer_sovits.vq_model
|
||||
hps = infer_sovits.hps
|
||||
version = vq_model.version
|
||||
|
||||
infer_gpt = speaker_list[spk].gpt
|
||||
t2s_model = infer_gpt.t2s_model
|
||||
@ -587,20 +679,22 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
prompt_semantic = codes[0, 0]
|
||||
prompt = prompt_semantic.unsqueeze(0).to(device)
|
||||
|
||||
refers=[]
|
||||
if(inp_refs):
|
||||
for path in inp_refs:
|
||||
try:
|
||||
refer = get_spepc(hps, path).to(dtype).to(device)
|
||||
refers.append(refer)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
if(len(refers)==0):
|
||||
refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
|
||||
if version != "v3":
|
||||
refers=[]
|
||||
if(inp_refs):
|
||||
for path in inp_refs:
|
||||
try:
|
||||
refer = get_spepc(hps, path).to(dtype).to(device)
|
||||
refers.append(refer)
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
if(len(refers)==0):
|
||||
refers = [get_spepc(hps, ref_wav_path).to(dtype).to(device)]
|
||||
else:
|
||||
refer = get_spepc(hps, ref_wav_path).to(device).to(dtype)
|
||||
|
||||
t1 = ttime()
|
||||
version = vq_model.version
|
||||
os.environ['version'] = version
|
||||
# os.environ['version'] = version
|
||||
prompt_language = dict_language[prompt_language.lower()]
|
||||
text_language = dict_language[text_language.lower()]
|
||||
phones1, bert1, norm_text1 = get_phones_and_bert(prompt_text, prompt_language, version)
|
||||
@ -634,20 +728,82 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
early_stop_num=hz * max_sec)
|
||||
pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
|
||||
t3 = ttime()
|
||||
audio = \
|
||||
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
|
||||
refers,speed=speed).detach().cpu().numpy()[
|
||||
0, 0] ###试试重建不带上prompt部分
|
||||
|
||||
if version != "v3":
|
||||
audio = \
|
||||
vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0),
|
||||
refers,speed=speed).detach().cpu().numpy()[
|
||||
0, 0] ###试试重建不带上prompt部分
|
||||
else:
|
||||
phoneme_ids0=torch.LongTensor(phones1).to(device).unsqueeze(0)
|
||||
phoneme_ids1=torch.LongTensor(phones2).to(device).unsqueeze(0)
|
||||
# print(11111111, phoneme_ids0, phoneme_ids1)
|
||||
fea_ref,ge = vq_model.decode_encp(prompt.unsqueeze(0), phoneme_ids0, refer)
|
||||
ref_audio, sr = torchaudio.load(ref_wav_path)
|
||||
ref_audio=ref_audio.to(device).float()
|
||||
if (ref_audio.shape[0] == 2):
|
||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||
if sr!=24000:
|
||||
ref_audio=resample(ref_audio,sr)
|
||||
# print("ref_audio",ref_audio.abs().mean())
|
||||
mel2 = mel_fn(ref_audio)
|
||||
mel2 = norm_spec(mel2)
|
||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||
mel2 = mel2[:, :, :T_min]
|
||||
fea_ref = fea_ref[:, :, :T_min]
|
||||
if (T_min > 468):
|
||||
mel2 = mel2[:, :, -468:]
|
||||
fea_ref = fea_ref[:, :, -468:]
|
||||
T_min = 468
|
||||
chunk_len = 934 - T_min
|
||||
# print("fea_ref",fea_ref,fea_ref.shape)
|
||||
# print("mel2",mel2)
|
||||
mel2=mel2.to(dtype)
|
||||
fea_todo, ge = vq_model.decode_encp(pred_semantic, phoneme_ids1, refer, ge,speed)
|
||||
# print("fea_todo",fea_todo)
|
||||
# print("ge",ge.abs().mean())
|
||||
cfm_resss = []
|
||||
idx = 0
|
||||
while (1):
|
||||
fea_todo_chunk = fea_todo[:, :, idx:idx + chunk_len]
|
||||
if (fea_todo_chunk.shape[-1] == 0): break
|
||||
idx += chunk_len
|
||||
fea = torch.cat([fea_ref, fea_todo_chunk], 2).transpose(2, 1)
|
||||
# set_seed(123)
|
||||
cfm_res = vq_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
||||
cfm_res = cfm_res[:, :, mel2.shape[2]:]
|
||||
mel2 = cfm_res[:, :, -T_min:]
|
||||
# print("fea", fea)
|
||||
# print("mel2in", mel2)
|
||||
fea_ref = fea_todo_chunk[:, :, -T_min:]
|
||||
cfm_resss.append(cfm_res)
|
||||
cmf_res = torch.cat(cfm_resss, 2)
|
||||
cmf_res = denorm_spec(cmf_res)
|
||||
if bigvgan_model==None:init_bigvgan()
|
||||
with torch.inference_mode():
|
||||
wav_gen = bigvgan_model(cmf_res)
|
||||
audio=wav_gen[0][0].cpu().detach().numpy()
|
||||
|
||||
max_audio=np.abs(audio).max()
|
||||
if max_audio>1:
|
||||
audio/=max_audio
|
||||
audio_opt.append(audio)
|
||||
audio_opt.append(zero_wav)
|
||||
audio_opt = np.concatenate(audio_opt, 0)
|
||||
t4 = ttime()
|
||||
|
||||
sr = hps.data.sampling_rate if version != "v3" else 24000
|
||||
if if_sr and sr == 24000:
|
||||
audio_opt = torch.from_numpy(audio_opt).float().to(device)
|
||||
audio_opt,sr=audio_sr(audio_opt.unsqueeze(0),sr)
|
||||
max_audio=np.abs(audio_opt).max()
|
||||
if max_audio > 1: audio_opt /= max_audio
|
||||
sr = 48000
|
||||
|
||||
if is_int32:
|
||||
audio_bytes = pack_audio(audio_bytes,(np.concatenate(audio_opt, 0) * 2147483647).astype(np.int32),hps.data.sampling_rate)
|
||||
audio_bytes = pack_audio(audio_bytes,(audio_opt * 2147483647).astype(np.int32),sr)
|
||||
else:
|
||||
audio_bytes = pack_audio(audio_bytes,(np.concatenate(audio_opt, 0) * 32768).astype(np.int16),hps.data.sampling_rate)
|
||||
audio_bytes = pack_audio(audio_bytes,(audio_opt * 32768).astype(np.int16),sr)
|
||||
# logger.info("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
|
||||
if stream_mode == "normal":
|
||||
audio_bytes, audio_chunk = read_clean_buffer(audio_bytes)
|
||||
@ -655,7 +811,9 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
||||
|
||||
if not stream_mode == "normal":
|
||||
if media_type == "wav":
|
||||
audio_bytes = pack_wav(audio_bytes,hps.data.sampling_rate)
|
||||
sr = 48000 if if_sr else 24000
|
||||
sr = hps.data.sampling_rate if version != "v3" else sr
|
||||
audio_bytes = pack_wav(audio_bytes,sr)
|
||||
yield audio_bytes.getvalue()
|
||||
|
||||
|
||||
@ -688,7 +846,7 @@ def handle_change(path, text, language):
|
||||
return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
|
||||
|
||||
|
||||
def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs):
|
||||
def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr):
|
||||
if (
|
||||
refer_wav_path == "" or refer_wav_path is None
|
||||
or prompt_text == "" or prompt_text is None
|
||||
@ -702,12 +860,15 @@ def handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cu
|
||||
if not default_refer.is_ready():
|
||||
return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
|
||||
|
||||
if not sample_steps in [4,8,16,32]:
|
||||
sample_steps = 32
|
||||
|
||||
if cut_punc == None:
|
||||
text = cut_text(text,default_cut_punc)
|
||||
else:
|
||||
text = cut_text(text,cut_punc)
|
||||
|
||||
return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language, top_k, top_p, temperature, speed, inp_refs), media_type="audio/"+media_type)
|
||||
return StreamingResponse(get_tts_wav(refer_wav_path, prompt_text, prompt_language, text, text_language, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr), media_type="audio/"+media_type)
|
||||
|
||||
|
||||
|
||||
@ -915,7 +1076,9 @@ async def tts_endpoint(request: Request):
|
||||
json_post_raw.get("top_p", 1.0),
|
||||
json_post_raw.get("temperature", 1.0),
|
||||
json_post_raw.get("speed", 1.0),
|
||||
json_post_raw.get("inp_refs", [])
|
||||
json_post_raw.get("inp_refs", []),
|
||||
json_post_raw.get("sample_steps", 32),
|
||||
json_post_raw.get("if_sr", False)
|
||||
)
|
||||
|
||||
|
||||
@ -931,9 +1094,11 @@ async def tts_endpoint(
|
||||
top_p: float = 1.0,
|
||||
temperature: float = 1.0,
|
||||
speed: float = 1.0,
|
||||
inp_refs: list = Query(default=[])
|
||||
inp_refs: list = Query(default=[]),
|
||||
sample_steps: int = 32,
|
||||
if_sr: bool = False
|
||||
):
|
||||
return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs)
|
||||
return handle(refer_wav_path, prompt_text, prompt_language, text, text_language, cut_punc, top_k, top_p, temperature, speed, inp_refs, sample_steps, if_sr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
24
api_v2.py
24
api_v2.py
@ -39,6 +39,8 @@ POST:
|
||||
"seed": -1, # int. random seed for reproducibility.
|
||||
"parallel_infer": True, # bool. whether to use parallel inference.
|
||||
"repetition_penalty": 1.35, # float. repetition penalty for T2S model.
|
||||
"sample_steps": 32, # int. number of sampling steps for VITS model V3.
|
||||
"super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
|
||||
}
|
||||
```
|
||||
|
||||
@ -164,6 +166,8 @@ class TTS_Request(BaseModel):
|
||||
streaming_mode:bool = False
|
||||
parallel_infer:bool = True
|
||||
repetition_penalty:float = 1.35
|
||||
sample_steps:int = 32
|
||||
super_sampling:bool = False
|
||||
|
||||
### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
|
||||
def pack_ogg(io_buffer:BytesIO, data:np.ndarray, rate:int):
|
||||
@ -294,7 +298,9 @@ async def tts_handle(req:dict):
|
||||
"media_type": "wav", # str. media type of the output audio, support "wav", "raw", "ogg", "aac".
|
||||
"streaming_mode": False, # bool. whether to return a streaming response.
|
||||
"parallel_infer": True, # bool.(optional) whether to use parallel inference.
|
||||
"repetition_penalty": 1.35 # float.(optional) repetition penalty for T2S model.
|
||||
"repetition_penalty": 1.35, # float.(optional) repetition penalty for T2S model.
|
||||
"sample_steps": 32, # int. number of sampling steps for VITS model V3.
|
||||
"super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
|
||||
}
|
||||
returns:
|
||||
StreamingResponse: audio stream response.
|
||||
@ -316,10 +322,12 @@ async def tts_handle(req:dict):
|
||||
|
||||
if streaming_mode:
|
||||
def streaming_generator(tts_generator:Generator, media_type:str):
|
||||
if media_type == "wav":
|
||||
yield wave_header_chunk()
|
||||
media_type = "raw"
|
||||
if_frist_chunk = True
|
||||
for sr, chunk in tts_generator:
|
||||
if if_frist_chunk and media_type == "wav":
|
||||
yield wave_header_chunk(sample_rate=sr)
|
||||
media_type = "raw"
|
||||
if_frist_chunk = False
|
||||
yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
|
||||
# _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
|
||||
return StreamingResponse(streaming_generator(tts_generator, media_type, ), media_type=f"audio/{media_type}")
|
||||
@ -365,7 +373,9 @@ async def tts_get_endpoint(
|
||||
media_type:str = "wav",
|
||||
streaming_mode:bool = False,
|
||||
parallel_infer:bool = True,
|
||||
repetition_penalty:float = 1.35
|
||||
repetition_penalty:float = 1.35,
|
||||
sample_steps:int =32,
|
||||
super_sampling:bool = False
|
||||
):
|
||||
req = {
|
||||
"text": text,
|
||||
@ -387,7 +397,9 @@ async def tts_get_endpoint(
|
||||
"media_type":media_type,
|
||||
"streaming_mode":streaming_mode,
|
||||
"parallel_infer":parallel_infer,
|
||||
"repetition_penalty":float(repetition_penalty)
|
||||
"repetition_penalty":float(repetition_penalty),
|
||||
"sample_steps":int(sample_steps),
|
||||
"super_sampling":super_sampling
|
||||
}
|
||||
return await tts_handle(req)
|
||||
|
||||
|
@ -1,23 +1,10 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
"colab_type": "text",
|
||||
"id": "view-in-github"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
@ -25,18 +12,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"环境配置 environment"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "_o6a8GS2lWQM"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"环境配置 environment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "e9b7iFV3dm1f"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install -q condacolab\n",
|
||||
"# Setting up condacolab and installing packages\n",
|
||||
@ -47,13 +36,17 @@
|
||||
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
|
||||
"%cd -q /content/GPT-SoVITS\n",
|
||||
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
|
||||
"!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
|
||||
"!/usr/local/bin/pip install -r requirements.txt"
|
||||
],
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title Download pretrained models 下载预训练模型\n",
|
||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||
@ -71,27 +64,35 @@
|
||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||
"!git config core.sparseCheckout true\n",
|
||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0NgxXg5sjv7z"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# @title launch WebUI 启动WebUI\n",
|
||||
"!/usr/local/bin/pip install ipykernel\n",
|
||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||
"%cd /content/GPT-SoVITS/\n",
|
||||
"!/usr/local/bin/python webui.py"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4oRGUzkrk8C7"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
]
|
||||
}
|
||||
]
|
||||
],
|
||||
"metadata": {
|
||||
"accelerator": "GPU",
|
||||
"colab": {
|
||||
"include_colab_link": true,
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"name": "python3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
||||
|
@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
|
||||
修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122
|
||||
|
||||
修复v3sovits未传参以支持调节语速
|
||||
|
||||
### 202503
|
||||
|
||||
修复一批由依赖的库版本不对导致的问题 https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
|
||||
|
||||
修复模型加载异步逻辑 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
||||
|
||||
修复其他若干bug
|
||||
|
||||
重点更新:
|
||||
|
||||
1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
|
||||
|
||||
2-整合包修复onnxruntime GPU推理的支持,影响:(1)g2pw有个onnx模型原先是CPU推理现在用GPU,显著降低推理的CPU瓶颈 (2)foxjoy去混响模型现在可使用GPU推理
|
||||
|
@ -53,7 +53,7 @@ _注: numba==0.56.4 需要 python<3.11_
|
||||
|
||||
### Windows
|
||||
|
||||
如果你是 Windows 用户(已在 win>=10 上测试),可以下载[下载整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。
|
||||
如果你是 Windows 用户(已在 win>=10 上测试),可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true),解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI。
|
||||
|
||||
**中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO)。**
|
||||
|
||||
@ -76,6 +76,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -101,9 +102,10 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
|
||||
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。
|
||||
|
||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语TTS)
|
||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
|
||||
|
||||
##### MacOS 用户
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -111,6 +113,7 @@ brew install ffmpeg
|
||||
#### 安装依赖
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -147,14 +150,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
|
||||
|
||||
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文TTS)
|
||||
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文 TTS)
|
||||
|
||||
3. 对于 UVR5(人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
|
||||
|
||||
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
|
||||
|
||||
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
|
||||
- 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
|
||||
|
||||
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_band_roformer`、`bs_roformer`。如果未指定,将从配置文件中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
|
||||
|
||||
4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
|
||||
|
||||
@ -184,12 +186,12 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
|
||||
|
||||
## 微调与推理
|
||||
|
||||
### 打开WebUI
|
||||
### 打开 WebUI
|
||||
|
||||
#### 整合包用户
|
||||
|
||||
双击`go-webui.bat`或者使用`go-webui.ps1`
|
||||
若想使用V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
|
||||
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
|
||||
|
||||
#### 其他
|
||||
|
||||
@ -197,12 +199,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
|
||||
python webui.py <language(optional)>
|
||||
```
|
||||
|
||||
若想使用V1,则
|
||||
若想使用 V1,则
|
||||
|
||||
```bash
|
||||
python webui.py v1 <language(optional)>
|
||||
```
|
||||
或者在webUI内动态切换
|
||||
|
||||
或者在 webUI 内动态切换
|
||||
|
||||
### 微调
|
||||
|
||||
@ -215,25 +218,27 @@ python webui.py v1 <language(optional)>
|
||||
5. 校对标注
|
||||
6. 前往下一个窗口,点击训练
|
||||
|
||||
### 打开推理WebUI
|
||||
### 打开推理 WebUI
|
||||
|
||||
#### 整合包用户
|
||||
|
||||
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
|
||||
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
||||
|
||||
#### 其他
|
||||
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <language(optional)>
|
||||
```
|
||||
|
||||
或者
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
|
||||
|
||||
## V2发布说明
|
||||
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
||||
|
||||
## V2 发布说明
|
||||
|
||||
新特性:
|
||||
|
||||
@ -241,42 +246,41 @@ python webui.py
|
||||
|
||||
2. 更好的文本前端
|
||||
|
||||
3. 底模由2k小时扩展至5k小时
|
||||
3. 底模由 2k 小时扩展至 5k 小时
|
||||
|
||||
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
|
||||
|
||||
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
从v1环境迁移至v2
|
||||
从 v1 环境迁移至 v2
|
||||
|
||||
1. 需要pip安装requirements.txt更新环境
|
||||
1. 需要 pip 安装 requirements.txt 更新环境
|
||||
|
||||
2. 需要克隆github上的最新代码
|
||||
2. 需要克隆 github 上的最新代码
|
||||
|
||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到GPT_SoVITS\pretrained_models\gsv-v2final-pretrained下
|
||||
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
|
||||
|
||||
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载G2PW模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
||||
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
|
||||
|
||||
## V3更新说明
|
||||
## V3 更新说明
|
||||
|
||||
新模型特点:
|
||||
|
||||
1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大)
|
||||
|
||||
2. GPT合成更稳定,重复漏字更少,也更容易跑出丰富情感
|
||||
2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
|
||||
|
||||
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
从v2环境迁移至v3
|
||||
从 v2 环境迁移至 v3
|
||||
|
||||
1. 需要pip安装requirements.txt更新环境
|
||||
1. 需要 pip 安装 requirements.txt 更新环境
|
||||
|
||||
2. 需要克隆github上的最新代码
|
||||
2. 需要克隆 github 上的最新代码
|
||||
|
||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些v3新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS\pretrained_models`目录下
|
||||
|
||||
如果想用音频超分功能缓解v3模型生成24k音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将它们放到`GPT_SoVITS\pretrained_models`目录下
|
||||
|
||||
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
||||
|
||||
## 待办事项清单
|
||||
|
||||
@ -299,16 +303,21 @@ python webui.py
|
||||
- [ ] 模型混合。
|
||||
|
||||
## (附加)命令行运行方式
|
||||
使用命令行打开UVR5的WebUI
|
||||
````
|
||||
|
||||
使用命令行打开 UVR5 的 WebUI
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
````
|
||||
```
|
||||
|
||||
<!-- 如果打不开浏览器,请按照下面的格式进行UVR处理,这是使用mdxnet进行音频处理的方式
|
||||
````
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
```` -->
|
||||
|
||||
这是使用命令行完成数据集的音频切分的方式
|
||||
````
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||
@ -316,17 +325,22 @@ python audio_slicer.py \
|
||||
--min_length <minimum_duration_of_each_subclip> \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
````
|
||||
这是使用命令行完成数据集ASR处理的方式(仅限中文)
|
||||
````
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
````
|
||||
通过Faster_Whisper进行ASR处理(除中文之外的ASR标记)
|
||||
```
|
||||
|
||||
这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
|
||||
|
||||
(没有进度条,GPU 性能可能会导致时间延迟)
|
||||
|
||||
(没有进度条,GPU性能可能会导致时间延迟)
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
启用自定义列表保存路径
|
||||
|
||||
## 致谢
|
||||
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
特别感谢以下项目和贡献者:
|
||||
|
||||
### 理论研究
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### 预训练模型
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### 推理用文本前端
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI 工具
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
@ -20,17 +20,17 @@
|
||||
|
||||
## 機能:
|
||||
|
||||
1. **Zero-Shot TTS:** たった5秒間の音声サンプルで、即座にテキストからその音声に変換できます。
|
||||
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。
|
||||
|
||||
2. **Few-Shot TTS:** わずか1分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
|
||||
2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
|
||||
|
||||
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
|
||||
|
||||
4. **WebUI ツール:** 統合されたツールは、音声と伴奏(BGM等)の分離、トレーニングセットの自動セグメンテーション、ASR(中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成やGPT/SoVITSモデルのトレーニング等を非常に簡単に行えます。
|
||||
4. **WebUI ツール:** 統合されたツールは、音声と伴奏(BGM 等)の分離、トレーニングセットの自動セグメンテーション、ASR(中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。
|
||||
|
||||
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
|
||||
|
||||
声の事前学習無しかつFew-Shotでトレーニングされたモデルのデモ:
|
||||
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
|
||||
|
||||
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
|
||||
|
||||
@ -43,13 +43,13 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
|
||||
- Python 3.9, PyTorch 2.0.1, CUDA 11
|
||||
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
|
||||
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
|
||||
- Python 3.9, PyTorch 2.2.2, CPUデバイス
|
||||
- Python 3.9, PyTorch 2.2.2, CPU デバイス
|
||||
|
||||
_注記: numba==0.56.4 は py<3.11 が必要です_
|
||||
|
||||
### Windows
|
||||
|
||||
Windows ユーザー:(Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。
|
||||
Windows ユーザー:(Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します。
|
||||
|
||||
### Linux
|
||||
|
||||
@ -61,22 +61,22 @@ bash install.sh
|
||||
|
||||
### macOS
|
||||
|
||||
**注:MacでGPUを使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面はCPUを使用して訓練することを強く推奨します。**
|
||||
**注:Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。**
|
||||
|
||||
1. `xcode-select --install` を実行して、Xcodeコマンドラインツールをインストールします。
|
||||
2. `brew install ffmpeg` を実行してFFmpegをインストールします。
|
||||
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。
|
||||
2. `brew install ffmpeg` を実行して FFmpeg をインストールします。
|
||||
3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
|
||||
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 手動インストール
|
||||
|
||||
#### FFmpegをインストールします。
|
||||
#### FFmpeg をインストールします。
|
||||
|
||||
##### Conda ユーザー
|
||||
|
||||
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
|
||||
|
||||
##### MacOS ユーザー
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -104,6 +105,7 @@ brew install ffmpeg
|
||||
#### 依存関係をインストールします
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -138,17 +140,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。
|
||||
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語TTSのみ)
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語 TTS のみ)
|
||||
|
||||
3. UVR5(ボーカル/伴奏(BGM等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
|
||||
3. UVR5(ボーカル/伴奏(BGM 等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
|
||||
|
||||
- UVR5でbs_roformerまたはmel_band_roformerモデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformerクラスのモデルとして認識されます。
|
||||
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。
|
||||
|
||||
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_mand_roformer、bs_roformer。指定しない場合、設定文から特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
|
||||
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_band_roformer、bs_roformer。指定しない場合、設定ファイルから特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
|
||||
|
||||
4. 中国語ASR(追加機能)の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
|
||||
4. 中国語 ASR(追加機能)の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
|
||||
|
||||
5. 英語または日本語のASR(追加機能)を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
|
||||
5. 英語または日本語の ASR(追加機能)を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
|
||||
|
||||
## データセット形式
|
||||
|
||||
@ -169,14 +171,15 @@ vocal_path|speaker_name|language|text
|
||||
```
|
||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||
```
|
||||
|
||||
## 微調整と推論
|
||||
|
||||
### WebUIを開く
|
||||
### WebUI を開く
|
||||
|
||||
#### 統合パッケージ利用者
|
||||
|
||||
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。
|
||||
V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
|
||||
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
|
||||
|
||||
#### その他
|
||||
|
||||
@ -184,12 +187,13 @@ V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
|
||||
python webui.py <言語(オプション)>
|
||||
```
|
||||
|
||||
V1に切り替えたい場合は
|
||||
V1 に切り替えたい場合は
|
||||
|
||||
```bash
|
||||
python webui.py v1 <言語(オプション)>
|
||||
```
|
||||
またはWebUIで手動でバージョンを切り替えてください。
|
||||
|
||||
または WebUI で手動でバージョンを切り替えてください。
|
||||
|
||||
### 微調整
|
||||
|
||||
@ -202,25 +206,27 @@ python webui.py v1 <言語(オプション)>
|
||||
5. ASR転写を校正する
|
||||
6. 次のタブに移動し、モデルを微調整する
|
||||
|
||||
### 推論WebUIを開く
|
||||
### 推論 WebUI を開く
|
||||
|
||||
#### 統合パッケージ利用者
|
||||
|
||||
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
|
||||
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
|
||||
|
||||
#### その他
|
||||
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <言語(オプション)>
|
||||
```
|
||||
|
||||
または
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論webuiを開きます。
|
||||
|
||||
## V2リリースノート
|
||||
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
|
||||
|
||||
## V2 リリースノート
|
||||
|
||||
新機能:
|
||||
|
||||
@ -228,21 +234,21 @@ python webui.py
|
||||
|
||||
2. 最適化されたテキストフロントエンド
|
||||
|
||||
3. 事前学習済みモデルが2千時間から5千時間に拡張
|
||||
3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
|
||||
|
||||
4. 低品質の参照音声に対する合成品質の向上
|
||||
|
||||
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
V1環境からV2を使用するには:
|
||||
V1 環境から V2 を使用するには:
|
||||
|
||||
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
|
||||
|
||||
2. 最新のコードをgithubからクローン
|
||||
2. 最新のコードを github からクローン
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)からV2の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
|
||||
|
||||
中国語V2追加: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PWモデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
||||
中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
|
||||
|
||||
## V3 リリースノート
|
||||
|
||||
@ -250,19 +256,19 @@ V1環境からV2を使用するには:
|
||||
|
||||
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。
|
||||
|
||||
2. GPTモデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
|
||||
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
|
||||
|
||||
[詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
v2 環境から v3 を使用する方法:
|
||||
|
||||
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
|
||||
|
||||
2. GitHubから最新のコードをクローンします。
|
||||
2. GitHub から最新のコードをクローンします。
|
||||
|
||||
3. v3の事前学習済みモデル(s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
|
||||
3. v3 の事前学習済みモデル(s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS\pretrained_models フォルダに配置します。
|
||||
|
||||
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
|
||||
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
|
||||
|
||||
## Todo リスト
|
||||
|
||||
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
|
||||
- [ ] モデルミックス
|
||||
|
||||
## (追加の) コマンドラインから実行する方法
|
||||
|
||||
コマンド ラインを使用して UVR5 の WebUI を開きます
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
@ -303,16 +314,21 @@ python audio_slicer.py \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
```
|
||||
|
||||
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASRマーキング)
|
||||
|
||||
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
|
||||
|
||||
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
カスタムリストの保存パスが有効になっています
|
||||
|
||||
## クレジット
|
||||
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
特に以下のプロジェクトと貢献者に感謝します:
|
||||
|
||||
### 理論研究
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### 事前学習モデル
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### 推論用テキストフロントエンド
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI ツール
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
@ -49,7 +49,7 @@ _참고: numba==0.56.4 는 python<3.11 을 필요로 합니다._
|
||||
|
||||
### Windows
|
||||
|
||||
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
|
||||
Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
|
||||
|
||||
### Linux
|
||||
|
||||
@ -70,7 +70,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
|
||||
|
||||
##### MacOS 사용자
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -106,6 +107,7 @@ brew install ffmpeg
|
||||
#### 의존성 설치
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -143,13 +145,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
|
||||
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
|
||||
|
||||
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
|
||||
|
||||
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
||||
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
|
||||
|
||||
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
||||
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_band_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
|
||||
|
||||
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
|
||||
|
||||
@ -195,6 +197,7 @@ V1으로 전환하려면,
|
||||
```bash
|
||||
python webui.py v1 <언어(옵션)>
|
||||
```
|
||||
|
||||
또는 WebUI에서 수동으로 버전을 전환하십시오.
|
||||
|
||||
### 미세 조정
|
||||
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <언어(옵션)>
|
||||
```
|
||||
|
||||
또는
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
|
||||
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
||||
|
||||
## V2 릴리스 노트
|
||||
@ -238,7 +243,7 @@ python webui.py
|
||||
|
||||
4. 저품질 참조 오디오에 대한 합성 품질 향상
|
||||
|
||||
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
V1 환경에서 V2를 사용하려면:
|
||||
|
||||
@ -248,7 +253,7 @@ V1 환경에서 V2를 사용하려면:
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
|
||||
|
||||
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
||||
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
|
||||
|
||||
## V3 릴리스 노트
|
||||
|
||||
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
|
||||
|
||||
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
|
||||
|
||||
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
v2 환경에서 v3 사용하기:
|
||||
|
||||
@ -268,8 +273,7 @@ v2 환경에서 v3 사용하기:
|
||||
|
||||
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
|
||||
|
||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
||||
|
||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
||||
|
||||
## 할 일 목록
|
||||
|
||||
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
|
||||
- [ ] 모델 블렌딩.
|
||||
|
||||
## (추가적인) 명령줄에서 실행하는 방법
|
||||
|
||||
명령줄을 사용하여 UVR5용 WebUI 열기
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||
@ -311,16 +320,21 @@ python audio_slicer.py \
|
||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||
--hop_size <step_size_for_computing_volume_curve>
|
||||
```
|
||||
|
||||
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||
```
|
||||
|
||||
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
||||
|
||||
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||
```
|
||||
|
||||
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
||||
|
||||
## 감사의 말
|
||||
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
||||
|
||||
### 이론 연구
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### 사전 학습 모델
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### 추론용 텍스트 프론트엔드
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI 도구
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
@ -51,7 +51,7 @@ _Not: numba==0.56.4, py<3.11 gerektirir_
|
||||
|
||||
### Windows
|
||||
|
||||
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-beta.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
|
||||
Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
|
||||
|
||||
### Linux
|
||||
|
||||
@ -72,7 +72,7 @@ bash install.sh
|
||||
```bash
|
||||
conda create -n GPTSoVits python=3.9
|
||||
conda activate GPTSoVits
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
|
||||
|
||||
##### MacOS Kullanıcıları
|
||||
|
||||
```bash
|
||||
brew install ffmpeg
|
||||
```
|
||||
@ -106,6 +107,7 @@ brew install ffmpeg
|
||||
#### Bağımlılıkları Yükleme
|
||||
|
||||
```bash
|
||||
pip install -r extra-req.txt --no-deps
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
@ -138,13 +140,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
||||
|
||||
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
|
||||
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
|
||||
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
|
||||
|
||||
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
|
||||
|
||||
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
|
||||
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
|
||||
|
||||
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
|
||||
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_band_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
|
||||
|
||||
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
|
||||
|
||||
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
|
||||
```bash
|
||||
python webui.py v1 <dil(isteğe bağlı)>
|
||||
```
|
||||
|
||||
veya WebUI'de manuel olarak sürüm değiştirin.
|
||||
|
||||
### İnce Ayar
|
||||
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
|
||||
```bash
|
||||
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
|
||||
```
|
||||
|
||||
VEYA
|
||||
|
||||
```bash
|
||||
python webui.py
|
||||
```
|
||||
|
||||
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
||||
|
||||
## V2 Sürüm Notları
|
||||
@ -235,7 +240,7 @@ Yeni Özellikler:
|
||||
|
||||
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
|
||||
|
||||
[detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
@ -245,7 +250,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
|
||||
|
||||
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
|
||||
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
|
||||
|
||||
## V3 Sürüm Notları
|
||||
|
||||
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
|
||||
|
||||
[daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
||||
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||
|
||||
### v2 ortamında v3 kullanımı:
|
||||
|
||||
@ -265,7 +270,7 @@ V1 ortamından V2'yi kullanmak için:
|
||||
|
||||
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
|
||||
|
||||
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
|
||||
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
|
||||
|
||||
## Yapılacaklar Listesi
|
||||
|
||||
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
|
||||
- [ ] model karışımı
|
||||
|
||||
## (Ekstra) Komut satırından çalıştırma yöntemi
|
||||
|
||||
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
|
||||
|
||||
```
|
||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||
```
|
||||
|
||||
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin. Bu, ses işleme için mdxnet kullanıyor.
|
||||
```
|
||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||
``` -->
|
||||
|
||||
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
|
||||
|
||||
```
|
||||
python audio_slicer.py \
|
||||
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
|
||||
@ -306,16 +316,21 @@ python audio_slicer.py \
|
||||
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
|
||||
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
|
||||
```
|
||||
|
||||
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
|
||||
|
||||
```
|
||||
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
|
||||
```
|
||||
|
||||
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
|
||||
|
||||
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
|
||||
|
||||
```
|
||||
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
```
|
||||
|
||||
Özel bir liste kaydetme yolu etkinleştirildi
|
||||
|
||||
## Katkı Verenler
|
||||
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
|
||||
|
||||
### Teorik Araştırma
|
||||
|
||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||
- [vits](https://github.com/jaywalnut310/vits)
|
||||
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||
|
||||
### Önceden Eğitilmiş Modeller
|
||||
|
||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||
|
||||
### Tahmin İçin Metin Ön Ucu
|
||||
|
||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||
|
||||
### WebUI Araçları
|
||||
|
||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||
|
1
extra-req.txt
Normal file
1
extra-req.txt
Normal file
@ -0,0 +1 @@
|
||||
faster-whisper
|
@ -27,7 +27,8 @@
|
||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||
"%cd GPT-SoVITS\n",
|
||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||
"!pip install -r requirements.txt"
|
||||
"!pip install -r requirements.txt\n",
|
||||
"!pip install -r extra-req.txt --no-deps"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
111
install.sh
111
install.sh
@ -1,6 +1,109 @@
|
||||
#!/bin/bash
|
||||
conda install -c conda-forge gcc=14
|
||||
conda install -c conda-forge gxx
|
||||
conda install ffmpeg cmake
|
||||
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
|
||||
|
||||
set -e
|
||||
|
||||
# 安装构建工具
|
||||
# Install build tools
|
||||
echo "Installing GCC..."
|
||||
conda install -c conda-forge gcc=14 -y
|
||||
|
||||
echo "Installing G++..."
|
||||
conda install -c conda-forge gxx -y
|
||||
|
||||
echo "Installing ffmpeg and cmake..."
|
||||
conda install ffmpeg cmake -y
|
||||
|
||||
# 设置编译环境
|
||||
# Set up build environment
|
||||
export CMAKE_MAKE_PROGRAM="$CONDA_PREFIX/bin/cmake"
|
||||
export CC="$CONDA_PREFIX/bin/gcc"
|
||||
export CXX="$CONDA_PREFIX/bin/g++"
|
||||
|
||||
echo "Checking for CUDA installation..."
|
||||
if command -v nvidia-smi &>/dev/null; then
|
||||
USE_CUDA=true
|
||||
echo "CUDA found."
|
||||
else
|
||||
echo "CUDA not found."
|
||||
USE_CUDA=false
|
||||
fi
|
||||
|
||||
if [ "$USE_CUDA" = false ]; then
|
||||
echo "Checking for ROCm installation..."
|
||||
if [ -d "/opt/rocm" ]; then
|
||||
USE_ROCM=true
|
||||
echo "ROCm found."
|
||||
if grep -qi "microsoft" /proc/version; then
|
||||
echo "You are running WSL."
|
||||
IS_WSL=true
|
||||
else
|
||||
echo "You are NOT running WSL."
|
||||
IS_WSL=false
|
||||
fi
|
||||
else
|
||||
echo "ROCm not found."
|
||||
USE_ROCM=false
|
||||
fi
|
||||
fi
|
||||
|
||||
# Install the PyTorch build that matches the accelerator detected above.
# Every conda call passes -y so an unattended run never blocks on the
# confirmation prompt, consistent with the other conda installs in this script.
if [ "$USE_CUDA" = true ]; then
    echo "Installing PyTorch with CUDA support..."
    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia -y
elif [ "$USE_ROCM" = true ]; then
    echo "Installing PyTorch with ROCm support..."
    pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
else
    echo "Installing PyTorch for CPU..."
    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch -y
fi
|
||||
|
||||
echo "Installing Python dependencies from requirements.txt..."
|
||||
|
||||
# 刷新环境
|
||||
# Refresh environment
|
||||
hash -r
|
||||
|
||||
# pyopenjtalk Installation
|
||||
conda install jq -y
|
||||
|
||||
OS_TYPE=$(uname)
|
||||
|
||||
PACKAGE_NAME="pyopenjtalk"
|
||||
|
||||
VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
|
||||
|
||||
wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
|
||||
|
||||
TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
|
||||
DIR_NAME="${TAR_FILE%.tar.gz}"
|
||||
|
||||
tar -xzf "$TAR_FILE"
|
||||
rm "$TAR_FILE"
|
||||
|
||||
CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
|
||||
|
||||
# Relax the cmake_minimum_required() line of the bundled open_jtalk sources so
# they configure with current CMake releases.
# `uname` prints "Darwin" (capital D) on macOS, so accept either case of the
# first letter. BSD sed requires an explicit (empty) backup suffix after -i,
# while GNU sed takes -i with no argument.
if [[ "$OS_TYPE" == [Dd]arwin* ]]; then
    sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
else
    sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
fi
|
||||
|
||||
tar -czf "$TAR_FILE" "$DIR_NAME"
|
||||
|
||||
pip install "$TAR_FILE"
|
||||
|
||||
rm -rf "$TAR_FILE" "$DIR_NAME"
|
||||
|
||||
pip install -r extra-req.txt --no-deps
|
||||
|
||||
pip install -r requirements.txt
|
||||
|
||||
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
||||
echo "Update to WSL compatible runtime lib..."
|
||||
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
|
||||
cd "${location}"/torch/lib/ || exit
|
||||
rm libhsa-runtime64.so*
|
||||
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
|
||||
fi
|
||||
|
||||
echo "Installation completed successfully!"
|
||||
|
@ -3,7 +3,7 @@ scipy
|
||||
tensorboard
|
||||
librosa==0.9.2
|
||||
numba==0.56.4
|
||||
pytorch-lightning
|
||||
pytorch-lightning>2.0
|
||||
gradio>=4.0,<=4.24.0
|
||||
ffmpeg-python
|
||||
onnxruntime; sys_platform == 'darwin'
|
||||
@ -25,8 +25,7 @@ psutil
|
||||
jieba_fast
|
||||
jieba
|
||||
split-lang
|
||||
fast_langdetect
|
||||
Faster_Whisper
|
||||
fast_langdetect>=0.3.0
|
||||
wordsegment
|
||||
rotary_embedding_torch
|
||||
ToJyutping
|
||||
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
|
||||
fastapi<0.112.2
|
||||
x_transformers
|
||||
torchmetrics<=1.5
|
||||
attrdict
|
||||
pydantic<=2.10.6
|
||||
ctranslate2>=4.0,<5
|
||||
huggingface_hub>=0.13
|
||||
tokenizers>=0.13,<1
|
||||
av>=11
|
||||
tqdm
|
||||
|
@ -39,6 +39,11 @@ class AP_BWE():
|
||||
self.model=model
|
||||
self.h=h
|
||||
|
||||
def to(self, *args, **kwargs):
    """Forward a ``.to(...)`` call to the wrapped model and refresh ``self.device``.

    All positional and keyword arguments are passed straight through to
    ``self.model.to``. Returns ``self`` so calls can be chained.
    """
    self.model.to(*args, **kwargs)
    # Read the placement back from one of the model's own weights rather than
    # parsing the arguments, which may carry only a dtype.
    reference_weight = self.model.conv_pre_mag.weight
    self.device = reference_weight.device
    return self
|
||||
|
||||
def __call__(self, audio,orig_sampling_rate):
|
||||
with torch.no_grad():
|
||||
# audio, orig_sampling_rate = torchaudio.load(inp_path)
|
||||
|
@ -32,7 +32,7 @@ def clean_path(path_str:str):
|
||||
if path_str.endswith(('\\','/')):
|
||||
return clean_path(path_str[0:-1])
|
||||
path_str = path_str.replace('/', os.sep).replace('\\', os.sep)
|
||||
return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
|
||||
return path_str.strip(" \'\n\"\u202a")#path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
|
||||
|
||||
|
||||
def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False):
|
||||
|
10
webui.py
10
webui.py
@ -298,9 +298,9 @@ def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits
|
||||
cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language)
|
||||
else:
|
||||
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
|
||||
#####v3暂不支持加速推理
|
||||
if version=="v3":
|
||||
cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
|
||||
# #####v3暂不支持加速推理
|
||||
# if version=="v3":
|
||||
# cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
|
||||
if p_tts_inference is None:
|
||||
os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
|
||||
os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
|
||||
@ -849,8 +849,8 @@ def switch_version(version_):
|
||||
{'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \
|
||||
{'__type__': 'update', "visible": True if version!="v3"else False}, \
|
||||
{'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \
|
||||
{'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \
|
||||
{'__type__': 'update', "visible": True if version== "v3" else False}
|
||||
{'__type__': 'update', "interactive": True, "value": False}, \
|
||||
{'__type__': 'update', "visible": True if version== "v3" else False} # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer
|
||||
|
||||
if os.path.exists('GPT_SoVITS/text/G2PWModel'):...
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user