mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-05 19:41:56 +08:00
Compare commits
80 Commits
74c0e5a918
...
cc88d33348
Author | SHA1 | Date | |
---|---|---|---|
|
cc88d33348 | ||
|
03b662a769 | ||
|
6c468583c5 | ||
|
50a88a596d | ||
|
86e5b67448 | ||
|
2faf74beaa | ||
|
16b3c2a131 | ||
|
9f418af1dd | ||
|
5ffb193bcd | ||
|
56d6ae6b3b | ||
|
7c3c778b17 | ||
|
61b21e1fca | ||
|
18002ad809 | ||
|
12fa7d875f | ||
|
48cc70a7de | ||
|
3ac7aad4d0 | ||
|
036d828a7e | ||
|
7e1c40ef9f | ||
|
fdffd50066 | ||
|
8a10c528e3 | ||
|
02fabe807f | ||
|
4ebcb3bf1b | ||
|
5843d56c4e | ||
|
7660f1c8fb | ||
|
fa45c5ac4f | ||
|
2dc36d3d60 | ||
|
f70fd8ff87 | ||
|
ed8d276ac9 | ||
|
1de89feb7b | ||
|
b8356880dc | ||
|
8182908f7d | ||
|
5081168918 | ||
|
c26fa983a4 | ||
|
5280d17d2f | ||
|
371a2d7138 | ||
|
fe969ab9a2 | ||
|
61db7f05dc | ||
|
536c226b1a | ||
|
01468158d3 | ||
|
c9547ab669 | ||
|
b1ad8b5dcd | ||
|
d6e255a071 | ||
|
e89f986e3f | ||
|
af0bd9f414 | ||
|
13567362d9 | ||
|
27325f4cf9 | ||
|
6cb3c15448 | ||
|
9264f7e38e | ||
|
25b65cdfd0 | ||
|
1a7cf580e0 | ||
|
c36d0a93fe | ||
|
2a23f95f61 | ||
|
d1e92edc7c | ||
|
d8d551d4d2 | ||
|
1d434e1a0a | ||
|
9fe20c14d6 | ||
|
64cc2fd9d1 | ||
|
a291629438 | ||
|
e3e47d2c06 | ||
|
ca9ffbf98e | ||
|
684e1cfd2f | ||
|
878fef248a | ||
|
2880e3a6f8 | ||
|
1da23aa259 | ||
|
c8be484c0e | ||
|
d855eecc7b | ||
|
d20bd37965 | ||
|
926dd6b34a | ||
|
f61a723bab | ||
|
441ab54889 | ||
|
ecbc7d0b1e | ||
|
b6f0bb36ef | ||
|
4daa9ad53c | ||
|
2c8f6bd4c9 | ||
|
4cbbe2a258 | ||
|
a1fc00a9d8 | ||
|
8c9627bb30 | ||
|
e69e449599 | ||
|
29b8370c45 | ||
|
7efdf31113 |
178
.gitignore
vendored
178
.gitignore
vendored
@ -18,5 +18,183 @@ TEMP
|
|||||||
weight.json
|
weight.json
|
||||||
ffmpeg*
|
ffmpeg*
|
||||||
ffprobe*
|
ffprobe*
|
||||||
|
cfg.json
|
||||||
|
speakers.json
|
||||||
|
ref_audios
|
||||||
tools/AP_BWE_main/24kto48k/*
|
tools/AP_BWE_main/24kto48k/*
|
||||||
!tools/AP_BWE_main/24kto48k/readme.txt
|
!tools/AP_BWE_main/24kto48k/readme.txt
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
#uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||||
|
.pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
# Ruff stuff:
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# PyPI configuration file
|
||||||
|
.pypirc
|
||||||
|
@ -3,7 +3,7 @@ import math
|
|||||||
import os, sys, gc
|
import os, sys, gc
|
||||||
import random
|
import random
|
||||||
import traceback
|
import traceback
|
||||||
|
import time
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
now_dir = os.getcwd()
|
now_dir = os.getcwd()
|
||||||
@ -908,11 +908,14 @@ class TTS:
|
|||||||
split_bucket = False
|
split_bucket = False
|
||||||
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
|
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
|
||||||
|
|
||||||
if split_bucket and speed_factor==1.0:
|
if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
|
||||||
print(i18n("分桶处理模式已开启"))
|
print(i18n("分桶处理模式已开启"))
|
||||||
elif speed_factor!=1.0:
|
elif speed_factor!=1.0:
|
||||||
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
|
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
|
||||||
split_bucket = False
|
split_bucket = False
|
||||||
|
elif self.configs.is_v3_synthesizer and parallel_infer:
|
||||||
|
print(i18n("当开启并行推理模式时,SoVits V3模型不支持分桶处理,已自动关闭分桶处理"))
|
||||||
|
split_bucket = False
|
||||||
else:
|
else:
|
||||||
print(i18n("分桶处理模式已关闭"))
|
print(i18n("分桶处理模式已关闭"))
|
||||||
|
|
||||||
@ -936,7 +939,7 @@ class TTS:
|
|||||||
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
|
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
|
||||||
|
|
||||||
###### setting reference audio and prompt text preprocessing ########
|
###### setting reference audio and prompt text preprocessing ########
|
||||||
t0 = ttime()
|
t0 = time.perf_counter()
|
||||||
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
|
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
|
||||||
if not os.path.exists(ref_audio_path):
|
if not os.path.exists(ref_audio_path):
|
||||||
raise ValueError(f"{ref_audio_path} not exists")
|
raise ValueError(f"{ref_audio_path} not exists")
|
||||||
@ -975,7 +978,7 @@ class TTS:
|
|||||||
|
|
||||||
|
|
||||||
###### text preprocessing ########
|
###### text preprocessing ########
|
||||||
t1 = ttime()
|
t1 = time.perf_counter()
|
||||||
data:list = None
|
data:list = None
|
||||||
if not return_fragment:
|
if not return_fragment:
|
||||||
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
|
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
|
||||||
@ -1027,7 +1030,7 @@ class TTS:
|
|||||||
return batch[0]
|
return batch[0]
|
||||||
|
|
||||||
|
|
||||||
t2 = ttime()
|
t2 = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
print("############ 推理 ############")
|
print("############ 推理 ############")
|
||||||
###### inference ######
|
###### inference ######
|
||||||
@ -1036,7 +1039,7 @@ class TTS:
|
|||||||
audio = []
|
audio = []
|
||||||
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
|
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
|
||||||
for item in data:
|
for item in data:
|
||||||
t3 = ttime()
|
t3 = time.perf_counter()
|
||||||
if return_fragment:
|
if return_fragment:
|
||||||
item = make_batch(item)
|
item = make_batch(item)
|
||||||
if item is None:
|
if item is None:
|
||||||
@ -1071,7 +1074,7 @@ class TTS:
|
|||||||
max_len=max_len,
|
max_len=max_len,
|
||||||
repetition_penalty=repetition_penalty,
|
repetition_penalty=repetition_penalty,
|
||||||
)
|
)
|
||||||
t4 = ttime()
|
t4 = time.perf_counter()
|
||||||
t_34 += t4 - t3
|
t_34 += t4 - t3
|
||||||
|
|
||||||
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
|
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
|
||||||
@ -1094,6 +1097,7 @@ class TTS:
|
|||||||
print(f"############ {i18n('合成音频')} ############")
|
print(f"############ {i18n('合成音频')} ############")
|
||||||
if not self.configs.is_v3_synthesizer:
|
if not self.configs.is_v3_synthesizer:
|
||||||
if speed_factor == 1.0:
|
if speed_factor == 1.0:
|
||||||
|
print(f"{i18n('并行合成中')}...")
|
||||||
# ## vits并行推理 method 2
|
# ## vits并行推理 method 2
|
||||||
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
|
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
|
||||||
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
upsample_rate = math.prod(self.vits_model.upsample_rates)
|
||||||
@ -1117,6 +1121,17 @@ class TTS:
|
|||||||
batch_audio_fragment.append(
|
batch_audio_fragment.append(
|
||||||
audio_fragment
|
audio_fragment
|
||||||
) ###试试重建不带上prompt部分
|
) ###试试重建不带上prompt部分
|
||||||
|
else:
|
||||||
|
if parallel_infer:
|
||||||
|
print(f"{i18n('并行合成中')}...")
|
||||||
|
audio_fragments = self.v3_synthesis_batched_infer(
|
||||||
|
idx_list,
|
||||||
|
pred_semantic_list,
|
||||||
|
batch_phones,
|
||||||
|
speed=speed_factor,
|
||||||
|
sample_steps=sample_steps
|
||||||
|
)
|
||||||
|
batch_audio_fragment.extend(audio_fragments)
|
||||||
else:
|
else:
|
||||||
for i, idx in enumerate(tqdm(idx_list)):
|
for i, idx in enumerate(tqdm(idx_list)):
|
||||||
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||||
@ -1128,7 +1143,7 @@ class TTS:
|
|||||||
audio_fragment
|
audio_fragment
|
||||||
)
|
)
|
||||||
|
|
||||||
t5 = ttime()
|
t5 = time.perf_counter()
|
||||||
t_45 += t5 - t4
|
t_45 += t5 - t4
|
||||||
if return_fragment:
|
if return_fragment:
|
||||||
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
|
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
|
||||||
@ -1219,13 +1234,13 @@ class TTS:
|
|||||||
|
|
||||||
if super_sampling:
|
if super_sampling:
|
||||||
print(f"############ {i18n('音频超采样')} ############")
|
print(f"############ {i18n('音频超采样')} ############")
|
||||||
t1 = ttime()
|
t1 = time.perf_counter()
|
||||||
self.init_sr_model()
|
self.init_sr_model()
|
||||||
if not self.sr_model_not_exist:
|
if not self.sr_model_not_exist:
|
||||||
audio,sr=self.sr_model(audio.unsqueeze(0),sr)
|
audio,sr=self.sr_model(audio.unsqueeze(0),sr)
|
||||||
max_audio=np.abs(audio).max()
|
max_audio=np.abs(audio).max()
|
||||||
if max_audio > 1: audio /= max_audio
|
if max_audio > 1: audio /= max_audio
|
||||||
t2 = ttime()
|
t2 = time.perf_counter()
|
||||||
print(f"超采样用时:{t2-t1:.3f}s")
|
print(f"超采样用时:{t2-t1:.3f}s")
|
||||||
else:
|
else:
|
||||||
audio = audio.cpu().numpy()
|
audio = audio.cpu().numpy()
|
||||||
@ -1260,7 +1275,7 @@ class TTS:
|
|||||||
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||||
if ref_sr!=24000:
|
if ref_sr!=24000:
|
||||||
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
|
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
|
||||||
# print("ref_audio",ref_audio.abs().mean())
|
|
||||||
mel2 = mel_fn(ref_audio)
|
mel2 = mel_fn(ref_audio)
|
||||||
mel2 = norm_spec(mel2)
|
mel2 = norm_spec(mel2)
|
||||||
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||||
@ -1285,15 +1300,156 @@ class TTS:
|
|||||||
|
|
||||||
cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
||||||
cfm_res = cfm_res[:, :, mel2.shape[2]:]
|
cfm_res = cfm_res[:, :, mel2.shape[2]:]
|
||||||
mel2 = cfm_res[:, :, -T_min:]
|
|
||||||
|
|
||||||
|
mel2 = cfm_res[:, :, -T_min:]
|
||||||
fea_ref = fea_todo_chunk[:, :, -T_min:]
|
fea_ref = fea_todo_chunk[:, :, -T_min:]
|
||||||
|
|
||||||
cfm_resss.append(cfm_res)
|
cfm_resss.append(cfm_res)
|
||||||
cmf_res = torch.cat(cfm_resss, 2)
|
cfm_res = torch.cat(cfm_resss, 2)
|
||||||
cmf_res = denorm_spec(cmf_res)
|
cfm_res = denorm_spec(cfm_res)
|
||||||
|
|
||||||
|
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
wav_gen = self.bigvgan_model(cmf_res)
|
wav_gen = self.bigvgan_model(cfm_res)
|
||||||
audio=wav_gen[0][0]#.cpu().detach().numpy()
|
audio=wav_gen[0][0]#.cpu().detach().numpy()
|
||||||
|
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def v3_synthesis_batched_infer(self,
|
||||||
|
idx_list:List[int],
|
||||||
|
semantic_tokens_list:List[torch.Tensor],
|
||||||
|
batch_phones:List[torch.Tensor],
|
||||||
|
speed:float=1.0,
|
||||||
|
sample_steps:int=32
|
||||||
|
)->List[torch.Tensor]:
|
||||||
|
|
||||||
|
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
|
||||||
|
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
|
||||||
|
refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
|
||||||
|
|
||||||
|
fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
|
||||||
|
ref_audio:torch.Tensor = self.prompt_cache["raw_audio"]
|
||||||
|
ref_sr = self.prompt_cache["raw_sr"]
|
||||||
|
ref_audio=ref_audio.to(self.configs.device).float()
|
||||||
|
if (ref_audio.shape[0] == 2):
|
||||||
|
ref_audio = ref_audio.mean(0).unsqueeze(0)
|
||||||
|
if ref_sr!=24000:
|
||||||
|
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
|
||||||
|
|
||||||
|
mel2 = mel_fn(ref_audio)
|
||||||
|
mel2 = norm_spec(mel2)
|
||||||
|
T_min = min(mel2.shape[2], fea_ref.shape[2])
|
||||||
|
mel2 = mel2[:, :, :T_min]
|
||||||
|
fea_ref = fea_ref[:, :, :T_min]
|
||||||
|
if (T_min > 468):
|
||||||
|
mel2 = mel2[:, :, -468:]
|
||||||
|
fea_ref = fea_ref[:, :, -468:]
|
||||||
|
T_min = 468
|
||||||
|
chunk_len = 934 - T_min
|
||||||
|
|
||||||
|
mel2=mel2.to(self.precision)
|
||||||
|
|
||||||
|
|
||||||
|
# #### batched inference
|
||||||
|
overlapped_len = 12
|
||||||
|
feat_chunks = []
|
||||||
|
feat_lens = []
|
||||||
|
feat_list = []
|
||||||
|
|
||||||
|
for i, idx in enumerate(idx_list):
|
||||||
|
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
|
||||||
|
semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0)#mq要多unsqueeze一次
|
||||||
|
feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
|
||||||
|
feat_list.append(feat)
|
||||||
|
feat_lens.append(feat.shape[2])
|
||||||
|
|
||||||
|
feats = torch.cat(feat_list, 2)
|
||||||
|
feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0)
|
||||||
|
pos = 0
|
||||||
|
padding_len = 0
|
||||||
|
while True:
|
||||||
|
if pos ==0:
|
||||||
|
chunk = feats_padded[:, :, pos:pos + chunk_len]
|
||||||
|
else:
|
||||||
|
pos = pos - overlapped_len
|
||||||
|
chunk = feats_padded[:, :, pos:pos + chunk_len]
|
||||||
|
pos += chunk_len
|
||||||
|
if (chunk.shape[-1] == 0): break
|
||||||
|
|
||||||
|
# padding for the last chunk
|
||||||
|
padding_len = chunk_len - chunk.shape[2]
|
||||||
|
if padding_len != 0:
|
||||||
|
chunk = F.pad(chunk, (0,padding_len), "constant", 0)
|
||||||
|
feat_chunks.append(chunk)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
feat_chunks = torch.cat(feat_chunks, 0)
|
||||||
|
bs = feat_chunks.shape[0]
|
||||||
|
fea_ref = fea_ref.repeat(bs,1,1)
|
||||||
|
fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
|
||||||
|
pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
|
||||||
|
pred_spec = pred_spec[:, :, -chunk_len:]
|
||||||
|
dd = pred_spec.shape[1]
|
||||||
|
pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
|
||||||
|
# pred_spec = pred_spec[..., :-padding_len]
|
||||||
|
|
||||||
|
|
||||||
|
pred_spec = denorm_spec(pred_spec)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
wav_gen = self.bigvgan_model(pred_spec)
|
||||||
|
audio = wav_gen[0][0]#.cpu().detach().numpy()
|
||||||
|
|
||||||
|
|
||||||
|
audio_fragments = []
|
||||||
|
upsample_rate = 256
|
||||||
|
pos = 0
|
||||||
|
|
||||||
|
while pos < audio.shape[-1]:
|
||||||
|
audio_fragment = audio[pos:pos+chunk_len*upsample_rate]
|
||||||
|
audio_fragments.append(audio_fragment)
|
||||||
|
pos += chunk_len*upsample_rate
|
||||||
|
|
||||||
|
audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate)
|
||||||
|
audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate]
|
||||||
|
|
||||||
|
audio_fragments = []
|
||||||
|
for feat_len in feat_lens:
|
||||||
|
audio_fragment = audio[:feat_len*upsample_rate]
|
||||||
|
audio_fragments.append(audio_fragment)
|
||||||
|
audio = audio[feat_len*upsample_rate:]
|
||||||
|
|
||||||
|
|
||||||
|
return audio_fragments
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def sola_algorithm(self,
|
||||||
|
audio_fragments:List[torch.Tensor],
|
||||||
|
overlap_len:int,
|
||||||
|
):
|
||||||
|
|
||||||
|
for i in range(len(audio_fragments)-1):
|
||||||
|
f1 = audio_fragments[i]
|
||||||
|
f2 = audio_fragments[i+1]
|
||||||
|
w1 = f1[-overlap_len:]
|
||||||
|
w2 = f2[:overlap_len]
|
||||||
|
assert w1.shape == w2.shape
|
||||||
|
corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1]
|
||||||
|
idx = corr.argmax()
|
||||||
|
f1_ = f1[:-(overlap_len-idx)]
|
||||||
|
audio_fragments[i] = f1_
|
||||||
|
|
||||||
|
f2_ = f2[idx:]
|
||||||
|
window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype)
|
||||||
|
f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):]
|
||||||
|
audio_fragments[i+1] = f2_
|
||||||
|
|
||||||
|
|
||||||
|
return torch.cat(audio_fragments, 0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -398,4 +398,5 @@ arpa = {
|
|||||||
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
||||||
symbols = sorted(set(symbols))
|
symbols = sorted(set(symbols))
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
print(symbols)
|
||||||
print(len(symbols))
|
print(len(symbols))
|
||||||
|
@ -1,42 +1,37 @@
|
|||||||
{
|
{
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"provenance": []
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"name": "python3",
|
|
||||||
"display_name": "Python 3"
|
|
||||||
},
|
|
||||||
"accelerator": "GPU"
|
|
||||||
},
|
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "himHYZmra7ix"
|
"id": "himHYZmra7ix"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "e9b7iFV3dm1f"
|
"id": "e9b7iFV3dm1f"
|
||||||
},
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||||
"%cd GPT-SoVITS\n",
|
"%cd GPT-SoVITS\n",
|
||||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||||
|
"!pip install -r extra-req.txt --no-deps\n",
|
||||||
"!pip install -r requirements.txt"
|
"!pip install -r requirements.txt"
|
||||||
],
|
]
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"cellView": "form",
|
||||||
|
"id": "0NgxXg5sjv7z"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# @title Download pretrained models 下载预训练模型\n",
|
"# @title Download pretrained models 下载预训练模型\n",
|
||||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||||
@ -53,16 +48,16 @@
|
|||||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||||
"!git config core.sparseCheckout true\n",
|
"!git config core.sparseCheckout true\n",
|
||||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "0NgxXg5sjv7z",
|
|
||||||
"cellView": "form"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"cellView": "form",
|
||||||
|
"id": "cPDEH-9czOJF"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#@title Create folder models 创建文件夹模型\n",
|
"#@title Create folder models 创建文件夹模型\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
@ -77,16 +72,16 @@
|
|||||||
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
|
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
|
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"cellView": "form",
|
|
||||||
"id": "cPDEH-9czOJF"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"cellView": "form",
|
||||||
|
"id": "vbZY-LnM0tzq"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import requests\n",
|
"import requests\n",
|
||||||
"import zipfile\n",
|
"import zipfile\n",
|
||||||
@ -124,29 +119,35 @@
|
|||||||
" shutil.move(source_path, destination_path)\n",
|
" shutil.move(source_path, destination_path)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(f'Model downloaded. (模型已下载。)')"
|
"print(f'Model downloaded. (模型已下载。)')"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"cellView": "form",
|
|
||||||
"id": "vbZY-LnM0tzq"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"cellView": "form",
|
||||||
|
"id": "4oRGUzkrk8C7"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# @title launch WebUI 启动WebUI\n",
|
"# @title launch WebUI 启动WebUI\n",
|
||||||
"!/usr/local/bin/pip install ipykernel\n",
|
"!/usr/local/bin/pip install ipykernel\n",
|
||||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||||
"%cd /content/GPT-SoVITS/\n",
|
"%cd /content/GPT-SoVITS/\n",
|
||||||
"!/usr/local/bin/python webui.py"
|
"!/usr/local/bin/python webui.py"
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"id": "4oRGUzkrk8C7",
|
|
||||||
"cellView": "form"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"accelerator": "GPU",
|
||||||
|
"colab": {
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"name": "python3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
||||||
|
29
README.md
29
README.md
@ -1,6 +1,5 @@
|
|||||||
<div align="center">
|
<div align="center">
|
||||||
|
|
||||||
|
|
||||||
<h1>GPT-SoVITS-WebUI</h1>
|
<h1>GPT-SoVITS-WebUI</h1>
|
||||||
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
|
||||||
|
|
||||||
@ -77,6 +76,7 @@ bash install.sh
|
|||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
|
|||||||
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
|
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
|
||||||
|
|
||||||
##### MacOS Users
|
##### MacOS Users
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install ffmpeg
|
brew install ffmpeg
|
||||||
```
|
```
|
||||||
@ -112,6 +113,7 @@ brew install ffmpeg
|
|||||||
#### Install Dependences
|
#### Install Dependences
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -200,6 +202,7 @@ if you want to switch to V1,then
|
|||||||
```bash
|
```bash
|
||||||
python webui.py v1 <language(optional)>
|
python webui.py v1 <language(optional)>
|
||||||
```
|
```
|
||||||
|
|
||||||
Or maunally switch version in WebUI
|
Or maunally switch version in WebUI
|
||||||
|
|
||||||
### Finetune
|
### Finetune
|
||||||
@ -224,11 +227,13 @@ Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference
|
|||||||
```bash
|
```bash
|
||||||
python GPT_SoVITS/inference_webui.py <language(optional)>
|
python GPT_SoVITS/inference_webui.py <language(optional)>
|
||||||
```
|
```
|
||||||
|
|
||||||
OR
|
OR
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python webui.py
|
python webui.py
|
||||||
```
|
```
|
||||||
|
|
||||||
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
|
||||||
|
|
||||||
## V2 Release Notes
|
## V2 Release Notes
|
||||||
@ -243,7 +248,7 @@ New Features:
|
|||||||
|
|
||||||
4. Improved synthesis quality for low-quality reference audio
|
4. Improved synthesis quality for low-quality reference audio
|
||||||
|
|
||||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
Use v2 from v1 environment:
|
Use v2 from v1 environment:
|
||||||
|
|
||||||
@ -263,7 +268,7 @@ New Features:
|
|||||||
|
|
||||||
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
|
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
|
||||||
|
|
||||||
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
Use v3 from v2 environment:
|
Use v3 from v2 environment:
|
||||||
|
|
||||||
@ -275,7 +280,6 @@ Use v3 from v2 environment:
|
|||||||
|
|
||||||
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
|
||||||
|
|
||||||
|
|
||||||
## Todo List
|
## Todo List
|
||||||
|
|
||||||
- [x] **High Priority:**
|
- [x] **High Priority:**
|
||||||
@ -297,15 +301,20 @@ Use v3 from v2 environment:
|
|||||||
- [ ] model mix
|
- [ ] model mix
|
||||||
|
|
||||||
## (Additional) Method for running from the command line
|
## (Additional) Method for running from the command line
|
||||||
|
|
||||||
Use the command line to open the WebUI for UVR5
|
Use the command line to open the WebUI for UVR5
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
|
<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
|
||||||
```
|
```
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||||
``` -->
|
``` -->
|
||||||
|
|
||||||
This is how the audio segmentation of the dataset is done using the command line
|
This is how the audio segmentation of the dataset is done using the command line
|
||||||
|
|
||||||
```
|
```
|
||||||
python audio_slicer.py \
|
python audio_slicer.py \
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||||
@ -315,16 +324,21 @@ python audio_slicer.py \
|
|||||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
--hop_size <step_size_for_computing_volume_curve>
|
||||||
```
|
```
|
||||||
|
|
||||||
This is how dataset ASR processing is done using the command line(Only Chinese)
|
This is how dataset ASR processing is done using the command line(Only Chinese)
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||||
```
|
```
|
||||||
|
|
||||||
ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
|
ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
|
||||||
|
|
||||||
(No progress bars, GPU performance may cause time delays)
|
(No progress bars, GPU performance may cause time delays)
|
||||||
|
|
||||||
```
|
```
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||||
```
|
```
|
||||||
|
|
||||||
A custom list save path is enabled
|
A custom list save path is enabled
|
||||||
|
|
||||||
## Credits
|
## Credits
|
||||||
@ -332,6 +346,7 @@ A custom list save path is enabled
|
|||||||
Special thanks to the following projects and contributors:
|
Special thanks to the following projects and contributors:
|
||||||
|
|
||||||
### Theoretical Research
|
### Theoretical Research
|
||||||
|
|
||||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||||
- [vits](https://github.com/jaywalnut310/vits)
|
- [vits](https://github.com/jaywalnut310/vits)
|
||||||
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
|
|||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
### Pretrained Models
|
### Pretrained Models
|
||||||
|
|
||||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||||
|
|
||||||
### Text Frontend for Inference
|
### Text Frontend for Inference
|
||||||
|
|
||||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||||
|
|
||||||
### WebUI Tools
|
### WebUI Tools
|
||||||
|
|
||||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||||
|
0
Ref_Audio_Selector/__init__.py
Normal file
0
Ref_Audio_Selector/__init__.py
Normal file
0
Ref_Audio_Selector/common/__init__.py
Normal file
0
Ref_Audio_Selector/common/__init__.py
Normal file
156
Ref_Audio_Selector/common/common.py
Normal file
156
Ref_Audio_Selector/common/common.py
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
from tools import my_utils
|
||||||
|
from config import python_exec, is_half
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class RefAudioListManager:
|
||||||
|
def __init__(self, root_dir):
|
||||||
|
self.audio_dict = {'default': []}
|
||||||
|
absolute_root = os.path.abspath(root_dir)
|
||||||
|
|
||||||
|
for subdir, dirs, files in os.walk(absolute_root):
|
||||||
|
relative_path = os.path.relpath(subdir, absolute_root)
|
||||||
|
|
||||||
|
if relative_path == '.':
|
||||||
|
category = 'default'
|
||||||
|
else:
|
||||||
|
category = relative_path.replace(os.sep, '')
|
||||||
|
|
||||||
|
for file in files:
|
||||||
|
if file.endswith('.wav'):
|
||||||
|
# 将相对路径转换为绝对路径
|
||||||
|
audio_abs_path = os.path.join(subdir, file)
|
||||||
|
if category not in self.audio_dict:
|
||||||
|
self.audio_dict[category] = []
|
||||||
|
self.audio_dict[category].append(audio_abs_path)
|
||||||
|
|
||||||
|
def get_audio_list(self):
|
||||||
|
return self.audio_dict
|
||||||
|
|
||||||
|
def get_flattened_audio_list(self):
|
||||||
|
all_audio_files = []
|
||||||
|
for category_audios in self.audio_dict.values():
|
||||||
|
all_audio_files.extend(category_audios)
|
||||||
|
return all_audio_files
|
||||||
|
|
||||||
|
def get_ref_audio_list(self):
|
||||||
|
audio_info_list = []
|
||||||
|
for category, audio_paths in self.audio_dict.items():
|
||||||
|
for audio_path in audio_paths:
|
||||||
|
filename_without_extension = os.path.splitext(os.path.basename(audio_path))[0]
|
||||||
|
audio_info = {
|
||||||
|
'emotion': f"{category}-{filename_without_extension}",
|
||||||
|
'ref_path': audio_path,
|
||||||
|
'ref_text': filename_without_extension,
|
||||||
|
}
|
||||||
|
audio_info_list.append(audio_info)
|
||||||
|
return audio_info_list
|
||||||
|
|
||||||
|
|
||||||
|
def batch_clean_paths(paths):
|
||||||
|
"""
|
||||||
|
批量处理路径列表,对每个路径调用 clean_path() 函数。
|
||||||
|
|
||||||
|
参数:
|
||||||
|
paths (list[str]): 包含待处理路径的列表。
|
||||||
|
|
||||||
|
返回:
|
||||||
|
list[str]: 经过 clean_path() 处理后的路径列表。
|
||||||
|
"""
|
||||||
|
cleaned_paths = []
|
||||||
|
for path in paths:
|
||||||
|
cleaned_paths.append(my_utils.clean_path(path))
|
||||||
|
return cleaned_paths
|
||||||
|
|
||||||
|
|
||||||
|
def read_text_file_to_list(file_path):
|
||||||
|
# 按照UTF-8编码打开文件(确保能够正确读取中文)
|
||||||
|
with open(file_path, mode='r', encoding='utf-8') as file:
|
||||||
|
# 读取所有行并存储到一个列表中
|
||||||
|
lines = file.read().splitlines()
|
||||||
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def get_filename_without_extension(file_path):
|
||||||
|
"""
|
||||||
|
Given a file path string, returns the file name without its extension.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
file_path (str): The full path to the file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The file name without its extension.
|
||||||
|
"""
|
||||||
|
base_name = os.path.basename(file_path) # Get the base name (file name with extension)
|
||||||
|
file_name, file_extension = os.path.splitext(base_name) # Split the base name into file name and extension
|
||||||
|
return file_name # Return the file name without extension
|
||||||
|
|
||||||
|
|
||||||
|
def read_file(file_path):
|
||||||
|
# 使用with语句打开并读取文件
|
||||||
|
with open(file_path, 'r', encoding='utf-8') as file: # 'r' 表示以读取模式打开文件
|
||||||
|
# 一次性读取文件所有内容
|
||||||
|
file_content = file.read()
|
||||||
|
|
||||||
|
# 文件在with语句结束时会自动关闭
|
||||||
|
# 现在file_content变量中存储了文件的所有文本内容
|
||||||
|
return file_content
|
||||||
|
|
||||||
|
|
||||||
|
def write_text_to_file(text, output_file_path):
|
||||||
|
try:
|
||||||
|
with open(output_file_path, 'w', encoding='utf-8') as file:
|
||||||
|
file.write(text)
|
||||||
|
except IOError as e:
|
||||||
|
print(f"Error occurred while writing to the file: {e}")
|
||||||
|
else:
|
||||||
|
print(f"Text successfully written to file: {output_file_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_path_existence_and_return(path):
|
||||||
|
"""
|
||||||
|
检查给定路径(文件或目录)是否存在。如果存在,返回该路径;否则,返回空字符串。
|
||||||
|
:param path: 待检查的文件或目录路径(字符串)
|
||||||
|
:return: 如果路径存在,返回原路径;否则,返回空字符串
|
||||||
|
"""
|
||||||
|
if os.path.exists(path):
|
||||||
|
return path
|
||||||
|
else:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def open_file(filepath):
|
||||||
|
if sys.platform.startswith('darwin'):
|
||||||
|
subprocess.run(['open', filepath]) # macOS
|
||||||
|
elif os.name == 'nt': # For Windows
|
||||||
|
os.startfile(filepath)
|
||||||
|
elif os.name == 'posix': # For Linux, Unix, etc.
|
||||||
|
subprocess.run(['xdg-open', filepath])
|
||||||
|
|
||||||
|
|
||||||
|
def start_new_service(script_path):
|
||||||
|
# 对于Windows系统
|
||||||
|
if sys.platform.startswith('win'):
|
||||||
|
cmd = f'start cmd /k {python_exec} {script_path}'
|
||||||
|
# 对于Mac或者Linux系统
|
||||||
|
else:
|
||||||
|
cmd = f'xterm -e {python_exec} {script_path}'
|
||||||
|
|
||||||
|
proc = subprocess.Popen(cmd, shell=True)
|
||||||
|
|
||||||
|
# 关闭之前启动的子进程
|
||||||
|
# proc.terminate()
|
||||||
|
|
||||||
|
# 或者如果需要强制关闭可以使用
|
||||||
|
# proc.kill()
|
||||||
|
|
||||||
|
return proc
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
dir = r'C:\Users\Administrator\Desktop/test'
|
||||||
|
dir2 = r'"C:\Users\Administrator\Desktop\test2"'
|
||||||
|
dir, dir2 = batch_clean_paths([dir, dir2])
|
||||||
|
print(dir, dir2)
|
46
Ref_Audio_Selector/common/model_manager.py
Normal file
46
Ref_Audio_Selector/common/model_manager.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
|
||||||
|
pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
|
||||||
|
SoVITS_weight_root = "SoVITS_weights"
|
||||||
|
GPT_weight_root = "GPT_weights"
|
||||||
|
os.makedirs(SoVITS_weight_root, exist_ok=True)
|
||||||
|
os.makedirs(GPT_weight_root, exist_ok=True)
|
||||||
|
|
||||||
|
speaker_verification_models = {
|
||||||
|
'speech_campplus_sv_zh-cn_16k-common': {
|
||||||
|
'task': 'speaker-verification',
|
||||||
|
'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_campplus_sv_zh-cn_16k-common',
|
||||||
|
'model_revision': 'v1.0.0'
|
||||||
|
},
|
||||||
|
'speech_eres2net_sv_zh-cn_16k-common': {
|
||||||
|
'task': 'speaker-verification',
|
||||||
|
'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_eres2net_sv_zh-cn_16k-common',
|
||||||
|
'model_revision': 'v1.0.5'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
def custom_sort_key(s):
|
||||||
|
# 使用正则表达式提取字符串中的数字部分和非数字部分
|
||||||
|
parts = re.split('(\d+)', s)
|
||||||
|
# 将数字部分转换为整数,非数字部分保持不变
|
||||||
|
parts = [int(part) if part.isdigit() else part for part in parts]
|
||||||
|
return parts
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpt_model_names():
|
||||||
|
gpt_names = [pretrained_gpt_name]
|
||||||
|
for name in os.listdir(GPT_weight_root):
|
||||||
|
if name.endswith(".ckpt"): gpt_names.append("%s/%s" % (GPT_weight_root, name))
|
||||||
|
sorted(gpt_names, key=custom_sort_key)
|
||||||
|
return gpt_names
|
||||||
|
|
||||||
|
|
||||||
|
def get_sovits_model_names():
|
||||||
|
sovits_names = [pretrained_sovits_name]
|
||||||
|
for name in os.listdir(SoVITS_weight_root):
|
||||||
|
if name.endswith(".pth"): sovits_names.append("%s/%s" % (SoVITS_weight_root, name))
|
||||||
|
sorted(sovits_names, key=custom_sort_key)
|
||||||
|
return sovits_names
|
||||||
|
|
72
Ref_Audio_Selector/common/time_util.py
Normal file
72
Ref_Audio_Selector/common/time_util.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import time
|
||||||
|
import os
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import p_logger
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
|
||||||
|
|
||||||
|
def timeit_decorator(func):
|
||||||
|
"""
|
||||||
|
装饰器,用于计算被装饰函数的执行时间。
|
||||||
|
|
||||||
|
参数:
|
||||||
|
func (function): 要计时的函数。
|
||||||
|
|
||||||
|
返回:
|
||||||
|
function: 包含计时功能的新函数。
|
||||||
|
"""
|
||||||
|
|
||||||
|
def wrapper(*args, **kwargs):
|
||||||
|
if params.time_log_print_type != 'file':
|
||||||
|
return func(*args, **kwargs)
|
||||||
|
|
||||||
|
start_time = time.perf_counter() # 使用 perf_counter 获取高精度计时起点
|
||||||
|
|
||||||
|
func_result = func(*args, **kwargs) # 执行原函数
|
||||||
|
|
||||||
|
end_time = time.perf_counter() # 获取计时终点
|
||||||
|
elapsed_time = end_time - start_time # 计算执行耗时
|
||||||
|
|
||||||
|
# 记录日志内容
|
||||||
|
log_message = f"进程ID: {os.getpid()}, {func.__name__} 执行耗时: {elapsed_time:.6f} 秒"
|
||||||
|
p_logger.info(log_message)
|
||||||
|
|
||||||
|
return func_result
|
||||||
|
|
||||||
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def time_monitor(func):
|
||||||
|
"""
|
||||||
|
返回结果,追加时间
|
||||||
|
"""
|
||||||
|
|
||||||
|
def wrapper(*args, **kwargs):
|
||||||
|
|
||||||
|
start_time = time.perf_counter() # 使用 perf_counter 获取高精度计时起点
|
||||||
|
|
||||||
|
func_result = func(*args, **kwargs) # 执行原函数
|
||||||
|
|
||||||
|
end_time = time.perf_counter() # 获取计时终点
|
||||||
|
elapsed_time = end_time - start_time # 计算执行耗时
|
||||||
|
|
||||||
|
return elapsed_time, func_result
|
||||||
|
|
||||||
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
# 使用装饰器
|
||||||
|
@timeit_decorator
|
||||||
|
def example_function(n):
|
||||||
|
time.sleep(n) # 假设这是需要计时的函数,这里模拟耗时操作
|
||||||
|
return n * 2
|
||||||
|
|
||||||
|
|
||||||
|
def example_function2(n):
|
||||||
|
time.sleep(n) # 假设这是需要计时的函数,这里模拟耗时操作
|
||||||
|
return n * 2
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# 调用经过装饰的函数
|
||||||
|
# result = example_function(2)
|
||||||
|
print(time_monitor(example_function2)(2))
|
57
Ref_Audio_Selector/config.ini
Normal file
57
Ref_Audio_Selector/config.ini
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# config.ini
|
||||||
|
|
||||||
|
[Base]
|
||||||
|
# 服务端口号
|
||||||
|
server_port = 9423
|
||||||
|
# 参考音频目录
|
||||||
|
reference_audio_dir = refer_audio
|
||||||
|
# 临时文件目录
|
||||||
|
temp_dir = Ref_Audio_Selector/temp
|
||||||
|
|
||||||
|
[Log]
|
||||||
|
# 日志保存目录路径
|
||||||
|
log_dir = Ref_Audio_Selector/log/general
|
||||||
|
# 日志级别 CRITICAL、FATAL、ERROR、WARNING、WARN、INFO、DEBUG、NOTSET、
|
||||||
|
log_level = INFO
|
||||||
|
# 函数时间消耗日志打印类型 file 打印到文件; close 关闭
|
||||||
|
time_log_print_type = file
|
||||||
|
# 函数时间消耗日志保存目录路径
|
||||||
|
time_log_print_dir = Ref_Audio_Selector/log/performance
|
||||||
|
|
||||||
|
[AudioSample]
|
||||||
|
# list转换待选参考音频目录
|
||||||
|
list_to_convert_reference_audio_dir = refer_audio_all
|
||||||
|
# 音频相似度目录
|
||||||
|
audio_similarity_dir = similarity
|
||||||
|
# 是否开启基准音频预采样 true false
|
||||||
|
enable_pre_sample = true
|
||||||
|
|
||||||
|
[Inference]
|
||||||
|
# 默认测试文本位置
|
||||||
|
default_test_text_path = Ref_Audio_Selector/file/test_content/test_content.txt
|
||||||
|
# 推理音频目录
|
||||||
|
inference_audio_dir = inference_audio
|
||||||
|
# 推理音频文本聚合目录
|
||||||
|
inference_audio_text_aggregation_dir = text
|
||||||
|
# 推理音频情绪聚合目录
|
||||||
|
inference_audio_emotion_aggregation_dir = emotion
|
||||||
|
|
||||||
|
[ResultCheck]
|
||||||
|
# asr输出文件
|
||||||
|
asr_filename = asr
|
||||||
|
# 文本相似度输出目录
|
||||||
|
text_similarity_output_dir = text_similarity
|
||||||
|
# 文本情绪平均相似度报告文件名
|
||||||
|
text_emotion_average_similarity_report_filename = average_similarity
|
||||||
|
# 文本相似度按情绪聚合明细文件名
|
||||||
|
text_similarity_by_emotion_detail_filename = emotion_group_detail
|
||||||
|
# 文本相似度按文本聚合明细文件名
|
||||||
|
text_similarity_by_text_detail_filename = text_group_detail
|
||||||
|
|
||||||
|
[AudioConfig]
|
||||||
|
# 默认模板文件位置
|
||||||
|
default_template_path = Ref_Audio_Selector/file/config_template/ref_audio_template.txt
|
||||||
|
# 参考音频配置文件名
|
||||||
|
reference_audio_config_filename = refer_audio
|
||||||
|
|
||||||
|
[Other]
|
0
Ref_Audio_Selector/config_param/__init__.py
Normal file
0
Ref_Audio_Selector/config_param/__init__.py
Normal file
111
Ref_Audio_Selector/config_param/config_manager.py
Normal file
111
Ref_Audio_Selector/config_param/config_manager.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
import configparser
|
||||||
|
import os
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
|
||||||
|
|
||||||
|
class ParamReadWriteManager:
|
||||||
|
def __init__(self):
|
||||||
|
self.base_dir = 'Ref_Audio_Selector/file/base_info'
|
||||||
|
os.makedirs(self.base_dir, exist_ok=True)
|
||||||
|
# 基础信息
|
||||||
|
self.work_dir = 'work_dir'
|
||||||
|
self.role = 'role'
|
||||||
|
# 第一步
|
||||||
|
self.subsection_num = 'subsection_num'
|
||||||
|
self.sample_num = 'sample_num'
|
||||||
|
# 第二步
|
||||||
|
self.api_set_model_base_url = 'api_set_model_base_url'
|
||||||
|
self.api_gpt_param = 'api_gpt_param'
|
||||||
|
self.api_sovits_param = 'api_sovits_param'
|
||||||
|
|
||||||
|
self.api_v2_set_gpt_model_base_url = 'api_v2_set_gpt_model_base_url'
|
||||||
|
self.api_v2_gpt_model_param = 'api_v2_gpt_model_param'
|
||||||
|
self.api_v2_set_sovits_model_base_url = 'api_v2_set_sovits_model_base_url'
|
||||||
|
self.api_v2_sovits_model_param = 'api_v2_sovits_model_param'
|
||||||
|
|
||||||
|
self.text_url = 'text_url'
|
||||||
|
self.text_param = 'text_param'
|
||||||
|
self.refer_type_param = 'refer_type_param'
|
||||||
|
self.ref_path_param = 'ref_path_param'
|
||||||
|
self.ref_text_param = 'ref_text_param'
|
||||||
|
self.emotion_param = 'emotion_param'
|
||||||
|
|
||||||
|
self.test_content_path = 'test_content_path'
|
||||||
|
self.request_concurrency_num = 'request_concurrency_num'
|
||||||
|
|
||||||
|
# 第三步
|
||||||
|
self.text_similarity_amplification_boundary = 'text_similarity_amplification_boundary'
|
||||||
|
# 第四步
|
||||||
|
# 第五步
|
||||||
|
self.text_template = 'text_template'
|
||||||
|
|
||||||
|
def read(self, key):
|
||||||
|
file_path = os.path.join(self.base_dir, key + '.txt')
|
||||||
|
if os.path.exists(file_path):
|
||||||
|
content = common.read_file(file_path)
|
||||||
|
return content.strip()
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def write(self, key, content):
|
||||||
|
file_path = os.path.join(self.base_dir, key + '.txt')
|
||||||
|
|
||||||
|
# 确保内容是字符串类型,如果不是,转换为字符串
|
||||||
|
if not isinstance(content, str):
|
||||||
|
clean_content = str(content).strip() # 转换为字符串并移除首尾空白
|
||||||
|
else:
|
||||||
|
clean_content = content.strip()
|
||||||
|
|
||||||
|
common.write_text_to_file(clean_content, file_path)
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigManager:
|
||||||
|
def __init__(self):
|
||||||
|
self.config_path = 'Ref_Audio_Selector/config.ini'
|
||||||
|
self.config = configparser.ConfigParser()
|
||||||
|
self.config.read(self.config_path, encoding='utf-8')
|
||||||
|
|
||||||
|
def get_base(self, key):
|
||||||
|
return self.config.get('Base', key)
|
||||||
|
|
||||||
|
def get_log(self, key):
|
||||||
|
return self.config.get('Log', key)
|
||||||
|
|
||||||
|
def get_audio_sample(self, key):
|
||||||
|
return self.config.get('AudioSample', key)
|
||||||
|
|
||||||
|
def get_inference(self, key):
|
||||||
|
return self.config.get('Inference', key)
|
||||||
|
|
||||||
|
def get_result_check(self, key):
|
||||||
|
return self.config.get('ResultCheck', key)
|
||||||
|
|
||||||
|
def get_audio_config(self, key):
|
||||||
|
return self.config.get('AudioConfig', key)
|
||||||
|
|
||||||
|
def get_other(self, key):
|
||||||
|
return self.config.get('Other', key)
|
||||||
|
|
||||||
|
def print(self):
|
||||||
|
# 打印所有配置
|
||||||
|
for section in self.config.sections():
|
||||||
|
print('[{}]'.format(section))
|
||||||
|
for key in self.config[section]:
|
||||||
|
print('{} = {}'.format(key, self.config[section][key]))
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
_config = ConfigManager()
|
||||||
|
_param_read_write_manager = ParamReadWriteManager()
|
||||||
|
|
||||||
|
|
||||||
|
def get_config():
|
||||||
|
return _config
|
||||||
|
|
||||||
|
|
||||||
|
def get_rw_param():
|
||||||
|
return _param_read_write_manager
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print(_config.print())
|
58
Ref_Audio_Selector/config_param/config_params.py
Normal file
58
Ref_Audio_Selector/config_param/config_params.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import Ref_Audio_Selector.config_param.config_manager as config_manager
|
||||||
|
|
||||||
|
config = config_manager.get_config()
|
||||||
|
|
||||||
|
# [Base]
|
||||||
|
# 服务端口号
|
||||||
|
server_port = int(config.get_base('server_port'))
|
||||||
|
# 参考音频目录
|
||||||
|
reference_audio_dir = config.get_base('reference_audio_dir')
|
||||||
|
# 临时文件目录
|
||||||
|
temp_dir = config.get_base('temp_dir')
|
||||||
|
|
||||||
|
# [Log]
|
||||||
|
# 日志保存目录路径
|
||||||
|
log_dir = config.get_log('log_dir')
|
||||||
|
# 日志级别 CRITICAL、FATAL、ERROR、WARNING、WARN、INFO、DEBUG、NOTSET、
|
||||||
|
log_level = config.get_log('log_level')
|
||||||
|
# 函数时间消耗日志打印类型 file 打印到文件; close 关闭
|
||||||
|
time_log_print_type = config.get_log('time_log_print_type')
|
||||||
|
# 函数时间消耗日志保存目录路径
|
||||||
|
time_log_print_dir = config.get_log('time_log_print_dir')
|
||||||
|
|
||||||
|
# [AudioSample]
|
||||||
|
# list转换待选参考音频目录
|
||||||
|
list_to_convert_reference_audio_dir = config.get_audio_sample('list_to_convert_reference_audio_dir')
|
||||||
|
# 音频相似度目录
|
||||||
|
audio_similarity_dir = config.get_audio_sample('audio_similarity_dir')
|
||||||
|
# 是否开启基准音频预采样 true false
|
||||||
|
enable_pre_sample = config.get_audio_sample('enable_pre_sample')
|
||||||
|
|
||||||
|
# [Inference]
|
||||||
|
# 默认测试文本位置
|
||||||
|
default_test_text_path = config.get_inference('default_test_text_path')
|
||||||
|
# 推理音频目录
|
||||||
|
inference_audio_dir = config.get_inference('inference_audio_dir')
|
||||||
|
# 推理音频文本聚合目录
|
||||||
|
inference_audio_text_aggregation_dir = config.get_inference('inference_audio_text_aggregation_dir')
|
||||||
|
# 推理音频情绪聚合目录
|
||||||
|
inference_audio_emotion_aggregation_dir = config.get_inference('inference_audio_emotion_aggregation_dir')
|
||||||
|
|
||||||
|
# [ResultCheck]
|
||||||
|
# asr输出文件
|
||||||
|
asr_filename = config.get_result_check('asr_filename')
|
||||||
|
# 文本相似度输出目录
|
||||||
|
text_similarity_output_dir = config.get_result_check('text_similarity_output_dir')
|
||||||
|
# 文本情绪平均相似度报告文件名
|
||||||
|
text_emotion_average_similarity_report_filename = config.get_result_check('text_emotion_average_similarity_report_filename')
|
||||||
|
# 文本相似度按情绪聚合明细文件名
|
||||||
|
text_similarity_by_emotion_detail_filename = config.get_result_check('text_similarity_by_emotion_detail_filename')
|
||||||
|
# 文本相似度按文本聚合明细文件名
|
||||||
|
text_similarity_by_text_detail_filename = config.get_result_check('text_similarity_by_text_detail_filename')
|
||||||
|
|
||||||
|
# [AudioConfig]
|
||||||
|
# 默认模板文件位置
|
||||||
|
default_template_path = config.get_audio_config('default_template_path')
|
||||||
|
# 参考音频配置文件名
|
||||||
|
reference_audio_config_filename = config.get_audio_config('reference_audio_config_filename')
|
||||||
|
|
65
Ref_Audio_Selector/config_param/log_config.py
Normal file
65
Ref_Audio_Selector/config_param/log_config.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
|
||||||
|
|
||||||
|
def create_general_logger():
|
||||||
|
# 获取当前日期,用于文件名和日志内容
|
||||||
|
current_date = datetime.datetime.now().strftime('%Y-%m-%d')
|
||||||
|
|
||||||
|
# 创建一个用于控制台输出的处理器,并设置日志级别
|
||||||
|
console_handler = logging.StreamHandler()
|
||||||
|
# console_handler.setLevel(logging.INFO)
|
||||||
|
# 可以设置控制台输出的格式
|
||||||
|
console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
console_handler.setFormatter(console_formatter)
|
||||||
|
console_handler.encoding = 'utf-8' # 设置字符编码为utf-8
|
||||||
|
|
||||||
|
os.makedirs(params.log_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# 创建一个用于常规日志的处理器
|
||||||
|
general_handler = logging.FileHandler(f"{params.log_dir}/{current_date}.log", mode='a', encoding='utf-8')
|
||||||
|
# general_handler.setLevel(logging.INFO)
|
||||||
|
general_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
general_handler.setFormatter(general_formatter)
|
||||||
|
|
||||||
|
# 配置一个常规的logger
|
||||||
|
general_logger = logging.getLogger('general')
|
||||||
|
level = logging.getLevelName(params.log_level)
|
||||||
|
general_logger.setLevel(level)
|
||||||
|
general_logger.addHandler(console_handler)
|
||||||
|
general_logger.addHandler(general_handler)
|
||||||
|
|
||||||
|
# 配置根logger,以防万一
|
||||||
|
logging.basicConfig(level=logging.WARNING, handlers=[general_handler])
|
||||||
|
|
||||||
|
return general_logger
|
||||||
|
|
||||||
|
|
||||||
|
def create_performance_logger():
|
||||||
|
# 获取当前日期,用于文件名和日志内容
|
||||||
|
current_date = datetime.datetime.now().strftime('%Y-%m-%d')
|
||||||
|
|
||||||
|
os.makedirs(params.time_log_print_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# 创建一个专用于性能监控日志的处理器
|
||||||
|
performance_handler = logging.FileHandler(
|
||||||
|
f"{params.time_log_print_dir}/{current_date}.log", mode='a', encoding='utf-8')
|
||||||
|
# performance_handler.setLevel(logging.INFO)
|
||||||
|
performance_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
performance_handler.setFormatter(performance_formatter)
|
||||||
|
|
||||||
|
# 配置一个专门用于性能监控的logger
|
||||||
|
performance_logger = logging.getLogger('performance')
|
||||||
|
performance_logger.setLevel(logging.INFO)
|
||||||
|
performance_logger.addHandler(performance_handler)
|
||||||
|
|
||||||
|
return performance_logger
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging():
|
||||||
|
return create_general_logger(), create_performance_logger()
|
||||||
|
|
||||||
|
|
||||||
|
logger, p_logger = setup_logging()
|
0
Ref_Audio_Selector/file/base_info/role.txt
Normal file
0
Ref_Audio_Selector/file/base_info/role.txt
Normal file
0
Ref_Audio_Selector/file/base_info/work_dir.txt
Normal file
0
Ref_Audio_Selector/file/base_info/work_dir.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
"${emotion}": {
|
||||||
|
"ref_wav_path": "${ref_path}",
|
||||||
|
"prompt_text": "${ref_text}",
|
||||||
|
"prompt_language": "中文"
|
||||||
|
}
|
4
Ref_Audio_Selector/file/test_content/test_content.txt
Normal file
4
Ref_Audio_Selector/file/test_content/test_content.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
也是只有一次。”白蓉简单地回答,然后迅速转移话锋,搂住罗辑的脖子说,“算了,我不要那生日礼物了,你也回到正常的生活中来,好吗?”
|
||||||
|
云天明看到那是一条丑陋的虫子,软乎乎湿漉漉的,在她白皙的手指间蠕动着,旁边一个女生尖叫道:恶心死了,你碰它干吗?!程心把虫子轻轻放到旁边的草丛中,说,它在这里会给踩死的。
|
||||||
|
“那么多的星星,像雾似的。”云天明感叹道。程心把目光从银河收回,转头看着他,指着下面的校园和城市说:“你看下面也很漂亮啊,我们的生活是在这儿,可不是在那么远的银河里。”
|
||||||
|
“可我们的专业,不就是为了到地球之外去吗?”“那是为了这里的生活更好,可不是为了逃离地球啊。”云天明当然知道程心的话是委婉地指向他的孤僻和自闭,他也只有默然以对。
|
1066
Ref_Audio_Selector/ref_audio_selector_webui.py
Normal file
1066
Ref_Audio_Selector/ref_audio_selector_webui.py
Normal file
File diff suppressed because it is too large
Load Diff
5
Ref_Audio_Selector/start_ref_audio_selector_webui.bat
Normal file
5
Ref_Audio_Selector/start_ref_audio_selector_webui.bat
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
CHCP 65001
|
||||||
|
@echo off
|
||||||
|
cd ../
|
||||||
|
runtime\python.exe ./Ref_Audio_Selector/ref_audio_selector_webui.py
|
||||||
|
pause
|
0
Ref_Audio_Selector/tool/__init__.py
Normal file
0
Ref_Audio_Selector/tool/__init__.py
Normal file
0
Ref_Audio_Selector/tool/asr/__init__.py
Normal file
0
Ref_Audio_Selector/tool/asr/__init__.py
Normal file
120
Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py
Normal file
120
Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py
Normal file
@ -0,0 +1,120 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
|
||||||
|
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
|
||||||
|
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from tools.asr.config import check_fw_local_models
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
language_code_list = [
|
||||||
|
"af", "am", "ar", "as", "az",
|
||||||
|
"ba", "be", "bg", "bn", "bo",
|
||||||
|
"br", "bs", "ca", "cs", "cy",
|
||||||
|
"da", "de", "el", "en", "es",
|
||||||
|
"et", "eu", "fa", "fi", "fo",
|
||||||
|
"fr", "gl", "gu", "ha", "haw",
|
||||||
|
"he", "hi", "hr", "ht", "hu",
|
||||||
|
"hy", "id", "is", "it", "ja",
|
||||||
|
"jw", "ka", "kk", "km", "kn",
|
||||||
|
"ko", "la", "lb", "ln", "lo",
|
||||||
|
"lt", "lv", "mg", "mi", "mk",
|
||||||
|
"ml", "mn", "mr", "ms", "mt",
|
||||||
|
"my", "ne", "nl", "nn", "no",
|
||||||
|
"oc", "pa", "pl", "ps", "pt",
|
||||||
|
"ro", "ru", "sa", "sd", "si",
|
||||||
|
"sk", "sl", "sn", "so", "sq",
|
||||||
|
"sr", "su", "sv", "sw", "ta",
|
||||||
|
"te", "tg", "th", "tk", "tl",
|
||||||
|
"tr", "tt", "uk", "ur", "uz",
|
||||||
|
"vi", "yi", "yo", "zh", "yue",
|
||||||
|
"auto"]
|
||||||
|
|
||||||
|
|
||||||
|
def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language, precision):
    """Recursively transcribe every .wav under ``input_folder`` with faster-whisper.

    The name of each wav's parent directory is treated as its reference text.
    Results are written as ``path|dir_text|LANG|asr_text`` lines to
    ``<output_folder>/<params.asr_filename>.list``.

    Returns the absolute path of the list file, or None when the model cannot
    be loaded or a transcription error aborts the run.
    """
    if '-local' in model_size:
        # Strip the "-local" suffix and point at the locally downloaded model.
        model_size = model_size[:-6]
        model_path = f'tools/asr/models/faster-whisper-{model_size}'
    else:
        model_path = model_size
    if language == 'auto':
        language = None  # leave unset so the model picks the most probable language
    # BUG FIX: logging uses lazy %-style formatting; the original message had no
    # placeholders, so the extra args raised a formatting error inside logging
    # and the model info was never written. Use %s placeholders instead.
    logger.info("loading faster whisper model: %s %s", model_size, model_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model = WhisperModel(model_path, device=device, compute_type=precision)
    except Exception:  # narrowed from bare except
        logger.error(traceback.format_exc())
        return None

    output = []

    # Walk the whole directory tree; each subdirectory name is the reference
    # text for the wavs it contains.
    for root, dirs, files in os.walk(input_folder):
        for file_name in sorted(files):
            if not file_name.endswith(".wav"):
                continue
            try:
                file_path = os.path.join(root, file_name)
                original_text = os.path.basename(root)
                segments, info = model.transcribe(
                    audio=file_path,
                    beam_size=5,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=700),
                    language=language)
                text = ''

                if info.language == "zh":
                    logger.info("检测为中文文本, 转 FunASR 处理")
                    if "only_asr" not in globals():
                        # Lazy import so the FunASR models are only downloaded
                        # when Chinese audio is actually encountered.
                        from Ref_Audio_Selector.tool.asr.funasr_asr_multi_level_dir import \
                            only_asr
                    text = only_asr(file_path)

                if text == '':
                    for segment in segments:
                        text += segment.text
                output.append(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
                print(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
            except Exception:  # narrowed from bare except
                # Abort the whole run on a transcription error (original behavior).
                logger.error(traceback.format_exc())
                return None

    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.abspath(f'{output_folder}/{params.asr_filename}.list')

    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
    logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # CLI entry point: transcribe a directory tree with faster-whisper.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_folder", type=str, required=True,
                        help="Path to the folder containing WAV files.")
    parser.add_argument("-o", "--output_folder", type=str, required=True,
                        help="Output folder to store transcriptions.")
    # Model choices include locally downloaded "-local" variants.
    parser.add_argument("-s", "--model_size", type=str, default='large-v3',
                        choices=check_fw_local_models(),
                        help="Model Size of Faster Whisper")
    parser.add_argument("-l", "--language", type=str, default='ja',
                        choices=language_code_list,
                        help="Language of the audio files.")
    parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
                        help="fp16 or fp32")

    cmd = parser.parse_args()
    output_file_path = execute_asr_multi_level_dir(
        input_folder=cmd.input_folder,
        output_folder=cmd.output_folder,
        model_size=cmd.model_size,
        language=cmd.language,
        precision=cmd.precision,
    )
|
94
Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
Normal file
94
Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
# -*- coding:utf-8 -*-
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import traceback
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
from Ref_Audio_Selector.common.time_util import timeit_decorator
|
||||||
|
from tqdm import tqdm
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
# Local model checkpoints shipped with the repo; fall back to the ModelScope
# hub ids ("iic/...") when they have not been downloaded locally.
path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
path_asr = path_asr if os.path.exists(
    path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"

# Paraformer ASR + FSMN VAD + punctuation restoration, loaded once at import
# time and shared by every call in this module.
model = AutoModel(
    model=path_asr,
    model_revision="v2.0.4",
    vad_model=path_vad,
    vad_model_revision="v2.0.4",
    punc_model=path_punc,
    punc_model_revision="v2.0.4",
)
|
||||||
|
|
||||||
|
|
||||||
|
def only_asr(input_file):
    """Transcribe a single audio file with the module-level FunASR model.

    Returns the recognized text, or '' on any failure; errors are logged and
    swallowed so batch processing can continue.
    """
    try:
        text = model.generate(input=input_file)[0]["text"]
    except Exception:  # narrowed from bare except: keep Ctrl-C working
        text = ''
        logger.error(traceback.format_exc())
    return text
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language):
    """Recursively transcribe every .wav under ``input_folder`` with FunASR.

    The name of each wav's parent directory is treated as its reference text.
    Writes ``path|dir_text|LANG|asr_text`` lines to
    ``<output_folder>/<params.asr_filename>.list`` and returns that path.

    ``model_size`` is accepted for CLI symmetry but unused here (the module
    loads a fixed model at import time).
    """
    output = []
    # Walk the whole directory tree; each subdirectory name is the reference
    # text for the wavs it contains.
    for root, dirs, files in os.walk(input_folder):
        for name in sorted(files):
            if not name.endswith(".wav"):
                continue
            try:
                original_text = os.path.basename(root)
                # Normalize possible mixed slashes in the joined path.
                input_file_path = os.path.normpath(os.path.join(root, name))
                asr_text = model.generate(input=input_file_path)[0]["text"]
                output.append(f"{input_file_path}|{original_text}|{language.upper()}|{asr_text}")
            except Exception:  # narrowed from bare except; skip the bad file
                logger.error(traceback.format_exc())

    # Create the output directory and write the aggregated list file.
    output_dir_abs = os.path.abspath(output_folder)
    os.makedirs(output_dir_abs, exist_ok=True)
    output_file_path = os.path.join(output_dir_abs, f'{params.asr_filename}.list')
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
    logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")

    return output_file_path
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # CLI entry point: transcribe a directory tree with FunASR (Chinese only).
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_folder", type=str, required=True,
                        help="Path to the folder containing WAV files.")
    parser.add_argument("-o", "--output_folder", type=str, required=True,
                        help="Output folder to store transcriptions.")
    parser.add_argument("-s", "--model_size", type=str, default='large',
                        help="Model Size of FunASR is Large")
    parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'],
                        help="Language of the audio files.")
    parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
                        help="fp16 or fp32")  # not wired up yet

    cmd = parser.parse_args()
    execute_asr_multi_level_dir(
        input_folder=cmd.input_folder,
        output_folder=cmd.output_folder,
        model_size=cmd.model_size,
        language=cmd.language,
    )
|
54
Ref_Audio_Selector/tool/audio_check.py
Normal file
54
Ref_Audio_Selector/tool/audio_check.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def remove_matching_audio_files_in_text_dir(text_dir, emotions_list):
    """Delete every .wav under ``text_dir`` whose stem is not a known emotion.

    ``emotions_list`` is a list of dicts carrying an 'emotion' key.  Deletion
    failures are logged and skipped.  Returns the number of files removed.
    """
    count = 0
    # Set membership is O(1); the original list scan was O(n) per file.
    emotions = {item['emotion'] for item in emotions_list}
    for root, dirs, files in os.walk(text_dir):
        for file in files:
            if not file.endswith(".wav"):
                continue
            emotion_tag = os.path.basename(file)[:-4]  # strip ".wav"
            if emotion_tag in emotions:
                continue
            file_path = os.path.join(root, file)
            logger.info(f"Deleting file: {file_path}")
            try:
                os.remove(file_path)
                count += 1
            except Exception as e:
                logger.error(f"Error deleting file {file_path}: {e}")

    return count
|
||||||
|
|
||||||
|
|
||||||
|
def delete_emotion_subdirectories(emotion_dir, emotions_list):
    """Delete every immediate subdirectory of ``emotion_dir`` whose name is
    not a known emotion.

    ``emotions_list`` is a list of dicts carrying an 'emotion' key.  Deletion
    failures are logged and skipped.  Returns the number of directories removed.
    """
    count = 0
    # Set membership is O(1); the original list scan was O(n) per entry.
    emotions = {item['emotion'] for item in emotions_list}

    for entry in os.listdir(emotion_dir):
        entry_path = os.path.join(emotion_dir, entry)
        if not os.path.isdir(entry_path):
            continue
        if entry in emotions:
            continue
        logger.info(f"Deleting directory: {entry_path}")
        try:
            # shutil.rmtree removes the whole subdirectory tree.
            shutil.rmtree(entry_path)
            count += 1
        except Exception as e:
            logger.error(f"Error deleting directory {entry_path}: {e}")

    return count
|
||||||
|
|
||||||
|
|
||||||
|
def sync_ref_audio(ref_audio_dir, inference_audio_dir):
    """Prune inference outputs that no longer correspond to a reference audio.

    Returns a (deleted_text_wav_count, deleted_emotion_dir_count) tuple.
    """
    manager = common.RefAudioListManager(ref_audio_dir)
    ref_list = manager.get_ref_audio_list()

    # The two aggregation views of the same inference results.
    text_dir = os.path.join(inference_audio_dir, params.inference_audio_text_aggregation_dir)
    emotion_dir = os.path.join(inference_audio_dir, params.inference_audio_emotion_aggregation_dir)

    deleted_wavs = remove_matching_audio_files_in_text_dir(text_dir, ref_list)
    deleted_dirs = delete_emotion_subdirectories(emotion_dir, ref_list)
    return deleted_wavs, deleted_dirs
|
31
Ref_Audio_Selector/tool/audio_config.py
Normal file
31
Ref_Audio_Selector/tool/audio_config.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import os
|
||||||
|
import platform
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio_config(work_space_dir, template_str, audio_list, output_file_path):
    """Render one template line per reference audio and write them to a file.

    Each entry of ``audio_list`` (dicts with 'emotion', 'ref_path', 'ref_text')
    is substituted into ``template_str`` via its ``${...}`` placeholders; the
    ref path is made relative to ``work_space_dir`` (forward slashes on
    Windows).  Lines are joined with ",\\n" and written to ``output_file_path``.
    """
    rendered_lines = []

    for audio_info in audio_list:
        rel_path = os.path.relpath(audio_info['ref_path'], work_space_dir)
        # Keep config paths portable: forward slashes even on Windows.
        if platform.system() == 'Windows':
            rel_path = rel_path.replace('\\', '/')

        line = (template_str
                .replace('${emotion}', audio_info['emotion'])
                .replace('${ref_path}', rel_path)
                .replace('${ref_text}', audio_info['ref_text']))
        rendered_lines.append(line)

    # ",\n".join gives the same result as appending ",\n" per line and
    # trimming the trailing separator.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(",\n".join(rendered_lines))
|
238
Ref_Audio_Selector/tool/audio_inference.py
Normal file
238
Ref_Audio_Selector/tool/audio_inference.py
Normal file
@ -0,0 +1,238 @@
|
|||||||
|
import time
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import itertools
|
||||||
|
import multiprocessing
|
||||||
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
|
import numpy as np
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, quote
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger, p_logger
|
||||||
|
|
||||||
|
|
||||||
|
class SetModelURLComposer:
    """Builds the request used to switch GPT / SoVITS weights on the API.

    ``type`` selects which model parameters the request carries: 'gpt',
    'sovits', or 'all' (both).
    """

    def __init__(self, type, base_url, gpt_param_name, sovits_param_name):
        self.type = type
        self.base_url = base_url
        self.gpt_param_name = gpt_param_name
        self.sovits_param_name = sovits_param_name

    def is_valid(self):
        """Raise when a field required by ``self.type`` is missing."""
        if self.base_url is None or self.base_url == '':
            raise Exception("请求地址不能为空")
        needs_gpt = self.type in ['gpt', 'all']
        if needs_gpt and (self.gpt_param_name is None or self.gpt_param_name == ''):
            raise Exception("GPT参数名不能为空")
        needs_sovits = self.type in ['sovits', 'all']
        if needs_sovits and (self.sovits_param_name is None or self.sovits_param_name == ''):
            raise Exception("Sovits参数名不能为空")

    def _build_query(self, value_array):
        # Map the positional model values onto the configured parameter names.
        # (Named "query" to avoid shadowing the module-level params import.)
        query = {}
        if self.type == 'gpt':
            query[self.gpt_param_name] = value_array[0]
        if self.type == 'sovits':
            query[self.sovits_param_name] = value_array[0]
        if self.type == 'all':
            query[self.gpt_param_name] = value_array[0]
            query[self.sovits_param_name] = value_array[1]
        return query

    def build_get_url(self, value_array, need_url_encode=True):
        """Return a GET URL carrying the model values as query parameters."""
        return append_params_to_url(self.base_url, self._build_query(value_array), need_url_encode)

    def build_post_url(self, value_array, need_url_encode=True):
        """Return a (url, json_body) pair for a POST request."""
        url = append_params_to_url(self.base_url, {}, need_url_encode)
        return url, self._build_query(value_array)
|
||||||
|
|
||||||
|
|
||||||
|
class TTSURLComposer:
    """Builds TTS inference URLs, either emotion-based or reference-audio-based
    depending on ``refer_type_param``."""

    def __init__(self, base_url, refer_type_param, emotion_param_name, text_param_name, ref_path_param_name, ref_text_param_name):
        self.base_url = base_url
        # Reference mode selector: character emotion vs. reference audio.
        self.refer_type_param = refer_type_param
        self.emotion_param_name = emotion_param_name
        self.text_param_name = text_param_name
        self.ref_path_param_name = ref_path_param_name
        self.ref_text_param_name = ref_text_param_name

    def is_valid(self):
        """Raise ValueError when a required field is missing."""
        if self.base_url is None or self.base_url == '':
            raise ValueError("请输入url")

        if self.text_param_name is None or self.text_param_name == '':
            raise ValueError("请输入text参数名")

        no_ref_fields = (self.emotion_param_name is None
                         and self.ref_path_param_name is None
                         and self.ref_text_param_name is None)
        if no_ref_fields:
            raise ValueError("请输入至少一个参考or情绪的参数")

    def is_emotion(self):
        # True when requests carry an emotion tag instead of a reference
        # audio path/text pair.
        return self.refer_type_param == '角色情绪'

    def build_url_with_emotion(self, text_value, emotion_value, need_url_encode=True):
        """Build a URL carrying the text and an emotion tag."""
        query = {
            self.text_param_name: text_value,
            self.emotion_param_name: emotion_value,
        }
        return append_params_to_url(self.base_url, query, need_url_encode)

    def build_url_with_ref(self, text_value, ref_path_value, ref_text_value, need_url_encode=True):
        """Build a URL carrying the text plus a reference audio path/text."""
        query = {
            self.text_param_name: text_value,
            self.ref_path_param_name: ref_path_value,
            self.ref_text_param_name: ref_text_value,
        }
        return append_params_to_url(self.base_url, query, need_url_encode)
|
||||||
|
|
||||||
|
|
||||||
|
def append_params_to_url(url_with_params, params, need_url_encode):
    """Append ``params`` to ``url_with_params`` as query arguments.

    Uses '?' or '&' depending on whether the URL already has a query string.
    When ``need_url_encode`` is true the result is passed through
    safe_encode_query_params before being returned.
    """
    if params:
        query = '&'.join(f"{key}={value}" for key, value in params.items())
        separator = '&' if '?' in url_with_params else '?'
        url_with_params = url_with_params + separator + query
    if need_url_encode:
        return safe_encode_query_params(url_with_params)
    return url_with_params
|
||||||
|
|
||||||
|
|
||||||
|
def safe_encode_query_params(original_url):
    """Percent-encode the query-string values of *original_url*, log and
    return the rebuilt URL.

    NOTE(review): each value is passed through ``quote`` and then ``urlencode``
    encodes it again, so reserved characters (e.g. '%') end up double-encoded —
    confirm the receiving API expects this.  Only the first value of each
    repeated key survives (``parse_qs`` returns lists; ``v[0]`` is taken).
    """
    # Split the URL so only the query part is touched.
    parsed_url = urlparse(original_url)
    query_params = parse_qs(parsed_url.query)

    # Percent-encode the first value of every key.
    encoded_params = {k: quote(v[0]) for k, v in query_params.items()}

    # Re-assemble the query string from the encoded pairs.
    new_query_string = urlencode(encoded_params, doseq=False)

    # Rebuild the full URL around the new query string.
    new_parsed_url = parsed_url._replace(query=new_query_string)
    encoded_url = urlunparse(new_parsed_url)

    logger.info(encoded_url)
    return encoded_url
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio_files_parallel(url_composer, text_list, emotion_list, output_dir_path, num_processes=1):
    """Fan ``emotion_list`` out over ``num_processes`` worker processes, each
    generating audio for its share of emotions against the full text list."""
    # Split the emotions into num_processes roughly equal groups.
    emotion_groups = np.array_split(emotion_list, num_processes)

    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        futures = []
        for group in emotion_groups:
            futures.append(executor.submit(
                generate_audio_files_for_emotion_group,
                url_composer, text_list, group, output_dir_path))
        # Block until every worker is done, propagating any exception.
        for future in futures:
            future.result()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_audio_files_for_emotion_group(url_composer, text_list, emotion_list, output_dir_path):
    """Generate one wav per (text, emotion) pair via the inference API.

    Every generated clip is written twice: under
    ``<out>/<text_aggregation_dir>/<text>/<emotion>.wav`` and under
    ``<out>/<emotion_aggregation_dir>/<emotion>/<text>.wav``.  Pairs whose
    both target files already exist are skipped, making the run resumable.
    """
    start_time = time.perf_counter()  # high-resolution timing start
    # Ensure the output directory exists.
    output_dir = os.path.abspath(output_dir_path)
    os.makedirs(output_dir, exist_ok=True)

    # Create subdirectories for the text and emotion aggregation views.
    text_subdir = os.path.join(output_dir, params.inference_audio_text_aggregation_dir)
    os.makedirs(text_subdir, exist_ok=True)
    emotion_subdir = os.path.join(output_dir, params.inference_audio_emotion_aggregation_dir)
    os.makedirs(emotion_subdir, exist_ok=True)

    all_count = len(text_list) * len(emotion_list)
    has_generated_count = 0
    # Total character count of all texts, used for the throughput log below.
    all_text_count = sum(len(item) for item in text_list)

    # Cartesian product: every text is rendered in every emotion.
    cartesian_product = list(itertools.product(text_list, emotion_list))

    for text, emotion in cartesian_product:
        emotion_name = emotion['emotion']

        text_subdir_text = os.path.join(text_subdir, text)
        os.makedirs(text_subdir_text, exist_ok=True)
        text_subdir_text_file_path = os.path.join(text_subdir_text, emotion_name + '.wav')

        emotion_subdir_emotion = os.path.join(emotion_subdir, emotion_name)
        os.makedirs(emotion_subdir_emotion, exist_ok=True)
        emotion_subdir_emotion_file_path = os.path.join(emotion_subdir_emotion, text + '.wav')

        # Skip pairs that were already generated in a previous run.
        if os.path.exists(text_subdir_text_file_path) and os.path.exists(emotion_subdir_emotion_file_path):
            has_generated_count += 1
            logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}")
            continue

        # Build the request URL in emotion mode or reference-audio mode.
        if url_composer.is_emotion():
            real_url = url_composer.build_url_with_emotion(text, emotion['emotion'], False)
        else:
            real_url = url_composer.build_url_with_ref(text, emotion['ref_path'], emotion['ref_text'], False)

        audio_bytes = inference_audio_from_api(real_url)

        # Write the same audio bytes into both aggregation views.
        with open(text_subdir_text_file_path, 'wb') as f:
            f.write(audio_bytes)
        with open(emotion_subdir_emotion_file_path, 'wb') as f:
            f.write(audio_bytes)

        has_generated_count += 1
        logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}")
    end_time = time.perf_counter()  # timing end
    elapsed_time = end_time - start_time
    # Throughput log; NOTE(review): the chars/sec figure multiplies
    # all_text_count by len(emotion_list) — verify that is the intended metric.
    log_message = f"进程ID: {os.getpid()}, generate_audio_files_for_emotion_group 执行耗时: {elapsed_time:.6f} 秒;推理数量: {has_generated_count}; 字符总数:{all_text_count};每秒推理字符数:{all_text_count*len(emotion_list) / elapsed_time:.3f};"
    p_logger.info(log_message)
    logger.info(log_message)
|
||||||
|
|
||||||
|
|
||||||
|
def inference_audio_from_api(url):
    """GET *url* and return the raw audio bytes; raise on any non-200 status."""
    logger.info(f'inference_audio_from_api url: {url}')
    response = requests.get(url, stream=True)

    # Guard clause: anything other than 200 is an error.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch audio from API. Server responded with status code {response.status_code}.message: {response.json()}")

    return response.content
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_set_model(set_model_url_composer, gpt_models, sovits_models):
    """POST both model paths to the set-model endpoint.

    Returns the response body on success, or a Chinese failure message with
    the status code otherwise.
    """
    url, post_body = set_model_url_composer.build_post_url([gpt_models, sovits_models], True)
    logger.info(f'set_model_url_composer url: {set_model_url_composer}')
    logger.info(f'start_api_set_model url: {url}')
    logger.info(f'start_api_set_model post_body: {post_body}')

    response = requests.post(url, json=post_body)
    if response.status_code != 200:
        return f'请求失败,状态码:{response.status_code}'
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_v2_set_gpt_model(set_model_url_composer, gpt_models):
    """Switch the GPT weights via the v2 API (GET); return the response text
    or a Chinese failure message with the status code."""
    url = set_model_url_composer.build_get_url([gpt_models], False)
    logger.info(f'start_api_v2_set_gpt_model url: {url}')

    response = requests.get(url)
    if response.status_code != 200:
        return f'请求失败,状态码:{response.status_code}'
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def start_api_v2_set_sovits_model(set_model_url_composer, sovits_models):
    """Switch the SoVITS weights via the v2 API (GET); return the response
    text or a Chinese failure message with the status code."""
    url = set_model_url_composer.build_get_url([sovits_models], False)
    logger.info(f'start_api_v2_set_sovits_model url: {url}')

    response = requests.get(url)
    if response.status_code != 200:
        return f'请求失败,状态码:{response.status_code}'
    return response.text
|
162
Ref_Audio_Selector/tool/audio_sample.py
Normal file
162
Ref_Audio_Selector/tool/audio_sample.py
Normal file
@ -0,0 +1,162 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import random
|
||||||
|
import librosa
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def check_audio_duration(path, min_duration=3, max_duration=10):
    """Return True when the audio at *path* lasts between ``min_duration`` and
    ``max_duration`` seconds (inclusive), False otherwise, or None when the
    file cannot be opened or decoded."""
    try:
        # Duration in seconds, read directly from the file.
        duration = librosa.get_duration(filename=path)
    except Exception as e:
        logger.error(f"无法打开或处理音频文件:{e}")
        return None

    return min_duration <= duration <= max_duration
|
||||||
|
|
||||||
|
|
||||||
|
def convert_from_list(list_file, output_dir):
    """Copy the audios referenced by a "path|name|lang|text" .list file into
    *output_dir*, renaming each to "<transcription>.wav".

    Only audios whose duration passes check_audio_duration (3-10 s by default)
    are copied; existing targets are never overwritten; malformed lines and
    missing sources are logged and skipped.
    """
    # Create the output directory if it does not exist yet.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Parse the .list file and process each entry.
    with open(list_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

        for line in lines:
            parts = line.strip().split('|')
            if len(parts) != 4:
                logger.error(f"Line format incorrect: {line}")
                continue

            audio_path, _, _, transcription = parts

            # Build the target file name from the transcription text.
            new_filename = transcription.strip() + '.wav'
            # new_filename = new_filename.replace(' ', '_')  # remove spaces
            # new_filename = ''.join(e for e in new_filename if e.isalnum() or e in ['_', '.'])  # remove illegal chars
            new_path = os.path.join(output_dir, new_filename)

            # Never overwrite an existing target file.
            if os.path.exists(new_path):
                logger.info(f"File already exists: {new_path}")
                continue

            try:
                # Skip entries whose source audio is missing.
                if not os.path.exists(audio_path):
                    logger.info(f"Audio file does not exist: {audio_path}")
                    continue

                if check_audio_duration(audio_path):
                    # Copy (with metadata) into the output directory under the new name.
                    shutil.copy2(audio_path, new_path)
                    logger.info(f"File copied and renamed to: {new_path}")
                else:
                    logger.info(f"File skipped due to duration: {audio_path}")

            except Exception as e:
                logger.error(f"An error occurred while processing: {audio_path}")
                logger.error(e)

    logger.info("Processing complete.")
|
||||||
|
|
||||||
|
|
||||||
|
def sample(output_audio_dir, similarity_list, subsection_num, sample_num):
    """Split the score-sorted similarity list into ``subsection_num`` segments
    and copy up to ``sample_num`` randomly chosen wavs from each segment into
    its own ``emotion_<i>`` subdirectory of ``output_audio_dir``."""
    # Sort by similarity score, best first (in place).
    similarity_list.sort(key=lambda x: x['score'], reverse=True)

    # Segment length: ceiling division so the segments cover the whole list.
    step = len(similarity_list) // subsection_num
    if len(similarity_list) % subsection_num != 0:
        step += 1

    for i in range(subsection_num):
        start = i * step
        end = min((i + 1) * step, len(similarity_list))  # clamp the last segment

        # Shuffle a copy-slice of this segment, then take the sample prefix.
        subsection = similarity_list[start:end]
        random.shuffle(subsection)
        chosen = subsection[:min(sample_num, len(subsection))]

        # Each segment gets its own emotion_<i> subdirectory.
        subdir_path = os.path.join(output_audio_dir, f'emotion_{i + 1}')
        os.makedirs(subdir_path, exist_ok=True)

        for item in chosen:
            src_path = item['wav_path']
            dst_path = os.path.join(subdir_path, os.path.basename(src_path))
            shutil.copyfile(src_path, dst_path)

    logger.info("Sampling completed.")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_similarity_file(file_path):
    """Parse a text file with one "score|wav_path" entry per line.

    Parameters:
        file_path (str): path of the file to parse.

    Returns:
        list[dict]: one dict per line with a float 'score' and a
        'wav_path' string.
    """
    entries = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Strip the newline and split on the '|' separator.
            score_str, wav_path = raw_line.strip().split('|')
            entries.append({
                'score': float(score_str),
                'wav_path': wav_path
            })

    return entries
|
||||||
|
|
||||||
|
|
||||||
|
def copy_and_move(output_audio_directory, similarity_scores):
    """Copy every scored wav into *output_audio_directory*, prefixing each
    file name with its score scaled to a zero-padded 0-10000 integer."""
    # Make sure the target directory exists.
    if not os.path.exists(output_audio_directory):
        os.makedirs(output_audio_directory)

    for item in similarity_scores:
        # "<score*10000 zero-padded>-<original stem>.wav"
        base_name = os.path.basename(item['wav_path'])[:-4]  # drop ".wav"
        new_name = f"{item['score'] * 10000:04.0f}-{base_name}.wav"
        new_path = os.path.join(output_audio_directory, new_name)
        shutil.copyfile(item['wav_path'], new_path)

    logger.info("已完成复制和重命名操作。")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Ad-hoc manual test: parse a similarity file and sample 4 audios from
    # each of 10 score segments (hard-coded developer paths).
    similarity_list = parse_similarity_file("D:/tt/similarity/啊,除了伊甸和樱,竟然还有其他人会提起我?.txt")
    sample('D:/tt/similarity/output', similarity_list, 10, 4)
|
142
Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
Normal file
142
Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import torchaudio
|
||||||
|
import torchaudio.transforms as T
|
||||||
|
import platform
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
import Ref_Audio_Selector.config_param.log_config as log_config
|
||||||
|
from Ref_Audio_Selector.common.time_util import timeit_decorator
|
||||||
|
from Ref_Audio_Selector.common.model_manager import speaker_verification_models as models
|
||||||
|
|
||||||
|
from modelscope.pipelines import pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def init_model(model_type='speech_campplus_sv_zh-cn_16k-common'):
    """Create a speaker-verification pipeline for the given registered model type."""
    log_config.logger.info(f'人声识别模型类型:{model_type}')
    model_cfg = models[model_type]
    return pipeline(
        task=model_cfg['task'],
        model=model_cfg['model'],
        model_revision=model_cfg['model_revision']
    )
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def compare_audio_and_generate_report(reference_audio_path, comparison_dir_path, output_file_path, model_type):
    """Score every .wav in *comparison_dir_path* against the reference audio
    with a speaker-verification pipeline and write "score|path" lines, sorted
    by descending similarity, to *output_file_path*.
    """
    sv_pipeline = init_model(model_type)

    # Step 1: collect every .wav directly inside the comparison directory.
    comparison_audio_paths = [os.path.join(comparison_dir_path, f) for f in os.listdir(comparison_dir_path) if
                              f.endswith('.wav')]

    if platform.system() == 'Windows':
        # The model was trained on 16 kHz audio, so the reference is resampled
        # once up front to avoid resampling it for every comparison.
        # torchaudio.sox_effects.apply_effects_tensor is unavailable on
        # Windows, so the dependency's resampling was rewritten to use
        # torchaudio.transforms.Resample.  On other platforms (or when the
        # dependency's resampler was not changed) pre-resampling here would
        # make similarity scores inaccurate because the two resampling methods
        # differ — hence this platform + config gate.
        if params.enable_pre_sample == 'true':
            reference_audio_16k = ensure_16k_wav(reference_audio_path)
        else:
            reference_audio_16k = reference_audio_path
    else:
        reference_audio_16k = reference_audio_path

    # Step 2: score each comparison file against the (possibly resampled) reference.
    all_count = len(comparison_audio_paths)
    has_processed_count = 0
    similarity_scores = []
    for audio_path in comparison_audio_paths:
        score = sv_pipeline([reference_audio_16k, audio_path])['score']
        similarity_scores.append({
            'score': score,
            'path': audio_path
        })
        has_processed_count += 1
        log_config.logger.info(f'进度:{has_processed_count}/{all_count}')

    # Step 3: sort by similarity score, best match first.
    similarity_scores.sort(key=lambda x: x['score'], reverse=True)

    # Step 4: create the output file when it does not exist yet.
    if not os.path.exists(output_file_path):
        open(output_file_path, 'w').close()  # Create an empty file

    # Step 5: write the sorted results (UTF-8 so non-ASCII paths survive).
    formatted_scores = [f'{item["score"]}|{item["path"]}' for item in similarity_scores]
    with open(output_file_path, 'w', encoding='utf-8') as f:
        # One "score|path" entry per line.
        content = '\n'.join(formatted_scores)
        f.write(content)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_16k_wav(audio_file_path, target_sample_rate=16000):
    """
    Ensure an audio file is available at ``target_sample_rate``.

    If the file already has the target sample rate, its original path is
    returned unchanged.  Otherwise the waveform is resampled, written to a
    temporary file under ``params.temp_dir``, and that temporary path is
    returned.

    :param audio_file_path: path of the source audio file
    :param target_sample_rate: desired sample rate in Hz (default 16000)
    :return: path of an audio file sampled at the target rate
    """
    # Load the source audio and inspect its native sample rate.
    waveform, source_rate = torchaudio.load(audio_file_path)

    # Fast path: nothing to convert.
    if source_rate == target_sample_rate:
        return audio_file_path

    # Resample to the target rate.
    resampler = T.Resample(orig_freq=source_rate, new_freq=target_sample_rate)
    resampled_waveform = resampler(waveform)

    # Persist the converted waveform under the temp directory, keeping the
    # original file name so the caller can correlate the copies.
    os.makedirs(params.temp_dir, exist_ok=True)
    temp_file_path = os.path.join(params.temp_dir, os.path.basename(audio_file_path))
    torchaudio.save(temp_file_path, resampled_waveform, target_sample_rate)

    return temp_file_path
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """Parse the command line options of the audio similarity script."""
    parser = argparse.ArgumentParser(description="Audio processing script arguments")

    # (short flag, long flag, help text) for the four mandatory string options.
    option_specs = [
        ("-r", "--reference_audio", "Path to the reference WAV file."),
        ("-c", "--comparison_dir", "Path to the directory containing comparison WAV files."),
        ("-o", "--output_file", "Path to the output file where results will be written."),
        ("-m", "--model_type", "Path to the model type."),
    ]
    for short_flag, long_flag, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Entry point: build one reference-vs-directory similarity report from
    # the command line arguments.
    cmd = parse_arguments()
    compare_audio_and_generate_report(
        reference_audio_path=cmd.reference_audio,
        comparison_dir_path=cmd.comparison_dir,
        output_file_path=cmd.output_file,
        model_type=cmd.model_type,
    )
|
77
Ref_Audio_Selector/tool/text_check.py
Normal file
77
Ref_Audio_Selector/tool/text_check.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import os
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
import Ref_Audio_Selector.tool.audio_check as audio_check
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def parse_text_similarity_result_txt(file_path):
    """
    Parse a text-similarity report file.

    Each line is expected to follow the format
    ``f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}"``.
    Lines with the wrong field count are skipped; lines whose numeric fields
    fail to convert are logged and skipped, so one bad row never aborts the
    whole parse.

    :param file_path: path of the report txt file
    :return: list of dicts with keys ``average_similarity_score`` (float),
        ``count`` (int) and ``emotion`` (str)
    """
    parsed_rows = []
    with open(file_path, 'r', encoding='utf-8') as report_file:
        for raw_line in report_file:
            fields = raw_line.strip().split('|')
            if len(fields) != 3:
                continue
            try:
                parsed_rows.append({
                    'average_similarity_score': float(fields[0]),
                    'count': int(fields[1]),
                    'emotion': fields[2],
                })
            except ValueError as e:
                # Record which line failed and keep going.
                logger.error(f"Error parsing line: {raw_line.strip()} - {e}")
    return parsed_rows
|
||||||
|
|
||||||
|
|
||||||
|
def remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary):
    """
    Delete reference audio files whose emotion scored below the boundary.

    :param ref_audio_list: list of dicts carrying ``ref_path`` and ``emotion``
    :param report_list: list of dicts carrying ``average_similarity_score``
        and ``emotion``
    :param audio_text_similarity_boundary: similarity threshold; emotions
        scoring strictly below it have their audio files removed
    :return: number of files actually deleted
    """
    deleted_count = 0

    # Reports whose average similarity falls below the threshold.
    low_reports = [report for report in report_list
                   if report['average_similarity_score'] < audio_text_similarity_boundary]

    # For every low-scoring report, remove the audio files of the same emotion.
    for low_report in low_reports:
        target_emotion = low_report['emotion']
        for ref in ref_audio_list:
            if ref['emotion'] != target_emotion:
                continue
            ref_path = ref['ref_path']
            if not os.path.exists(ref_path):
                logger.error(f"File not found: {ref_path}")
                continue
            try:
                os.remove(ref_path)
                deleted_count += 1
                logger.info(f"Deleted file: {ref_path}")
            except Exception as e:
                # Best-effort: log the failure and continue with the rest.
                logger.error(f"Error deleting file {ref_path}: {e}")

    return deleted_count
|
||||||
|
|
||||||
|
|
||||||
|
def delete_ref_audio_below_boundary(ref_audio_path, text_similarity_result_path, sync_inference_audio_dir,
                                    audio_text_similarity_boundary):
    """
    Drop reference audios whose text similarity is below the boundary, then
    re-sync the inference audio directory against the surviving references.

    :param ref_audio_path: root directory of the reference audios
    :param text_similarity_result_path: path of the similarity report txt
    :param sync_inference_audio_dir: inference audio directory to re-sync
    :param audio_text_similarity_boundary: deletion threshold
    :return: number of deleted reference audio files
    """
    candidates = common.RefAudioListManager(ref_audio_path).get_ref_audio_list()
    similarity_reports = parse_text_similarity_result_txt(text_similarity_result_path)
    deleted_total = remove_low_similarity_files(candidates, similarity_reports, audio_text_similarity_boundary)
    # Keep the inference directory consistent with what is left on disk.
    audio_check.sync_ref_audio(ref_audio_path, sync_inference_audio_dir)
    return deleted_total
|
0
Ref_Audio_Selector/tool/text_comparison/__init__.py
Normal file
0
Ref_Audio_Selector/tool/text_comparison/__init__.py
Normal file
161
Ref_Audio_Selector/tool/text_comparison/asr_text_process.py
Normal file
161
Ref_Audio_Selector/tool/text_comparison/asr_text_process.py
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
from operator import itemgetter
|
||||||
|
from Ref_Audio_Selector.common.time_util import timeit_decorator
|
||||||
|
import Ref_Audio_Selector.tool.text_comparison.text_comparison as text_comparison
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
|
||||||
|
def parse_asr_file(file_path):
    """
    Read an ASR ``.list`` file into a list of record dicts.

    Each line must carry four '|'-separated fields:
    ``input_file_path|original_text|language|asr_text``.  The emotion label
    is derived from the audio file name (without extension) and every record
    starts with a ``similarity_score`` of 0.

    :param file_path: path of the ASR list file
    :return: list of parsed record dicts
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as asr_file:
        for line in asr_file:
            # Lines are assumed well-formed with '|' as a fixed separator.
            audio_path, original_text, language, asr_text = line.strip().split('|')
            records.append({
                'emotion': common.get_filename_without_extension(audio_path),
                'input_file_path': audio_path,
                'original_text': original_text,
                'language': language,
                'asr_text': asr_text,
                'similarity_score': 0,
            })
    return records
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def calculate_similarity_and_append_to_list(input_list, boundary):
    """
    Score every ASR record in ``input_list`` in place.

    For each record, the raw and boundary-amplified similarity between the
    original text and the ASR text are stored under ``original_score`` and
    ``similarity_score`` respectively.  Progress is logged per record.

    :param input_list: list of parsed ASR record dicts
    :param boundary: amplification boundary forwarded to the text comparison
    :return: the same list, mutated with the two score fields
    """
    total = len(input_list)
    for position, record in enumerate(input_list, start=1):
        raw_score, amplified_score = text_comparison.calculate_result(
            record['original_text'], record['asr_text'], boundary)
        record['similarity_score'] = amplified_score
        record['original_score'] = raw_score
        logger.info(f'进度:{position}/{total}')

    return input_list
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_average_similarity_by_emotion(data_list):
    """
    Aggregate similarity scores per emotion.

    :param data_list: records carrying ``emotion`` and ``similarity_score``
    :return: list of ``{'emotion', 'average_similarity_score', 'count'}``
        dicts, sorted by average score in descending order
    """
    scores_by_emotion = defaultdict(list)
    for record in data_list:
        scores_by_emotion[record['emotion']].append(record['similarity_score'])

    summary = [
        {
            'emotion': emotion,
            'average_similarity_score': sum(scores) / len(scores),
            'count': len(scores),
        }
        for emotion, scores in scores_by_emotion.items()
    ]
    summary.sort(key=lambda entry: entry['average_similarity_score'], reverse=True)

    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def group_and_sort_by_field(data, group_by_field):
    """
    Group records by ``group_by_field`` and sort each group by score.

    :param data: list of record dicts
    :param group_by_field: key to group on (e.g. ``emotion`` or
        ``original_text``)
    :return: list of ``(group_key, records)`` tuples; each group's records
        are sorted by ``similarity_score`` in descending order
    """
    grouped = defaultdict(list)
    for record in data:
        grouped[record[group_by_field]].append(record)

    # Best matches first within every group.
    for records in grouped.values():
        records.sort(key=itemgetter('similarity_score'), reverse=True)

    return list(grouped.items())
|
||||||
|
|
||||||
|
|
||||||
|
def format_list_to_text(data_list, output_filename):
    """
    Write grouped similarity details to ``output_filename``.

    The file starts with a header row, then for every group its key on its
    own line followed by one ``score|original_score|asr_text|original_text``
    row per record.

    :param data_list: list of ``(group_key, records)`` tuples
    :param output_filename: destination text file path
    """
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('放大后的相似度分值|原始分值|ASR文本|原文文本\n')
        for group_key, records in data_list:
            # Group header line.
            output_file.write(group_key + '\n')
            # Detail rows for this group.
            for record in records:
                output_file.write(
                    f"{record['similarity_score']}|{record['original_score']}"
                    f"|{record['asr_text']}|{record['original_text']}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def format_list_to_emotion(data_list, output_filename):
    """
    Write grouped similarity details to ``output_filename``, keyed by emotion.

    The file starts with a header row, then for every group its key on its
    own line followed by one ``score|original_score|asr_text|emotion`` row
    per record.

    :param data_list: list of ``(group_key, records)`` tuples
    :param output_filename: destination text file path
    """
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('放大后的相似度分值|原始分值|ASR文本|情绪类型\n')
        for group_key, records in data_list:
            # Group header line.
            output_file.write(group_key + '\n')
            # Detail rows for this group.
            for record in records:
                output_file.write(
                    f"{record['similarity_score']}|{record['original_score']}"
                    f"|{record['asr_text']}|{record['emotion']}\n")
|
||||||
|
|
||||||
|
|
||||||
|
@timeit_decorator
def process(asr_file_path, output_dir, similarity_enlarge_boundary):
    """
    Run the full text-similarity analysis pipeline.

    Parses the ASR list file, scores every record, then writes three reports
    under ``output_dir``: per-emotion average similarity, details grouped by
    emotion, and details grouped by original text.

    :param asr_file_path: ASR ``.list`` file to analyse
    :param output_dir: directory receiving the reports (created if absent)
    :param similarity_enlarge_boundary: amplification boundary for scoring
    """
    os.makedirs(output_dir, exist_ok=True)

    records = parse_asr_file(asr_file_path)
    calculate_similarity_and_append_to_list(records, similarity_enlarge_boundary)

    # Report 1: average similarity per emotion, best first.
    average_similarity_list = calculate_average_similarity_by_emotion(records)
    average_similarity_file = os.path.join(
        output_dir, f'{params.text_emotion_average_similarity_report_filename}.txt')
    average_similarity_content = '\n'.join(
        f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}"
        for item in average_similarity_list)
    common.write_text_to_file(average_similarity_content, average_similarity_file)

    # Report 2: detail rows grouped by emotion.
    emotion_detail_list = group_and_sort_by_field(records, 'emotion')
    emotion_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_emotion_detail_filename}.txt')
    format_list_to_text(emotion_detail_list, emotion_detail_file)

    # Report 3: detail rows grouped by original text.
    original_text_detail_list = group_and_sort_by_field(records, 'original_text')
    original_text_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_text_detail_filename}.txt')
    format_list_to_emotion(original_text_detail_list, original_text_detail_file)

    logger.info('文本相似度分析完成。')
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """Parse the command line options of the ASR similarity analysis."""
    parser = argparse.ArgumentParser(description="Process ASR files and analyze similarity.")

    # (short flag, long flag, value type, help text) for the required options.
    option_specs = [
        ("-a", "--asr_file_path", str,
         "Path to the directory containing ASR files or path to a single ASR file."),
        ("-o", "--output_dir", str,
         "Path to the directory where the analysis results should be saved."),
        ("-b", "--similarity_enlarge_boundary", float,
         "Similarity score boundary value to be used in your calculations."),
    ]
    for short_flag, long_flag, value_type, help_text in option_specs:
        parser.add_argument(short_flag, long_flag, type=value_type, required=True, help=help_text)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command line entry point for the ASR text-similarity analysis.
    cmd = parse_arguments()
    process(cmd.asr_file_path, cmd.output_dir, cmd.similarity_enlarge_boundary)
|
128
Ref_Audio_Selector/tool/text_comparison/text_comparison.py
Normal file
128
Ref_Audio_Selector/tool/text_comparison/text_comparison.py
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
import os
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
from scipy.spatial.distance import cosine
|
||||||
|
from Ref_Audio_Selector.config_param.log_config import logger
|
||||||
|
|
||||||
|
# BERT checkpoint used for text embeddings; overridable via the bert_path
# environment variable.
bert_path = os.environ.get(
    "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
)

# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logger.info(f'使用计算设备: {device}')

# Tokenizer/model pair is loaded once at import time and shared by every
# similarity call in this module.
tokenizer = AutoTokenizer.from_pretrained(bert_path)
model = AutoModel.from_pretrained(bert_path).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_similarity(text1, text2, max_length=512):
    """
    Cosine similarity between the BERT [CLS] embeddings of two texts.

    Both inputs are truncated to ``max_length`` tokens before encoding.

    :param text1: first text
    :param text2: second text
    :param max_length: token cap applied by the tokenizer (default 512)
    :return: similarity value (1 - cosine distance) as a Python float
    """
    def _cls_vector(text):
        # Tokenize with truncation, then take the flattened [CLS] vector.
        encoded = tokenizer(text, padding=True, truncation=True,
                            max_length=max_length, return_tensors='pt').to(device)
        with torch.no_grad():
            return model(**encoded)[0][:, 0, :].flatten()

    vector1 = _cls_vector(text1)
    vector2 = _cls_vector(text2)

    # cosine() yields a distance, so similarity = 1 - distance; flatten to
    # guarantee one-dimensional numpy inputs.
    return 1 - cosine(vector1.cpu().numpy().flatten(), vector2.cpu().numpy().flatten())
|
||||||
|
|
||||||
|
|
||||||
|
# Amplify scores in the [boundary, 1] interval onto [0, 1].
def adjusted_similarity(similarity_score2, boundary=0.8):
    """
    Map a raw similarity onto an amplified scale.

    Scores below ``boundary`` collapse to 0; the surviving
    ``[boundary, 1]`` band is stretched linearly onto ``[0, 1]``.

    :param similarity_score2: raw similarity score
    :param boundary: lower cut-off of the band to amplify (default 0.8)
    :return: amplified score
    """
    if similarity_score2 < boundary:
        return 0

    # Linear rescale of the surviving band.
    scale = 1 / (1 - boundary)
    return (similarity_score2 - boundary) * scale
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_result(t1, t2, boundary):
    """
    Score a text pair with both the raw and the amplified similarity.

    :param t1: reference text
    :param t2: text to compare (e.g. ASR output)
    :param boundary: amplification boundary
    :return: ``(raw_similarity, amplified_similarity)`` tuple
    """
    raw_similarity = calculate_similarity(t1, t2)
    amplified = adjusted_similarity(raw_similarity, boundary)
    return raw_similarity, amplified
|
||||||
|
|
||||||
|
|
||||||
|
def print_result(t1, t2, boundary):
    """Print the raw and amplified similarity between two texts."""
    print(f't2: {t2}')

    # Raw similarity first...
    raw_similarity = calculate_similarity(t1, t2)
    print(f"两句话的相似度为: {raw_similarity:.4f}")

    # ...then the boundary-amplified version.
    amplified = adjusted_similarity(raw_similarity, boundary)
    print(f"调整后的相似度为: {amplified:.4f}")
|
||||||
|
|
||||||
|
|
||||||
|
def test(boundary):
    """
    Manual smoke test for the similarity functions.

    Compares a fixed reference sentence against a batch of probe lines —
    near-duplicates, paraphrases, unrelated Chinese sentences and random
    ASCII noise — printing each pair's raw and amplified scores.

    :param boundary: amplification boundary forwarded to print_result
    """
    # Reference sentence every probe line is compared against.
    text1 = "这是第一个句子"
    # NOTE: this local shadows the builtin `list`; kept as-is.
    list = """
这是第一个句子
这是第二个句子。
那么,这是第三个表达。
当前呈现的是第四个句子。
接下来,我们有第五句话。
在此,展示第六条陈述。
继续下去,这是第七个短句。
不容忽视的是第八个表述。
顺延着序列,这是第九句。
此处列举的是第十个说法。
进入新的篇章,这是第十一个句子。
下一段内容即为第十二个句子。
显而易见,这是第十三个叙述。
渐进地,我们来到第十四句话。
向下滚动,您会看到第十五个表达。
此刻,呈现在眼前的是第十六个句子。
它们中的一个——第十七个句子在此。
如同链条般连接,这是第十八个断言。
按照顺序排列,接下来是第十九个话语。
逐一列举,这是第二十个陈述句。
结构相似,本例给出第二十一个实例句。
这是最初的陈述句。
首先表达的是这一个句子。
第一句内容即为此处所示。
这是起始的叙述段落。
开篇所展示的第一句话就是这个。
明媚的阳光洒满大地
窗外飘落粉色樱花瓣
笔尖轻触纸面思绪万千
深夜的月光如水般静谧
穿越丛林的小径蜿蜒曲折
浅酌清茶品味人生百态
破晓时分雄鸡一唱天下白
草原上奔驰的骏马无拘无束
秋叶纷飞描绘季节更替画卷
寒冬雪夜炉火旁围坐共话家常
kszdRjYXw
pfsMgTlVHnB
uQaGxIbWz
ZtqNhPmKcOe
jfyrXsStVUo
wDiEgLkZbn
yhNvAfUmqC
TpKjxMrWgs
eBzHUaFJtYd
oQnXcVSiPkL
00000
    """
    # One probe per non-empty line of the block above.
    list2 = list.strip().split('\n')
    for item in list2:
        print_result(text1, item, boundary)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke run with a 0.9 amplification boundary.
    test(0.9)
|
0
Ref_Audio_Selector/ui_init/__init__.py
Normal file
0
Ref_Audio_Selector/ui_init/__init__.py
Normal file
197
Ref_Audio_Selector/ui_init/init_ui_param.py
Normal file
197
Ref_Audio_Selector/ui_init/init_ui_param.py
Normal file
@ -0,0 +1,197 @@
|
|||||||
|
import os
|
||||||
|
import multiprocessing
|
||||||
|
import Ref_Audio_Selector.config_param.config_params as params
|
||||||
|
import Ref_Audio_Selector.tool.audio_inference as audio_inference
|
||||||
|
import Ref_Audio_Selector.common.common as common
|
||||||
|
|
||||||
|
# Shared read/write accessor for the persisted UI configuration.
rw_param = params.config_manager.get_rw_param()
# -------------------Basic info---------------------------

# Per-role working directory (<work dir>/<role>)
base_dir_default = None
# Workspace directory
text_work_space_dir_default = None
# Role name
text_role_default = None
# Directory containing the reference audios
text_refer_audio_file_dir_default = None
# Directory containing the inference audios
text_inference_audio_file_dir_default = None

# -------------------Step 1------------------------------

# Reference audio sampling directory
text_sample_dir_default = None
# Number of subsections
slider_subsection_num_default = None
# Number of random samples per subsection
slider_sample_num_default = None

# -------------------Step 2------------------------------

# API endpoint for switching models
text_api_set_model_base_url_default = None
# GPT model parameter name
text_api_gpt_param_default = None
# SoVITS model parameter name
text_api_sovits_param_default = None
# API v2 endpoint for switching the GPT model
text_api_v2_set_gpt_model_base_url_default = None
# GPT model parameter name (v2)
text_api_v2_gpt_model_param_default = None
# API v2 endpoint for switching the SoVITS model
text_api_v2_set_sovits_model_base_url_default = None
# SoVITS model parameter name (v2)
text_api_v2_sovits_model_param_default = None
# Inference service request URL with its query parameters
text_url_default = None
# Fully composed inference request URL (preview)
text_whole_url_default = None
# Text parameter name
text_text_default = None
# Reference parameter type
dropdown_refer_type_param_default = None
# Reference audio path parameter name
text_ref_path_default = None
# Reference audio text parameter name
text_ref_text_default = None
# Role emotion parameter name
text_emotion_default = None
# Path of the text awaiting inference
text_test_content_default = None
# Number of concurrent requests
slider_request_concurrency_num_default = 3
# Maximum concurrency
slider_request_concurrency_max_num = None

# -------------------Step 3------------------------------

# Directory of audios awaiting ASR
text_asr_audio_dir_default = None
# Path of the file awaiting analysis
text_text_similarity_analysis_path_default = None
# Text similarity amplification boundary
slider_text_similarity_amplification_boundary_default = 0.90
# Path of the text similarity analysis result file
text_text_similarity_result_path_default = None

# -------------------Step 4------------------------------
# -------------------Step 5------------------------------
# Template content
text_template_default = None
|
||||||
|
|
||||||
|
|
||||||
|
def empty_default(vale, default_value):
    """Return ``default_value`` when ``vale`` is None or an empty string, otherwise ``vale``."""
    return default_value if vale is None or vale == "" else vale
|
||||||
|
|
||||||
|
|
||||||
|
def init_base():
    """Load the basic workspace settings (work dir, role, audio dirs) into module globals."""
    global text_work_space_dir_default, text_role_default, base_dir_default, text_refer_audio_file_dir_default, text_inference_audio_file_dir_default

    text_work_space_dir_default = rw_param.read(rw_param.work_dir)
    text_role_default = rw_param.read(rw_param.role)
    # Per-role base directory is <work dir>/<role>.
    base_dir_default = os.path.join(text_work_space_dir_default, text_role_default)

    # Only keep directory defaults that actually exist on disk
    # (check_path_existence_and_return presumably filters — confirm in common).
    text_refer_audio_file_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.reference_audio_dir))

    text_inference_audio_file_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.inference_audio_dir))
|
||||||
|
|
||||||
|
|
||||||
|
def init_first():
    """
    Load step-one defaults: sample directory and sampling slider values.

    Populates the module globals ``text_sample_dir_default``,
    ``slider_subsection_num_default`` and ``slider_sample_num_default``.
    """
    global text_sample_dir_default, slider_subsection_num_default, slider_sample_num_default

    text_sample_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.list_to_convert_reference_audio_dir))

    slider_subsection_num_default = int(empty_default(rw_param.read(rw_param.subsection_num), 10))

    # Bugfix: convert to int for consistency with slider_subsection_num_default
    # above — the value read from the config file is a string, while a numeric
    # slider default is expected.
    slider_sample_num_default = int(empty_default(rw_param.read(rw_param.sample_num), 4))
|
||||||
|
|
||||||
|
|
||||||
|
def init_second():
    """Load step-two defaults: API endpoints, request parameter names and concurrency."""
    global text_api_set_model_base_url_default, text_api_gpt_param_default, text_api_sovits_param_default, text_api_v2_set_gpt_model_base_url_default, text_api_v2_gpt_model_param_default
    global text_api_v2_set_sovits_model_base_url_default, text_api_v2_sovits_model_param_default, text_url_default, text_whole_url_default, text_text_default, dropdown_refer_type_param_default, text_ref_path_default
    global text_ref_text_default, text_emotion_default, text_test_content_default, slider_request_concurrency_num_default, slider_request_concurrency_max_num

    # v1 API: one endpoint switches both models.
    text_api_set_model_base_url_default = empty_default(rw_param.read(rw_param.api_set_model_base_url),
                                                        'http://localhost:9880/set_model')
    text_api_gpt_param_default = empty_default(rw_param.read(rw_param.api_gpt_param), 'gpt_model_path')
    text_api_sovits_param_default = empty_default(rw_param.read(rw_param.api_sovits_param), 'sovits_model_path')

    # v2 API: separate GPT / SoVITS switch endpoints.
    text_api_v2_set_gpt_model_base_url_default = empty_default(rw_param.read(rw_param.api_v2_set_gpt_model_base_url),
                                                               'http://localhost:9880/set_gpt_weights')
    text_api_v2_gpt_model_param_default = empty_default(rw_param.read(rw_param.api_v2_gpt_model_param), 'weights_path')

    text_api_v2_set_sovits_model_base_url_default = empty_default(
        rw_param.read(rw_param.api_v2_set_sovits_model_base_url), 'http://localhost:9880/set_sovits_weights')
    text_api_v2_sovits_model_param_default = empty_default(rw_param.read(rw_param.api_v2_sovits_model_param), 'weights_path')

    # Inference request URL template and the names of its query parameters.
    text_url_default = empty_default(rw_param.read(rw_param.text_url),
                                     'http://localhost:9880?prompt_language=中文&text_language=中文&cut_punc=,.;?!、,。?!;:…')
    text_text_default = empty_default(rw_param.read(rw_param.text_param), 'text')
    dropdown_refer_type_param_default = empty_default(rw_param.read(rw_param.refer_type_param), '参考音频')

    text_ref_path_default = empty_default(rw_param.read(rw_param.ref_path_param), 'refer_wav_path')
    text_ref_text_default = empty_default(rw_param.read(rw_param.ref_text_param), 'prompt_text')
    text_emotion_default = empty_default(rw_param.read(rw_param.emotion_param), 'emotion')

    # Preview of the fully composed request URL.
    text_whole_url_default = whole_url(text_url_default, dropdown_refer_type_param_default, text_text_default,
                                       text_ref_path_default, text_ref_text_default, text_emotion_default)

    text_test_content_default = empty_default(rw_param.read(rw_param.test_content_path), params.default_test_text_path)

    # Cap the request concurrency at the machine's CPU count.
    slider_request_concurrency_max_num = multiprocessing.cpu_count()

    slider_request_concurrency_num_default = empty_default(rw_param.read(rw_param.request_concurrency_num), 3)

    slider_request_concurrency_num_default = min(int(slider_request_concurrency_num_default), slider_request_concurrency_max_num)
|
||||||
|
|
||||||
|
|
||||||
|
# Compose the complete request URL from the base URL and parameter names.
def whole_url(text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion):
    """
    Build a preview of the complete TTS request URL.

    Depending on the configured reference type, the URL is composed either
    with an emotion parameter or with a reference audio path/text pair;
    placeholder values are substituted for the preview.
    """
    composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text,
                                              text_ref_path, text_ref_text)
    if composer.is_emotion():
        return composer.build_url_with_emotion('测试内容', '情绪类型', False)
    return composer.build_url_with_ref('测试内容', '参考路径', '参考文本', False)
|
||||||
|
|
||||||
|
|
||||||
|
def init_third():
    """Load step-three defaults: ASR audio directory and text-similarity analysis paths."""
    global text_asr_audio_dir_default, text_text_similarity_analysis_path_default, slider_text_similarity_amplification_boundary_default, text_text_similarity_result_path_default

    # Directory of inference audios awaiting ASR.
    text_asr_audio_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.inference_audio_dir, params.inference_audio_text_aggregation_dir))
    # ASR output list file to analyse.
    text_text_similarity_analysis_path_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.asr_filename + '.list'))
    slider_text_similarity_amplification_boundary_default = empty_default(
        rw_param.read(rw_param.text_similarity_amplification_boundary), 0.90)
    # Result file of a previous similarity analysis, if present.
    text_text_similarity_result_path_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.text_emotion_average_similarity_report_filename + '.txt'))
|
||||||
|
|
||||||
|
|
||||||
|
def init_fourth():
    # Step four currently has no defaults to initialise.
    pass
|
||||||
|
|
||||||
|
|
||||||
|
def init_fifth():
    """Load the step-five default: the result template text."""
    global text_template_default

    default_template_path = params.default_template_path
    # Fall back to the bundled template file when no saved template exists.
    text_template_default = empty_default(rw_param.read(rw_param.text_template),
                                          common.read_file(default_template_path))
|
||||||
|
|
||||||
|
|
||||||
|
def init_all():
    """Initialise every UI default, step by step."""
    # init_base must run first: the later steps read base_dir_default.
    for init_step in (init_base, init_first, init_second, init_third, init_fourth, init_fifth):
        init_step()
|
BIN
Ref_Audio_Selector/参考音频筛选流程.png
Normal file
BIN
Ref_Audio_Selector/参考音频筛选流程.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 95 KiB |
@ -1,23 +1,10 @@
|
|||||||
{
|
{
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"provenance": [],
|
|
||||||
"include_colab_link": true
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"name": "python3",
|
|
||||||
"display_name": "Python 3"
|
|
||||||
},
|
|
||||||
"accelerator": "GPU"
|
|
||||||
},
|
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "view-in-github",
|
"colab_type": "text",
|
||||||
"colab_type": "text"
|
"id": "view-in-github"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||||
@ -25,18 +12,20 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"环境配置 environment"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "_o6a8GS2lWQM"
|
"id": "_o6a8GS2lWQM"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"环境配置 environment"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "e9b7iFV3dm1f"
|
"id": "e9b7iFV3dm1f"
|
||||||
},
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"!pip install -q condacolab\n",
|
"!pip install -q condacolab\n",
|
||||||
"# Setting up condacolab and installing packages\n",
|
"# Setting up condacolab and installing packages\n",
|
||||||
@ -47,13 +36,17 @@
|
|||||||
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
|
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
|
||||||
"%cd -q /content/GPT-SoVITS\n",
|
"%cd -q /content/GPT-SoVITS\n",
|
||||||
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
|
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
|
||||||
|
"!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
|
||||||
"!/usr/local/bin/pip install -r requirements.txt"
|
"!/usr/local/bin/pip install -r requirements.txt"
|
||||||
],
|
]
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "0NgxXg5sjv7z"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# @title Download pretrained models 下载预训练模型\n",
|
"# @title Download pretrained models 下载预训练模型\n",
|
||||||
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
|
||||||
@ -71,27 +64,35 @@
|
|||||||
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
|
||||||
"!git config core.sparseCheckout true\n",
|
"!git config core.sparseCheckout true\n",
|
||||||
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "0NgxXg5sjv7z"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "4oRGUzkrk8C7"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# @title launch WebUI 启动WebUI\n",
|
"# @title launch WebUI 启动WebUI\n",
|
||||||
"!/usr/local/bin/pip install ipykernel\n",
|
"!/usr/local/bin/pip install ipykernel\n",
|
||||||
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
|
||||||
"%cd /content/GPT-SoVITS/\n",
|
"%cd /content/GPT-SoVITS/\n",
|
||||||
"!/usr/local/bin/python webui.py"
|
"!/usr/local/bin/python webui.py"
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"id": "4oRGUzkrk8C7"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
}
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"accelerator": "GPU",
|
||||||
|
"colab": {
|
||||||
|
"include_colab_link": true,
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"name": "python3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
||||||
|
@ -76,6 +76,7 @@ bash install.sh
|
|||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -104,6 +105,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
|||||||
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
|
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
|
||||||
|
|
||||||
##### MacOS 用户
|
##### MacOS 用户
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install ffmpeg
|
brew install ffmpeg
|
||||||
```
|
```
|
||||||
@ -111,6 +113,7 @@ brew install ffmpeg
|
|||||||
#### 安装依赖
|
#### 安装依赖
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -155,7 +158,6 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
|
|||||||
|
|
||||||
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
|
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer`、`bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。
|
||||||
|
|
||||||
|
|
||||||
4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
|
4. 对于中文 ASR(额外功能),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
|
||||||
|
|
||||||
5. 对于英语或日语 ASR(额外功能),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将其放置在 `tools/asr/models` 目录中。此外,[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。
|
5. 对于英语或日语 ASR(额外功能),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将其放置在 `tools/asr/models` 目录中。此外,[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。
|
||||||
@ -202,6 +204,7 @@ python webui.py <language(optional)>
|
|||||||
```bash
|
```bash
|
||||||
python webui.py v1 <language(optional)>
|
python webui.py v1 <language(optional)>
|
||||||
```
|
```
|
||||||
|
|
||||||
或者在 webUI 内动态切换
|
或者在 webUI 内动态切换
|
||||||
|
|
||||||
### 微调
|
### 微调
|
||||||
@ -226,11 +229,13 @@ python webui.py v1 <language(optional)>
|
|||||||
```bash
|
```bash
|
||||||
python GPT_SoVITS/inference_webui.py <language(optional)>
|
python GPT_SoVITS/inference_webui.py <language(optional)>
|
||||||
```
|
```
|
||||||
|
|
||||||
或者
|
或者
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python webui.py
|
python webui.py
|
||||||
```
|
```
|
||||||
|
|
||||||
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
|
||||||
|
|
||||||
## V2 发布说明
|
## V2 发布说明
|
||||||
@ -245,7 +250,7 @@ python webui.py
|
|||||||
|
|
||||||
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
|
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
|
||||||
|
|
||||||
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
从 v1 环境迁移至 v2
|
从 v1 环境迁移至 v2
|
||||||
|
|
||||||
@ -265,7 +270,7 @@ python webui.py
|
|||||||
|
|
||||||
2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
|
2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
|
||||||
|
|
||||||
详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
从 v2 环境迁移至 v3
|
从 v2 环境迁移至 v3
|
||||||
|
|
||||||
@ -277,7 +282,6 @@ python webui.py
|
|||||||
|
|
||||||
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
|
||||||
|
|
||||||
|
|
||||||
## 待办事项清单
|
## 待办事项清单
|
||||||
|
|
||||||
- [x] **高优先级:**
|
- [x] **高优先级:**
|
||||||
@ -299,16 +303,21 @@ python webui.py
|
|||||||
- [ ] 模型混合。
|
- [ ] 模型混合。
|
||||||
|
|
||||||
## (附加)命令行运行方式
|
## (附加)命令行运行方式
|
||||||
|
|
||||||
使用命令行打开 UVR5 的 WebUI
|
使用命令行打开 UVR5 的 WebUI
|
||||||
````
|
|
||||||
|
```
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||||
````
|
```
|
||||||
|
|
||||||
<!-- 如果打不开浏览器,请按照下面的格式进行UVR处理,这是使用mdxnet进行音频处理的方式
|
<!-- 如果打不开浏览器,请按照下面的格式进行UVR处理,这是使用mdxnet进行音频处理的方式
|
||||||
````
|
````
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||||
```` -->
|
```` -->
|
||||||
|
|
||||||
这是使用命令行完成数据集的音频切分的方式
|
这是使用命令行完成数据集的音频切分的方式
|
||||||
````
|
|
||||||
|
```
|
||||||
python audio_slicer.py \
|
python audio_slicer.py \
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||||
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
|
||||||
@ -316,17 +325,22 @@ python audio_slicer.py \
|
|||||||
--min_length <minimum_duration_of_each_subclip> \
|
--min_length <minimum_duration_of_each_subclip> \
|
||||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
--hop_size <step_size_for_computing_volume_curve>
|
||||||
````
|
```
|
||||||
|
|
||||||
这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
|
这是使用命令行完成数据集 ASR 处理的方式(仅限中文)
|
||||||
````
|
|
||||||
|
```
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||||
````
|
```
|
||||||
|
|
||||||
通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
|
通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
|
||||||
|
|
||||||
(没有进度条,GPU 性能可能会导致时间延迟)
|
(没有进度条,GPU 性能可能会导致时间延迟)
|
||||||
|
|
||||||
```
|
```
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||||
```
|
```
|
||||||
|
|
||||||
启用自定义列表保存路径
|
启用自定义列表保存路径
|
||||||
|
|
||||||
## 致谢
|
## 致谢
|
||||||
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
特别感谢以下项目和贡献者:
|
特别感谢以下项目和贡献者:
|
||||||
|
|
||||||
### 理论研究
|
### 理论研究
|
||||||
|
|
||||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||||
- [vits](https://github.com/jaywalnut310/vits)
|
- [vits](https://github.com/jaywalnut310/vits)
|
||||||
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
### 预训练模型
|
### 预训练模型
|
||||||
|
|
||||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||||
|
|
||||||
### 推理用文本前端
|
### 推理用文本前端
|
||||||
|
|
||||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||||
|
|
||||||
### WebUI 工具
|
### WebUI 工具
|
||||||
|
|
||||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||||
|
@ -70,7 +70,7 @@ bash install.sh
|
|||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
|||||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
|
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
|
||||||
|
|
||||||
##### MacOS ユーザー
|
##### MacOS ユーザー
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install ffmpeg
|
brew install ffmpeg
|
||||||
```
|
```
|
||||||
@ -104,6 +105,7 @@ brew install ffmpeg
|
|||||||
#### 依存関係をインストールします
|
#### 依存関係をインストールします
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirementx.txt
|
pip install -r requirementx.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -169,6 +171,7 @@ vocal_path|speaker_name|language|text
|
|||||||
```
|
```
|
||||||
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
|
||||||
```
|
```
|
||||||
|
|
||||||
## 微調整と推論
|
## 微調整と推論
|
||||||
|
|
||||||
### WebUI を開く
|
### WebUI を開く
|
||||||
@ -189,6 +192,7 @@ V1に切り替えたい場合は
|
|||||||
```bash
|
```bash
|
||||||
python webui.py v1 <言語(オプション)>
|
python webui.py v1 <言語(オプション)>
|
||||||
```
|
```
|
||||||
|
|
||||||
または WebUI で手動でバージョンを切り替えてください。
|
または WebUI で手動でバージョンを切り替えてください。
|
||||||
|
|
||||||
### 微調整
|
### 微調整
|
||||||
@ -213,11 +217,13 @@ python webui.py v1 <言語(オプション)>
|
|||||||
```bash
|
```bash
|
||||||
python GPT_SoVITS/inference_webui.py <言語(オプション)>
|
python GPT_SoVITS/inference_webui.py <言語(オプション)>
|
||||||
```
|
```
|
||||||
|
|
||||||
または
|
または
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python webui.py
|
python webui.py
|
||||||
```
|
```
|
||||||
|
|
||||||
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
|
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
|
||||||
|
|
||||||
## V2 リリースノート
|
## V2 リリースノート
|
||||||
@ -232,7 +238,7 @@ python webui.py
|
|||||||
|
|
||||||
4. 低品質の参照音声に対する合成品質の向上
|
4. 低品質の参照音声に対する合成品質の向上
|
||||||
|
|
||||||
[詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1 環境から V2 を使用するには:
|
V1 環境から V2 を使用するには:
|
||||||
|
|
||||||
@ -252,7 +258,7 @@ V1環境からV2を使用するには:
|
|||||||
|
|
||||||
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
|
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
|
||||||
|
|
||||||
[詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
v2 環境から v3 を使用する方法:
|
v2 環境から v3 を使用する方法:
|
||||||
|
|
||||||
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
|
|||||||
- [ ] モデルミックス
|
- [ ] モデルミックス
|
||||||
|
|
||||||
## (追加の) コマンドラインから実行する方法
|
## (追加の) コマンドラインから実行する方法
|
||||||
|
|
||||||
コマンド ラインを使用して UVR5 の WebUI を開きます
|
コマンド ラインを使用して UVR5 の WebUI を開きます
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
|
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
|
||||||
```
|
```
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||||
``` -->
|
``` -->
|
||||||
|
|
||||||
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
|
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
|
||||||
|
|
||||||
```
|
```
|
||||||
python audio_slicer.py \
|
python audio_slicer.py \
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||||
@ -303,16 +314,21 @@ python audio_slicer.py \
|
|||||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
--hop_size <step_size_for_computing_volume_curve>
|
||||||
```
|
```
|
||||||
|
|
||||||
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||||
```
|
```
|
||||||
|
|
||||||
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
|
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
|
||||||
|
|
||||||
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
|
||||||
|
|
||||||
```
|
```
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||||
```
|
```
|
||||||
|
|
||||||
カスタムリストの保存パスが有効になっています
|
カスタムリストの保存パスが有効になっています
|
||||||
|
|
||||||
## クレジット
|
## クレジット
|
||||||
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
特に以下のプロジェクトと貢献者に感謝します:
|
特に以下のプロジェクトと貢献者に感謝します:
|
||||||
|
|
||||||
### 理論研究
|
### 理論研究
|
||||||
|
|
||||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||||
- [vits](https://github.com/jaywalnut310/vits)
|
- [vits](https://github.com/jaywalnut310/vits)
|
||||||
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
### 事前学習モデル
|
### 事前学習モデル
|
||||||
|
|
||||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||||
|
|
||||||
### 推論用テキストフロントエンド
|
### 推論用テキストフロントエンド
|
||||||
|
|
||||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||||
|
|
||||||
### WebUI ツール
|
### WebUI ツール
|
||||||
|
|
||||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||||
|
@ -70,7 +70,7 @@ bash install.sh
|
|||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
|||||||
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
|
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
|
||||||
|
|
||||||
##### MacOS 사용자
|
##### MacOS 사용자
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install ffmpeg
|
brew install ffmpeg
|
||||||
```
|
```
|
||||||
@ -106,6 +107,7 @@ brew install ffmpeg
|
|||||||
#### 의존성 설치
|
#### 의존성 설치
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -195,6 +197,7 @@ V1으로 전환하려면,
|
|||||||
```bash
|
```bash
|
||||||
python webui.py v1 <언어(옵션)>
|
python webui.py v1 <언어(옵션)>
|
||||||
```
|
```
|
||||||
|
|
||||||
또는 WebUI에서 수동으로 버전을 전환하십시오.
|
또는 WebUI에서 수동으로 버전을 전환하십시오.
|
||||||
|
|
||||||
### 미세 조정
|
### 미세 조정
|
||||||
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
|
|||||||
```bash
|
```bash
|
||||||
python GPT_SoVITS/inference_webui.py <언어(옵션)>
|
python GPT_SoVITS/inference_webui.py <언어(옵션)>
|
||||||
```
|
```
|
||||||
|
|
||||||
또는
|
또는
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python webui.py
|
python webui.py
|
||||||
```
|
```
|
||||||
|
|
||||||
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
|
||||||
|
|
||||||
## V2 릴리스 노트
|
## V2 릴리스 노트
|
||||||
@ -238,7 +243,7 @@ python webui.py
|
|||||||
|
|
||||||
4. 저품질 참조 오디오에 대한 합성 품질 향상
|
4. 저품질 참조 오디오에 대한 합성 품질 향상
|
||||||
|
|
||||||
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1 환경에서 V2를 사용하려면:
|
V1 환경에서 V2를 사용하려면:
|
||||||
|
|
||||||
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
|
|||||||
|
|
||||||
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
|
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
|
||||||
|
|
||||||
[자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
v2 환경에서 v3 사용하기:
|
v2 환경에서 v3 사용하기:
|
||||||
|
|
||||||
@ -270,7 +275,6 @@ v2 환경에서 v3 사용하기:
|
|||||||
|
|
||||||
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
|
||||||
|
|
||||||
|
|
||||||
## 할 일 목록
|
## 할 일 목록
|
||||||
|
|
||||||
- [x] **최우선순위:**
|
- [x] **최우선순위:**
|
||||||
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
|
|||||||
- [ ] 모델 블렌딩.
|
- [ ] 모델 블렌딩.
|
||||||
|
|
||||||
## (추가적인) 명령줄에서 실행하는 방법
|
## (추가적인) 명령줄에서 실행하는 방법
|
||||||
|
|
||||||
명령줄을 사용하여 UVR5용 WebUI 열기
|
명령줄을 사용하여 UVR5용 WebUI 열기
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
|
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
|
||||||
```
|
```
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||||
``` -->
|
``` -->
|
||||||
|
|
||||||
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
|
||||||
|
|
||||||
```
|
```
|
||||||
python audio_slicer.py \
|
python audio_slicer.py \
|
||||||
--input_path "<path_to_original_audio_file_or_directory>" \
|
--input_path "<path_to_original_audio_file_or_directory>" \
|
||||||
@ -311,16 +320,21 @@ python audio_slicer.py \
|
|||||||
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
--min_interval <shortest_time_gap_between_adjacent_subclips>
|
||||||
--hop_size <step_size_for_computing_volume_curve>
|
--hop_size <step_size_for_computing_volume_curve>
|
||||||
```
|
```
|
||||||
|
|
||||||
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/asr/funasr_asr.py -i <input> -o <output>
|
python tools/asr/funasr_asr.py -i <input> -o <output>
|
||||||
```
|
```
|
||||||
|
|
||||||
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
|
||||||
|
|
||||||
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
|
||||||
|
|
||||||
```
|
```
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
|
||||||
```
|
```
|
||||||
|
|
||||||
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
사용자 정의 목록 저장 경로가 활성화되었습니다.
|
||||||
|
|
||||||
## 감사의 말
|
## 감사의 말
|
||||||
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
|
||||||
|
|
||||||
### 이론 연구
|
### 이론 연구
|
||||||
|
|
||||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||||
- [vits](https://github.com/jaywalnut310/vits)
|
- [vits](https://github.com/jaywalnut310/vits)
|
||||||
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
|
|||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
### 사전 학습 모델
|
### 사전 학습 모델
|
||||||
|
|
||||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||||
|
|
||||||
### 추론용 텍스트 프론트엔드
|
### 추론용 텍스트 프론트엔드
|
||||||
|
|
||||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||||
|
|
||||||
### WebUI 도구
|
### WebUI 도구
|
||||||
|
|
||||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||||
|
@ -72,7 +72,7 @@ bash install.sh
|
|||||||
```bash
|
```bash
|
||||||
conda create -n GPTSoVits python=3.9
|
conda create -n GPTSoVits python=3.9
|
||||||
conda activate GPTSoVits
|
conda activate GPTSoVits
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
|
|||||||
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
|
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
|
||||||
|
|
||||||
##### MacOS Kullanıcıları
|
##### MacOS Kullanıcıları
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install ffmpeg
|
brew install ffmpeg
|
||||||
```
|
```
|
||||||
@ -106,6 +107,7 @@ brew install ffmpeg
|
|||||||
#### Bağımlılıkları Yükleme
|
#### Bağımlılıkları Yükleme
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
|
|||||||
```bash
|
```bash
|
||||||
python webui.py v1 <dil(isteğe bağlı)>
|
python webui.py v1 <dil(isteğe bağlı)>
|
||||||
```
|
```
|
||||||
|
|
||||||
veya WebUI'de manuel olarak sürüm değiştirin.
|
veya WebUI'de manuel olarak sürüm değiştirin.
|
||||||
|
|
||||||
### İnce Ayar
|
### İnce Ayar
|
||||||
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
|
|||||||
```bash
|
```bash
|
||||||
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
|
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
|
||||||
```
|
```
|
||||||
|
|
||||||
VEYA
|
VEYA
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python webui.py
|
python webui.py
|
||||||
```
|
```
|
||||||
|
|
||||||
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
|
||||||
|
|
||||||
## V2 Sürüm Notları
|
## V2 Sürüm Notları
|
||||||
@ -235,7 +240,7 @@ Yeni Özellikler:
|
|||||||
|
|
||||||
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
|
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
|
||||||
|
|
||||||
[detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
V1 ortamından V2'yi kullanmak için:
|
V1 ortamından V2'yi kullanmak için:
|
||||||
|
|
||||||
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
|
|||||||
|
|
||||||
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
|
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
|
||||||
|
|
||||||
[daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
|
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
|
||||||
|
|
||||||
### v2 ortamında v3 kullanımı:
|
### v2 ortamında v3 kullanımı:
|
||||||
|
|
||||||
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
|
|||||||
- [ ] model karışımı
|
- [ ] model karışımı
|
||||||
|
|
||||||
## (Ekstra) Komut satırından çalıştırma yöntemi
|
## (Ekstra) Komut satırından çalıştırma yöntemi
|
||||||
|
|
||||||
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
|
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
|
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
|
||||||
```
|
```
|
||||||
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
|
||||||
``` -->
|
``` -->
|
||||||
|
|
||||||
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
|
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
|
||||||
|
|
||||||
```
|
```
|
||||||
python audio_slicer.py \
|
python audio_slicer.py \
|
||||||
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
|
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
|
||||||
@ -306,16 +316,21 @@ python audio_slicer.py \
|
|||||||
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
|
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
|
||||||
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
|
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
|
||||||
```
|
```
|
||||||
|
|
||||||
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
|
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
|
||||||
|
|
||||||
```
|
```
|
||||||
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
|
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
|
||||||
```
|
```
|
||||||
|
|
||||||
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
|
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
|
||||||
|
|
||||||
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
|
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
|
||||||
|
|
||||||
```
|
```
|
||||||
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
||||||
```
|
```
|
||||||
|
|
||||||
Özel bir liste kaydetme yolu etkinleştirildi
|
Özel bir liste kaydetme yolu etkinleştirildi
|
||||||
|
|
||||||
## Katkı Verenler
|
## Katkı Verenler
|
||||||
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
|||||||
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
|
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
|
||||||
|
|
||||||
### Teorik Araştırma
|
### Teorik Araştırma
|
||||||
|
|
||||||
- [ar-vits](https://github.com/innnky/ar-vits)
|
- [ar-vits](https://github.com/innnky/ar-vits)
|
||||||
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
|
||||||
- [vits](https://github.com/jaywalnut310/vits)
|
- [vits](https://github.com/jaywalnut310/vits)
|
||||||
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
|
|||||||
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
|
||||||
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
|
||||||
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
|
||||||
|
|
||||||
### Önceden Eğitilmiş Modeller
|
### Önceden Eğitilmiş Modeller
|
||||||
|
|
||||||
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
|
||||||
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
|
||||||
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
|
||||||
|
|
||||||
### Tahmin İçin Metin Ön Ucu
|
### Tahmin İçin Metin Ön Ucu
|
||||||
|
|
||||||
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
|
||||||
- [split-lang](https://github.com/DoodleBears/split-lang)
|
- [split-lang](https://github.com/DoodleBears/split-lang)
|
||||||
- [g2pW](https://github.com/GitYCC/g2pW)
|
- [g2pW](https://github.com/GitYCC/g2pW)
|
||||||
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
|
||||||
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
|
||||||
|
|
||||||
### WebUI Araçları
|
### WebUI Araçları
|
||||||
|
|
||||||
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
|
||||||
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
- [audio-slicer](https://github.com/openvpi/audio-slicer)
|
||||||
- [SubFix](https://github.com/cronrpc/SubFix)
|
- [SubFix](https://github.com/cronrpc/SubFix)
|
||||||
|
1
extra-req.txt
Normal file
1
extra-req.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
faster-whisper
|
@ -27,7 +27,8 @@
|
|||||||
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
|
||||||
"%cd GPT-SoVITS\n",
|
"%cd GPT-SoVITS\n",
|
||||||
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
|
||||||
"!pip install -r requirements.txt"
|
"!pip install -r requirements.txt\n",
|
||||||
|
"!pip install -r extra-req.txt --no-deps"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
49
install.sh
49
install.sh
@ -1,15 +1,17 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
# 安装构建工具
|
# 安装构建工具
|
||||||
# Install build tools
|
# Install build tools
|
||||||
echo "Installing GCC..."
|
echo "Installing GCC..."
|
||||||
conda install -c conda-forge gcc=14
|
conda install -c conda-forge gcc=14 -y
|
||||||
|
|
||||||
echo "Installing G++..."
|
echo "Installing G++..."
|
||||||
conda install -c conda-forge gxx
|
conda install -c conda-forge gxx -y
|
||||||
|
|
||||||
echo "Installing ffmpeg and cmake..."
|
echo "Installing ffmpeg and cmake..."
|
||||||
conda install ffmpeg cmake
|
conda install ffmpeg cmake -y
|
||||||
|
|
||||||
# 设置编译环境
|
# 设置编译环境
|
||||||
# Set up build environment
|
# Set up build environment
|
||||||
@ -26,7 +28,6 @@ else
|
|||||||
USE_CUDA=false
|
USE_CUDA=false
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
if [ "$USE_CUDA" = false ]; then
|
if [ "$USE_CUDA" = false ]; then
|
||||||
echo "Checking for ROCm installation..."
|
echo "Checking for ROCm installation..."
|
||||||
if [ -d "/opt/rocm" ]; then
|
if [ -d "/opt/rocm" ]; then
|
||||||
@ -56,21 +57,53 @@ else
|
|||||||
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
|
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
echo "Installing Python dependencies from requirements.txt..."
|
echo "Installing Python dependencies from requirements.txt..."
|
||||||
|
|
||||||
# 刷新环境
|
# 刷新环境
|
||||||
# Refresh environment
|
# Refresh environment
|
||||||
hash -r
|
hash -r
|
||||||
|
|
||||||
|
# pyopenjtalk Installation
|
||||||
|
conda install jq -y
|
||||||
|
|
||||||
|
OS_TYPE=$(uname)
|
||||||
|
|
||||||
|
PACKAGE_NAME="pyopenjtalk"
|
||||||
|
|
||||||
|
VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
|
||||||
|
|
||||||
|
wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
|
||||||
|
|
||||||
|
TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
|
||||||
|
DIR_NAME="${TAR_FILE%.tar.gz}"
|
||||||
|
|
||||||
|
tar -xzf "$TAR_FILE"
|
||||||
|
rm "$TAR_FILE"
|
||||||
|
|
||||||
|
CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
|
||||||
|
|
||||||
|
if [[ "$OS_TYPE" == "darwin"* ]]; then
|
||||||
|
sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
|
||||||
|
else
|
||||||
|
sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
tar -czf "$TAR_FILE" "$DIR_NAME"
|
||||||
|
|
||||||
|
pip install "$TAR_FILE"
|
||||||
|
|
||||||
|
rm -rf "$TAR_FILE" "$DIR_NAME"
|
||||||
|
|
||||||
|
pip install -r extra-req.txt --no-deps
|
||||||
|
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
|
||||||
echo "Update to WSL compatible runtime lib..."
|
echo "Update to WSL compatible runtime lib..."
|
||||||
location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
|
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
|
||||||
cd ${location}/torch/lib/
|
cd "${location}"/torch/lib/ || exit
|
||||||
rm libhsa-runtime64.so*
|
rm libhsa-runtime64.so*
|
||||||
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
|
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Installation completed successfully!"
|
echo "Installation completed successfully!"
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ scipy
|
|||||||
tensorboard
|
tensorboard
|
||||||
librosa==0.9.2
|
librosa==0.9.2
|
||||||
numba==0.56.4
|
numba==0.56.4
|
||||||
pytorch-lightning
|
pytorch-lightning>2.0
|
||||||
gradio>=4.0,<=4.24.0
|
gradio>=4.0,<=4.24.0
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
onnxruntime; sys_platform == 'darwin'
|
onnxruntime; sys_platform == 'darwin'
|
||||||
@ -26,7 +26,6 @@ jieba_fast
|
|||||||
jieba
|
jieba
|
||||||
split-lang
|
split-lang
|
||||||
fast_langdetect>=0.3.0
|
fast_langdetect>=0.3.0
|
||||||
Faster_Whisper
|
|
||||||
wordsegment
|
wordsegment
|
||||||
rotary_embedding_torch
|
rotary_embedding_torch
|
||||||
ToJyutping
|
ToJyutping
|
||||||
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
|
|||||||
fastapi<0.112.2
|
fastapi<0.112.2
|
||||||
x_transformers
|
x_transformers
|
||||||
torchmetrics<=1.5
|
torchmetrics<=1.5
|
||||||
attrdict
|
pydantic<=2.10.6
|
||||||
|
ctranslate2>=4.0,<5
|
||||||
|
huggingface_hub>=0.13
|
||||||
|
tokenizers>=0.13,<1
|
||||||
|
av>=11
|
||||||
|
tqdm
|
||||||
|
Loading…
x
Reference in New Issue
Block a user