Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-10-16 05:36:34 +08:00)

commit b0a85e2852: Merge 495ef66177731ae1dfc81e2a1cad1758219d7af0 into 11aa78bd9bda8b53047cfcae03abf7ca94d27391
requirements.txt

@@ -16,7 +16,7 @@ pypinyin
 pyopenjtalk>=0.4.1
 g2p_en
 torchaudio
-modelscope==1.10.0
+modelscope
 sentencepiece
 transformers>=4.43,<=4.50
 peft
@@ -39,7 +39,5 @@ x_transformers
 torchmetrics<=1.5
 pydantic<=2.10.6
 ctranslate2>=4.0,<5
-huggingface_hub>=0.13
-tokenizers>=0.13,<1
 av>=11
 tqdm
tools/asr/config.py

@@ -1,34 +1,13 @@
-import os
-
-
-def check_fw_local_models():
-    """
-    启动时检查本地是否有 Faster Whisper 模型.
-    """
-    model_size_list = [
-        "medium",
-        "medium.en",
-        "distil-large-v2",
-        "distil-large-v3",
-        "large-v1",
-        "large-v2",
-        "large-v3",
-    ]
-    for i, size in enumerate(model_size_list):
-        if os.path.exists(f"tools/asr/models/faster-whisper-{size}"):
-            model_size_list[i] = size + "-local"
-    return model_size_list
-
-
 def get_models():
     model_size_list = [
         "medium",
         "medium.en",
-        "distil-large-v2",
-        "distil-large-v3",
-        "large-v1",
         "large-v2",
         "large-v3",
+        "large-v3-turbo",
+        "distil-large-v2",
+        "distil-large-v3",
+        "distil-large-v3.5",
     ]
     return model_size_list
 
@@ -36,7 +15,7 @@ def get_models():
 asr_dict = {
     "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
     "Faster Whisper (多语种)": {
-        "lang": ["auto", "zh", "en", "ja", "ko", "yue"],
+        "lang": ["auto", "en", "ja", "ko"],
         "size": get_models(),
         "path": "fasterwhisper_asr.py",
         "precision": ["float32", "float16", "int8"],
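For orientation, a minimal usage sketch, not part of the commit, showing how the revised get_models() list feeds the "Faster Whisper (多语种)" entry above; the printed values simply restate what this diff defines:

# Hypothetical check against the patched tools/asr/config.py shown above.
from tools.asr.config import asr_dict, get_models

sizes = get_models()
assert "large-v3-turbo" in sizes and "distil-large-v3.5" in sizes  # newly listed sizes
assert "large-v1" not in sizes                                     # no longer offered

fw = asr_dict["Faster Whisper (多语种)"]
print(fw["lang"])       # ["auto", "en", "ja", "ko"]
print(fw["precision"])  # ["float32", "float16", "int8"]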
tools/asr/fasterwhisper_asr.py

@@ -1,12 +1,12 @@
 import argparse
 import os
-import time
 import traceback
 
+import requests
 import torch
 from faster_whisper import WhisperModel
-from huggingface_hub import snapshot_download
-from huggingface_hub.errors import LocalEntryNotFoundError
+from huggingface_hub import snapshot_download as snapshot_download_hf
+from modelscope import snapshot_download as snapshot_download_ms
 from tqdm import tqdm
 
 from tools.asr.config import get_models
@@ -40,11 +40,35 @@ language_code_list = [
 
 
 def download_model(model_size: str):
-    if "distil" in model_size:
-        repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
+    url = "https://huggingface.co/api/models/gpt2"
+    try:
+        requests.get(url, timeout=3)
+        source = "HF"
+    except Exception:
+        source = "ModelScope"
+
+    model_path = ""
+    if source == "HF":
+        if "distil" in model_size:
+            if "3.5" in model_size:
+                repo_id = "distil-whisper/distil-large-v3.5-ct2"
+                model_path = "tools/asr/models/faster-whisper-distil-large-v3.5"
+            else:
+                repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
+        elif model_size == "large-v3-turbo":
+            repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
+            model_path = "tools/asr/models/faster-whisper-large-v3-turbo"
+        else:
+            repo_id = f"Systran/faster-whisper-{model_size}"
+        model_path = (
+            model_path
+            or f"tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}".replace(
+                "distil-whisper", "whisper-distil"
+            )
+        )
     else:
-        repo_id = f"Systran/faster-whisper-{model_size}"
-        model_path = f"tools/asr/models/{repo_id.strip('Systran/')}"
+        repo_id = "XXXXRT/faster-whisper"
+        model_path = f"tools/asr/models/faster-whisper-{model_size}".replace("distil-whisper", "whisper-distil")
 
     files: list[str] = [
         "config.json",
@@ -58,26 +82,24 @@ def download_model(model_size: str):
 
         files.remove("vocabulary.txt")
 
-    for attempt in range(2):
-        try:
-            snapshot_download(
-                repo_id=repo_id,
-                allow_patterns=files,
-                local_dir=model_path,
-            )
-            break
-        except LocalEntryNotFoundError:
-            if attempt < 1:
-                time.sleep(2)
-            else:
-                print("[ERROR] LocalEntryNotFoundError and no fallback.")
-                traceback.print_exc()
-                exit(1)
-        except Exception as e:
-            print(f"[ERROR] Unexpected error on attempt {attempt + 1}: {e}")
-            traceback.print_exc()
-            exit(1)
+    if source == "ModelScope":
+        files = [f"faster-whisper-{model_size}/{file}".replace("whisper-distil", "distil-whisper") for file in files]
 
+    if source == "HF":
+        print(f"Downloading model from HuggingFace: {repo_id} to {model_path}")
+        snapshot_download_hf(
+            repo_id,
+            local_dir=model_path,
+            local_dir_use_symlinks=False,
+            allow_patterns=files,
+        )
+    else:
+        print(f"Downloading model from ModelScope: {repo_id} to {model_path}")
+        snapshot_download_ms(
+            repo_id,
+            local_dir=model_path,
+            allow_patterns=files,
+        )
     return model_path
 
 
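A hedged usage sketch, not from the commit, of the reworked download flow above: a 3-second probe of the Hugging Face API decides the source, and the matching snapshot_download variant fetches the CTranslate2 files into tools/asr/models/. The call below assumes the patched module imports cleanly:

# Illustrative only; repo IDs and target paths follow the branches in this diff.
from tools.asr.fasterwhisper_asr import download_model

# If https://huggingface.co/api/models/gpt2 answers within 3 s, the HF branch runs
# (e.g. distil-whisper/distil-large-v3.5-ct2); otherwise the ModelScope mirror repo
# "XXXXRT/faster-whisper" is used with per-model subfolder patterns.
local_path = download_model("distil-large-v3.5")
print(local_path)  # expected: tools/asr/models/faster-whisper-distil-large-v3.5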
@@ -106,7 +128,7 @@ def execute_asr(input_folder, output_folder, model_path, language, precision):
            )
            text = ""

-           if info.language == "zh":
+           if info.language in ["zh", "yue"]:
                print("检测为中文文本, 转 FunASR 处理")
                text = only_asr(file_path, language=info.language.lower())

tools/asr/funasr_asr.py

@@ -4,9 +4,8 @@ import argparse
 import os
 import traceback
 
-# from funasr.utils import version_checker
-# version_checker.check_for_update = lambda: None
 from funasr import AutoModel
+from modelscope import snapshot_download
 from tqdm import tqdm
 
 funasr_models = {}  # 存储模型避免重复加载
@@ -16,40 +15,43 @@ def only_asr(input_file, language):
     try:
         model = create_model(language)
         text = model.generate(input=input_file)[0]["text"]
-    except:
+    except Exception:
         text = ""
         print(traceback.format_exc())
     return text
 
 
 def create_model(language="zh"):
-    path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-    path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-    path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-    path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-    vad_model_revision = punc_model_revision = "v2.0.4"
     if language == "zh":
+        path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+        path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
         path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-        path_asr = (
-            path_asr
-            if os.path.exists(path_asr)
-            else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+        snapshot_download(
+            "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+            local_dir="tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+        )
+        snapshot_download(
+            "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+            local_dir="tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+        )
+        snapshot_download(
+            "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+            local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
         )
         model_revision = "v2.0.4"
     elif language == "yue":
         path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
-        path_asr = (
-            path_asr
-            if os.path.exists(path_asr)
-            else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
+        snapshot_download(
+            "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
+            local_dir="tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
         )
-        model_revision = "master"
         path_vad = path_punc = None
-        vad_model_revision = punc_model_revision = None
-        ###友情提示:粤语带VAD识别可能会有少量shape不对报错的,但是不带VAD可以.不带vad只能分阶段单独加标点。不过标点模型对粤语效果真的不行…
+        vad_model_revision = punc_model_revision = ""
+        model_revision = "master"
     else:
-        raise ValueError("FunASR 不支持该语言" + ": " + language)
+        raise ValueError(f"{language} is not supported")
 
+    vad_model_revision = punc_model_revision = "v2.0.4"
 
     if language in funasr_models:
         return funasr_models[language]
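As a rough guide, and as an assumption rather than part of the commit, the new "zh" branch above no longer probes for local copies; it asks ModelScope to materialize the VAD, punctuation, and Paraformer models under tools/asr/models/ before AutoModel loads them. A condensed equivalent:

# Sketch of the new download-then-load behaviour for language="zh"; paths mirror the diff.
from modelscope import snapshot_download

for repo in (
    "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
    "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
):
    # local_dir matches the repo basename, as in create_model(); files already
    # present are expected to be reused rather than re-downloaded.
    snapshot_download(repo, local_dir=f"tools/asr/models/{repo.split('/', 1)[1]}")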
@@ -83,7 +85,7 @@ def execute_asr(input_folder, output_folder, model_size, language):
            file_path = os.path.join(input_folder, file_name)
            text = model.generate(input=file_path)[0]["text"]
            output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
-       except:
+       except Exception:
            print(traceback.format_exc())

    output_folder = output_folder or "output/asr_opt"
tools/i18n/locale/en_US.json

@@ -38,7 +38,7 @@
     "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
     "max:归一化后最大值多少": "Loudness multiplier after normalized",
     "max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
-    "min_interval:最短切割间隔": "Minumum interval for audio cutting",
+    "min_interval:最短切割间隔": "Minimum interval for audio cutting",
     "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: the minimum length of each segment. If the first segment is too short, it will be concatenated with the next segment until it exceeds this value",
     "temperature": "temperature",
     "threshold:音量小于这个值视作静音的备选切割点": "Noise gate threshold (loudness below this value will be treated as noise",
@@ -176,7 +176,7 @@
     "语音降噪": "Speech Denoising",
     "请上传3~10秒内参考音频,超过会报错!": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
     "请上传参考音频": "Please Upload the Reference Audio",
-    "请填入推理文本": "Please Fill in the Terget Text",
+    "请填入推理文本": "Please Fill in the Target Text",
     "请填入正确的List路径": "Please Fill in the Correct List Path",
     "请填入正确的音频文件夹路径": "Please Fill in the Correct Audio Folder Path",
     "请输入有效文本": "Please enter valid text.",
webui.py

@@ -86,7 +86,6 @@ from config import (
 from tools import my_utils
 from tools.my_utils import check_details, check_for_existance
 
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 
 # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # 当遇到mps不支持的步骤时使用cpu
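Note, as an aside rather than part of the commit: the mirror endpoint is no longer forced in webui.py. Users who still want it can set HF_ENDPOINT in the environment before start-up; huggingface_hub, which the new Faster Whisper HF download path relies on, reads this variable. A minimal launcher-style sketch, with the wrapper itself being hypothetical:

# Hypothetical wrapper: restore the mirror without editing webui.py.
import os
import runpy

os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")
runpy.run_path("webui.py", run_name="__main__")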
@@ -1980,4 +1979,3 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
         server_port=webui_port_main,
         # quiet=True,
     )
-