mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2026-06-04 05:01:27 +08:00
Merge dbeacf51dc079446a509ebbb1ff89dfe557395a5 into 08d627c3338173c3229286d8787060d6559fe0f8
This commit is contained in:
commit
0da1508e9f
@ -10,7 +10,7 @@ ffmpeg-python
|
|||||||
onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"
|
onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"
|
||||||
onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64"
|
onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64"
|
||||||
tqdm
|
tqdm
|
||||||
funasr==1.0.27
|
funasr>=1.3.7
|
||||||
cn2an
|
cn2an
|
||||||
pypinyin
|
pypinyin
|
||||||
pyopenjtalk>=0.4.1
|
pyopenjtalk>=0.4.1
|
||||||
|
|||||||
@ -5,15 +5,14 @@ def get_models():
|
|||||||
"large-v2",
|
"large-v2",
|
||||||
"large-v3",
|
"large-v3",
|
||||||
"large-v3-turbo",
|
"large-v3-turbo",
|
||||||
#"distil-large-v2",
|
|
||||||
#"distil-large-v3",
|
|
||||||
#"distil-large-v3.5",
|
|
||||||
]
|
]
|
||||||
return model_size_list
|
return model_size_list
|
||||||
|
|
||||||
|
|
||||||
asr_dict = {
|
asr_dict = {
|
||||||
"达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
|
"Fun-ASR-Nano (31语种+方言, 推荐)": {"lang": ["zh", "en", "ja", "ko", "yue", "auto"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
|
||||||
|
"SenseVoice (极速, 5语种)": {"lang": ["zh", "en", "ja", "ko", "yue", "auto"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
|
||||||
|
"达摩 ASR (中文经典)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
|
||||||
"Faster Whisper (多语种)": {
|
"Faster Whisper (多语种)": {
|
||||||
"lang": ["auto", "en", "ja", "ko"],
|
"lang": ["auto", "en", "ja", "ko"],
|
||||||
"size": get_models(),
|
"size": get_models(),
|
||||||
|
|||||||
@ -11,9 +11,9 @@ from tqdm import tqdm
|
|||||||
funasr_models = {} # 存储模型避免重复加载
|
funasr_models = {} # 存储模型避免重复加载
|
||||||
|
|
||||||
|
|
||||||
def only_asr(input_file, language):
|
def only_asr(input_file, language, backend="fun-asr-nano"):
|
||||||
try:
|
try:
|
||||||
model = create_model(language)
|
model = create_model(language, backend=backend)
|
||||||
text = model.generate(input=input_file)[0]["text"]
|
text = model.generate(input=input_file)[0]["text"]
|
||||||
except Exception:
|
except Exception:
|
||||||
text = ""
|
text = ""
|
||||||
@ -21,7 +21,39 @@ def only_asr(input_file, language):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def create_model(language="zh"):
|
def create_model(language="zh", **kwargs):
|
||||||
|
backend = kwargs.get("backend", "fun-asr-nano")
|
||||||
|
|
||||||
|
# For non-classic backends, route to multilingual models regardless of language
|
||||||
|
if backend in ("fun-asr-nano", "sensevoice") and language != "yue":
|
||||||
|
import torch
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
cache_key = f"{language}_{backend}"
|
||||||
|
if cache_key in funasr_models:
|
||||||
|
return funasr_models[cache_key]
|
||||||
|
|
||||||
|
if backend == "fun-asr-nano":
|
||||||
|
model = AutoModel(
|
||||||
|
model="FunAudioLLM/Fun-ASR-Nano-2512",
|
||||||
|
trust_remote_code=True,
|
||||||
|
hub="hf",
|
||||||
|
vad_model="fsmn-vad",
|
||||||
|
device=device,
|
||||||
|
disable_update=True,
|
||||||
|
)
|
||||||
|
print(f"FunASR Fun-ASR-Nano 模型加载完成: {language.upper()}")
|
||||||
|
else:
|
||||||
|
model = AutoModel(
|
||||||
|
model="iic/SenseVoiceSmall",
|
||||||
|
vad_model="fsmn-vad",
|
||||||
|
device=device,
|
||||||
|
disable_update=True,
|
||||||
|
)
|
||||||
|
print(f"FunASR SenseVoice 模型加载完成: {language.upper()}")
|
||||||
|
|
||||||
|
funasr_models[cache_key] = model
|
||||||
|
return model
|
||||||
|
|
||||||
if language == "zh":
|
if language == "zh":
|
||||||
path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
|
path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
|
||||||
path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
|
path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
|
||||||
@ -50,7 +82,7 @@ def create_model(language="zh"):
|
|||||||
vad_model_revision = punc_model_revision = ""
|
vad_model_revision = punc_model_revision = ""
|
||||||
model_revision = "master"
|
model_revision = "master"
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"{language} is not supported")
|
raise ValueError(f"{language} is not supported. Supported: zh, yue, ja, en, ko, auto")
|
||||||
|
|
||||||
if language in funasr_models:
|
if language in funasr_models:
|
||||||
return funasr_models[language]
|
return funasr_models[language]
|
||||||
@ -69,14 +101,14 @@ def create_model(language="zh"):
|
|||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def execute_asr(input_folder, output_folder, model_size, language):
|
def execute_asr(input_folder, output_folder, model_size, language, backend="fun-asr-nano"):
|
||||||
input_file_names = os.listdir(input_folder)
|
input_file_names = os.listdir(input_folder)
|
||||||
input_file_names.sort()
|
input_file_names.sort()
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
output_file_name = os.path.basename(input_folder)
|
output_file_name = os.path.basename(input_folder)
|
||||||
|
|
||||||
model = create_model(language)
|
model = create_model(language, backend=backend)
|
||||||
|
|
||||||
for file_name in tqdm(input_file_names):
|
for file_name in tqdm(input_file_names):
|
||||||
try:
|
try:
|
||||||
@ -105,7 +137,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.")
|
parser.add_argument("-o", "--output_folder", type=str, required=True, help="Output folder to store transcriptions.")
|
||||||
parser.add_argument("-s", "--model_size", type=str, default="large", help="Model Size of FunASR is Large")
|
parser.add_argument("-s", "--model_size", type=str, default="large", help="Model Size of FunASR is Large")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-l", "--language", type=str, default="zh", choices=["zh", "yue", "auto"], help="Language of the audio files."
|
"-l", "--language", type=str, default="zh", choices=["zh", "yue", "ja", "en", "ko", "auto"], help="Language of the audio files."
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32"
|
"-p", "--precision", type=str, default="float16", choices=["float16", "float32"], help="fp16 or fp32"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user