From 84e902eaa8ec5739230f9e9040f1b79249556751 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Sun, 5 Oct 2025 12:35:01 +0100
Subject: [PATCH 1/5] Add ModelScope Snapshot Download For ASR

---
 requirements.txt               |  4 +-
 tools/asr/config.py            | 31 +++-----------
 tools/asr/fasterwhisper_asr.py | 74 ++++++++++++++++++++++------------
 3 files changed, 54 insertions(+), 55 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 90e4957d..578bb87c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ pypinyin
 pyopenjtalk>=0.4.1
 g2p_en
 torchaudio
-modelscope==1.10.0
+modelscope
 sentencepiece
 transformers>=4.43,<=4.50
 peft
@@ -39,7 +39,5 @@ x_transformers
 torchmetrics<=1.5
 pydantic<=2.10.6
 ctranslate2>=4.0,<5
-huggingface_hub>=0.13
-tokenizers>=0.13,<1
 av>=11
 tqdm
diff --git a/tools/asr/config.py b/tools/asr/config.py
index 9c26a4f6..097aa643 100644
--- a/tools/asr/config.py
+++ b/tools/asr/config.py
@@ -1,34 +1,13 @@
-import os
-
-
-def check_fw_local_models():
-    """
-    启动时检查本地是否有 Faster Whisper 模型.
-    """
-    model_size_list = [
-        "medium",
-        "medium.en",
-        "distil-large-v2",
-        "distil-large-v3",
-        "large-v1",
-        "large-v2",
-        "large-v3",
-    ]
-    for i, size in enumerate(model_size_list):
-        if os.path.exists(f"tools/asr/models/faster-whisper-{size}"):
-            model_size_list[i] = size + "-local"
-    return model_size_list
-
-
 def get_models():
     model_size_list = [
         "medium",
         "medium.en",
-        "distil-large-v2",
-        "distil-large-v3",
-        "large-v1",
         "large-v2",
         "large-v3",
+        "large-v3-turbo",
+        "distil-large-v2",
+        "distil-large-v3",
+        "distil-large-v3.5",
     ]
     return model_size_list
 
@@ -36,7 +15,7 @@
 asr_dict = {
     "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
     "Faster Whisper (多语种)": {
-        "lang": ["auto", "zh", "en", "ja", "ko", "yue"],
+        "lang": ["auto", "en", "ja", "ko", "yue"],
         "size": get_models(),
         "path": "fasterwhisper_asr.py",
         "precision": ["float32", "float16", "int8"],
diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py
index a2ebe975..1f98b840 100644
--- a/tools/asr/fasterwhisper_asr.py
+++ b/tools/asr/fasterwhisper_asr.py
@@ -1,12 +1,12 @@
 import argparse
 import os
-import time
 import traceback
 
+import requests
 import torch
 from faster_whisper import WhisperModel
-from huggingface_hub import snapshot_download
-from huggingface_hub.errors import LocalEntryNotFoundError
+from huggingface_hub import snapshot_download as snapshot_download_hf
+from modelscope import snapshot_download as snapshot_download_ms
 from tqdm import tqdm
 
 from tools.asr.config import get_models
@@ -40,11 +40,35 @@ language_code_list = [
 
 
 def download_model(model_size: str):
-    if "distil" in model_size:
-        repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
+    url = "https://huggingface.co/api/models/gpt2"
+    try:
+        requests.get(url, timeout=3)
+        source = "HF"
+    except Exception:
+        source = "ModelScope"
+
+    model_path = ""
+    if source == "HF":
+        if "distil" in model_size:
+            if "3.5" in model_size:
+                repo_id = "distil-whisper/distil-large-v3.5-ct2"
+                model_path = "tools/asr/models/faster-whisper-distil-large-v3.5"
+            else:
+                repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
+        elif model_size == "large-v3-turbo":
+            repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
+            model_path = "tools/asr/models/faster-whisper-large-v3-turbo"
+        else:
+            repo_id = f"Systran/faster-whisper-{model_size}"
+        model_path = (
+            model_path
+            or f"tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}".replace(
+                "distil-whisper", "whisper-distil"
+            )
+        )
     else:
-        repo_id = f"Systran/faster-whisper-{model_size}"
-    model_path = f"tools/asr/models/{repo_id.strip('Systran/')}"
+        repo_id = "XXXXRT/faster-whisper"
+        model_path = f"tools/asr/models/faster-whisper-{model_size}".replace("distil-whisper", "whisper-distil")
 
     files: list[str] = [
         "config.json",
@@ -58,26 +82,24 @@
 
         files.remove("vocabulary.txt")
 
-    for attempt in range(2):
-        try:
-            snapshot_download(
-                repo_id=repo_id,
-                allow_patterns=files,
-                local_dir=model_path,
-            )
-            break
-        except LocalEntryNotFoundError:
-            if attempt < 1:
-                time.sleep(2)
-            else:
-                print("[ERROR] LocalEntryNotFoundError and no fallback.")
-                traceback.print_exc()
-                exit(1)
-        except Exception as e:
-            print(f"[ERROR] Unexpected error on attempt {attempt + 1}: {e}")
-            traceback.print_exc()
-            exit(1)
+    if source == "ModelScope":
+        files = [f"faster-whisper-{model_size}/{file}".replace("whisper-distil", "distil-whisper") for file in files]
+    if source == "HF":
+        print(f"Downloading model from HuggingFace: {repo_id} to {model_path}")
+        snapshot_download_hf(
+            repo_id,
+            local_dir=model_path,
+            local_dir_use_symlinks=False,
+            allow_patterns=files,
+        )
+    else:
+        print(f"Downloading model from ModelScope: {repo_id} to {model_path}")
+        snapshot_download_ms(
+            repo_id,
+            local_dir=model_path,
+            allow_patterns=files,
+        )
 
     return model_path

From da43eb76693a1a1fc22afe65094fe907af652c55 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Sun, 5 Oct 2025 12:42:59 +0100
Subject: [PATCH 2/5] Typo Fix

---
 tools/i18n/locale/en_US.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/i18n/locale/en_US.json b/tools/i18n/locale/en_US.json
index 24d24de4..561d3bfd 100644
--- a/tools/i18n/locale/en_US.json
+++ b/tools/i18n/locale/en_US.json
@@ -38,7 +38,7 @@
     "hop_size:怎么算音量曲线,越小精度越大计算量越高(不是精度越大效果越好)": "hop_size: FO hop size, the smaller the value, the higher the accuracy)",
     "max:归一化后最大值多少": "Loudness multiplier after normalized",
     "max_sil_kept:切完后静音最多留多长": "Maximum length for silence to be kept",
-    "min_interval:最短切割间隔": "Minumum interval for audio cutting",
+    "min_interval:最短切割间隔": "Minimum interval for audio cutting",
     "min_length:每段最小多长,如果第一段太短一直和后面段连起来直到超过这个值": "min_length: the minimum length of each segment. If the first segment is too short, it will be concatenated with the next segment until it exceeds this value",
     "temperature": "temperature",
     "threshold:音量小于这个值视作静音的备选切割点": "Noise gate threshold (loudness below this value will be treated as noise",
@@ -176,7 +176,7 @@
     "语音降噪": "Speech Denoising",
     "请上传3~10秒内参考音频,超过会报错!": "Please upload a reference audio within the 3-10 second range; if it exceeds this duration, it will raise errors.",
     "请上传参考音频": "Please Upload the Reference Audio",
-    "请填入推理文本": "Please Fill in the Terget Text",
+    "请填入推理文本": "Please Fill in the Target Text",
     "请填入正确的List路径": "Please Fill in the Correct List Path",
     "请填入正确的音频文件夹路径": "Please Fill in the Correct Audio Folder Path",
     "请输入有效文本": "Please enter valid text.",

From 93870073e89a11242bd2b9662ef1091b801b5912 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Sun, 5 Oct 2025 14:20:20 +0100
Subject: [PATCH 3/5] Remove YUE in whisper

---
 tools/asr/config.py            | 2 +-
 tools/asr/fasterwhisper_asr.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/asr/config.py b/tools/asr/config.py
index 097aa643..fdff7518 100644
--- a/tools/asr/config.py
+++ b/tools/asr/config.py
@@ -15,7 +15,7 @@
 asr_dict = {
     "达摩 ASR (中文)": {"lang": ["zh", "yue"], "size": ["large"], "path": "funasr_asr.py", "precision": ["float32"]},
     "Faster Whisper (多语种)": {
-        "lang": ["auto", "en", "ja", "ko", "yue"],
+        "lang": ["auto", "en", "ja", "ko"],
         "size": get_models(),
         "path": "fasterwhisper_asr.py",
         "precision": ["float32", "float16", "int8"],
diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py
index 1f98b840..72a4b82a 100644
--- a/tools/asr/fasterwhisper_asr.py
+++ b/tools/asr/fasterwhisper_asr.py
@@ -128,7 +128,7 @@
         )
 
         text = ""
-        if info.language == "zh":
+        if info.language in ["zh", "yue"]:
             print("检测为中文文本, 转 FunASR 处理")
             text = only_asr(file_path, language=info.language.lower())
 

From fc7c6521050fcb150b59a423b631976959282529 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Sun, 5 Oct 2025 14:21:21 +0100
Subject: [PATCH 4/5] Remove HF ENDPOINT

---
 webui.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/webui.py b/webui.py
index cf5d8a3a..beb0963a 100644
--- a/webui.py
+++ b/webui.py
@@ -86,7 +86,6 @@ from config import (
 from tools import my_utils
 from tools.my_utils import check_details, check_for_existance
 
-os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 
 # os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
@@ -1980,4 +1979,3 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
         server_port=webui_port_main,
         # quiet=True,
     )
-

From 495ef66177731ae1dfc81e2a1cad1758219d7af0 Mon Sep 17 00:00:00 2001
From: XXXXRT666 <157766680+XXXXRT666@users.noreply.github.com>
Date: Sun, 5 Oct 2025 14:37:24 +0100
Subject: [PATCH 5/5] Add FunASR Download

---
 tools/asr/funasr_asr.py | 46 ++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py
index b0ffceb0..6a5c9989 100644
--- a/tools/asr/funasr_asr.py
+++ b/tools/asr/funasr_asr.py
@@ -4,9 +4,8 @@ import argparse
 import os
 import traceback
 
-# from funasr.utils import version_checker
-# version_checker.check_for_update = lambda: None
 from funasr import AutoModel
+from modelscope import snapshot_download
 from tqdm import tqdm
 
 funasr_models = {}  # 存储模型避免重复加载
@@ -16,40 +15,43 @@
 def only_asr(input_file, language):
     try:
         model = create_model(language)
         text = model.generate(input=input_file)[0]["text"]
-    except:
+    except Exception:
         text = ""
         print(traceback.format_exc())
     return text
 
 
 def create_model(language="zh"):
-    path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-    path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-    path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-    path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-    vad_model_revision = punc_model_revision = "v2.0.4"
     if language == "zh":
+        path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+        path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
         path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-        path_asr = (
-            path_asr
-            if os.path.exists(path_asr)
-            else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+        snapshot_download(
+            "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+            local_dir="tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+        )
+        snapshot_download(
+            "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+            local_dir="tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+        )
+        snapshot_download(
+            "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+            local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
         )
         model_revision = "v2.0.4"
     elif language == "yue":
         path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
-        path_asr = (
-            path_asr
-            if os.path.exists(path_asr)
-            else "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
+        snapshot_download(
+            "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
+            local_dir="tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
        )
-        model_revision = "master"
         path_vad = path_punc = None
-        vad_model_revision = punc_model_revision = None
-        ###友情提示:粤语带VAD识别可能会有少量shape不对报错的,但是不带VAD可以.不带vad只能分阶段单独加标点。不过标点模型对粤语效果真的不行…
+        vad_model_revision = punc_model_revision = ""
+        model_revision = "master"
     else:
-        raise ValueError("FunASR 不支持该语言" + ": " + language)
+        raise ValueError(f"{language} is not supported")
+
+    vad_model_revision = punc_model_revision = "v2.0.4"
 
     if language in funasr_models:
         return funasr_models[language]
@@ -83,7 +85,7 @@ def execute_asr(input_folder, output_folder, model_size, language):
             file_path = os.path.join(input_folder, file_name)
             text = model.generate(input=file_path)[0]["text"]
             output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
-        except:
+        except Exception:
             print(traceback.format_exc())
 
     output_folder = output_folder or "output/asr_opt"
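
A minimal usage sketch of the download path introduced in PATCH 1 (not part of the patches themselves). It assumes the patched tools/asr/fasterwhisper_asr.py is importable from the repository root, and sample.wav is a placeholder audio file:

    from faster_whisper import WhisperModel

    from tools.asr.fasterwhisper_asr import download_model

    # download_model() probes https://huggingface.co/api/models/gpt2 with a 3-second timeout;
    # if the probe succeeds it pulls the CTranslate2 weights from Hugging Face, otherwise it
    # falls back to the ModelScope mirror. Either way it returns the local model directory.
    model_path = download_model("large-v3-turbo")  # -> tools/asr/models/faster-whisper-large-v3-turbo

    # The returned directory can be passed straight to faster-whisper.
    model = WhisperModel(model_path, device="cpu", compute_type="int8")
    segments, info = model.transcribe("sample.wav")  # sample.wav is a placeholder
    print(info.language, "".join(segment.text for segment in segments))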