diff --git a/tools/asr/config.py b/tools/asr/config.py deleted file mode 100644 index 8fe68388..00000000 --- a/tools/asr/config.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -def check_fw_local_models(): - ''' - 启动时检查本地是否有 Faster Whisper 模型. - ''' - model_size_list = [ - "tiny", "tiny.en", - "base", "base.en", - "small", "small.en", - "medium", "medium.en", - "large", "large-v1", - "large-v2", "large-v3"] - for i, size in enumerate(model_size_list): - if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): - model_size_list[i] = size + '-local' - return model_size_list - -asr_dict = { - "达摩 ASR (中文)": { - 'lang': ['zh'], - 'size': ['large'], - 'path': 'funasr_asr.py', - }, - "Faster Whisper (多语种)": { - 'lang': ['auto', 'zh', 'en', 'ja'], - 'size': check_fw_local_models(), - 'path': 'fasterwhisper_asr.py' - } -} - diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py deleted file mode 100644 index e9fc6a47..00000000 --- a/tools/asr/fasterwhisper_asr.py +++ /dev/null @@ -1,114 +0,0 @@ -import argparse -import os -import traceback - -os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" -os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" - -import torch -from faster_whisper import WhisperModel -from tqdm import tqdm - -from tools.asr.config import check_fw_local_models - -language_code_list = [ - "af", "am", "ar", "as", "az", - "ba", "be", "bg", "bn", "bo", - "br", "bs", "ca", "cs", "cy", - "da", "de", "el", "en", "es", - "et", "eu", "fa", "fi", "fo", - "fr", "gl", "gu", "ha", "haw", - "he", "hi", "hr", "ht", "hu", - "hy", "id", "is", "it", "ja", - "jw", "ka", "kk", "km", "kn", - "ko", "la", "lb", "ln", "lo", - "lt", "lv", "mg", "mi", "mk", - "ml", "mn", "mr", "ms", "mt", - "my", "ne", "nl", "nn", "no", - "oc", "pa", "pl", "ps", "pt", - "ro", "ru", "sa", "sd", "si", - "sk", "sl", "sn", "so", "sq", - "sr", "su", "sv", "sw", "ta", - "te", "tg", "th", "tk", "tl", - "tr", "tt", "uk", "ur", "uz", - "vi", "yi", "yo", "zh", "yue", - "auto"] - -def 
execute_asr(input_folder, output_folder, model_size, language, precision): - if '-local' in model_size: - model_size = model_size[:-6] - model_path = f'tools/asr/models/faster-whisper-{model_size}' - else: - model_path = model_size - if language == 'auto': - language = None #不设置语种由模型自动输出概率最高的语种 - print("loading faster whisper model:",model_size,model_path) - device = 'cuda' if torch.cuda.is_available() else 'cpu' - try: - model = WhisperModel(model_path, device=device, compute_type=precision) - except: - return print(traceback.format_exc()) - - input_file_names = os.listdir(input_folder) - input_file_names.sort() - - output = [] - output_file_name = os.path.basename(input_folder) - - for file_name in tqdm(input_file_names): - try: - file_path = os.path.join(input_folder, file_name) - segments, info = model.transcribe( - audio = file_path, - beam_size = 5, - vad_filter = True, - vad_parameters = dict(min_silence_duration_ms=700), - language = language) - text = '' - - if info.language == "zh": - print("检测为中文文本, 转 FunASR 处理") - if("only_asr"not in globals()): - from tools.asr.funasr_asr import \ - only_asr # #如果用英文就不需要导入下载模型 - text = only_asr(file_path) - - if text == '': - for segment in segments: - text += segment.text - output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}") - except: - print(traceback.format_exc()) - - output_folder = output_folder or "output/asr_opt" - os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - - with open(output_file_path, "w", encoding="utf-8") as f: - f.write("\n".join(output)) - print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") - return output_file_path - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to 
store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large-v3', - choices=check_fw_local_models(), - help="Model Size of Faster Whisper") - parser.add_argument("-l", "--language", type=str, default='ja', - choices=language_code_list, - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32") - - cmd = parser.parse_args() - output_file_path = execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, - precision = cmd.precision, - ) diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py deleted file mode 100644 index 831da6c1..00000000 --- a/tools/asr/funasr_asr.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding:utf-8 -*- - -import argparse -import os -import traceback -from tqdm import tqdm - -from funasr import AutoModel - -path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' -path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' -path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' -path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" -path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" -path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" - -model = AutoModel( - model = path_asr, - model_revision = "v2.0.4", - vad_model = path_vad, - vad_model_revision = "v2.0.4", - punc_model = path_punc, - punc_model_revision = "v2.0.4", -) - -def only_asr(input_file): - try: - text = model.generate(input=input_file)[0]["text"] - except: - text = '' - print(traceback.format_exc()) - return text - -def execute_asr(input_folder, output_folder, model_size, language): - input_file_names 
= os.listdir(input_folder) - input_file_names.sort() - - output = [] - output_file_name = os.path.basename(input_folder) - - for file_name in tqdm(input_file_names): - try: - file_path = os.path.join(input_folder, file_name) - text = model.generate(input=file_path)[0]["text"] - output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}") - except: - print(traceback.format_exc()) - - output_folder = output_folder or "output/asr_opt" - os.makedirs(output_folder, exist_ok=True) - output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - - with open(output_file_path, "w", encoding="utf-8") as f: - f.write("\n".join(output)) - print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") - return output_file_path - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input_folder", type=str, required=True, - help="Path to the folder containing WAV files.") - parser.add_argument("-o", "--output_folder", type=str, required=True, - help="Output folder to store transcriptions.") - parser.add_argument("-s", "--model_size", type=str, default='large', - help="Model Size of FunASR is Large") - parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], - help="Language of the audio files.") - parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], - help="fp16 or fp32")#还没接入 - - cmd = parser.parse_args() - execute_asr( - input_folder = cmd.input_folder, - output_folder = cmd.output_folder, - model_size = cmd.model_size, - language = cmd.language, - ) diff --git a/tools/asr/sensevoice.py b/tools/asr/sensevoice.py new file mode 100644 index 00000000..326c9c2c --- /dev/null +++ b/tools/asr/sensevoice.py @@ -0,0 +1,71 @@ +import argparse +import os +import re +import traceback +import torch +from tqdm import tqdm +from funasr import AutoModel + +model_dir = "tools/asr/models/SenseVoiceSmall" +model_dir = model_dir if os.path.exists(model_dir) else 
"iic/SenseVoiceSmall" + +def execute_asr(input_folder, output_folder, language, device): # Transcribe every file in input_folder and write "<output_folder>/<input folder name>.list"; returns that path, or None if the model fails to load + try: + model = AutoModel(model=model_dir, + vad_model="fsmn-vad", + vad_kwargs={"max_single_segment_time": 30000}, + ) + except Exception: + return print(traceback.format_exc()) + + input_file_names = os.listdir(input_folder) + input_file_names.sort() + output = [] + output_file_name = os.path.basename(input_folder) + if device is None: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + for file_name in tqdm(input_file_names): + try: + file_path = os.path.join(input_folder, file_name) + res = model.generate( + input=file_path, + cache={}, + language=language.lower(), # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=0, + device = device + )[0]['text'] + text_language = re.search(r'<([^<>]+)>', res).group(1)[1:-1].upper() if language == 'auto' else language.upper() # with 'auto', the first <...> tag minus its first and last character is used as the language label + text = re.sub(r'<[^<>]*>', '', res).replace(' ', '') + output.append(f"{file_path}|{output_file_name}|{text_language}|{text}") + except Exception: # log and skip the failing file; not a bare except, so Ctrl-C still aborts the batch + print(traceback.format_exc()) + + output_folder = output_folder or "output/asr_opt" + os.makedirs(output_folder, exist_ok=True) + output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + + with open(output_file_path, "w", encoding="utf-8") as f: + f.write("\n".join(output)) + print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") + return output_file_path + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_folder", type=str, required=True, + help="Path to the folder containing WAV files.") + parser.add_argument("-o", "--output_folder", type=str, required=True, + help="Output folder to store transcriptions.") + parser.add_argument("-l", "--language", type=str, default='auto', + choices=['auto','zh','en','ja'], + help="Language of the audio files.") + parser.add_argument("-d", "--device", type=str, default=None, choices=['cpu','cuda'], + help="CPU or CUDA") + + cmd = parser.parse_args() 
+ output_file_path = execute_asr( + input_folder = cmd.input_folder, + output_folder = cmd.output_folder, + language = cmd.language, + device = cmd.device, + ) diff --git a/webui.py b/webui.py index a200a747..0881e73f 100644 --- a/webui.py +++ b/webui.py @@ -194,17 +194,15 @@ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path p_tts_inference=None yield i18n("TTS推理进程已关闭") -from tools.asr.config import asr_dict def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang): global p_asr if(p_asr==None): asr_inp_dir=my_utils.clean_path(asr_inp_dir) - cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' + cmd = f'"{python_exec}" tools/asr/sensevoice.py' cmd += f' -i "{asr_inp_dir}"' cmd += f' -o "{asr_opt_dir}"' - cmd += f' -s {asr_model_size}' cmd += f' -l {asr_lang}' - cmd += " -p %s"%("float16"if is_half==True else "float32") + yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} print(cmd) @@ -732,33 +730,25 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: with gr.Row(): asr_model = gr.Dropdown( label = i18n("ASR 模型"), - choices = list(asr_dict.keys()), + choices = ['SenseVoice'], interactive = True, - value="达摩 ASR (中文)" + value="SenseVoice" ) asr_size = gr.Dropdown( label = i18n("ASR 模型尺寸"), - choices = ["large"], + choices = ["small"], interactive = True, - value="large" + value="small" ) asr_lang = gr.Dropdown( label = i18n("ASR 语言设置"), - choices = ["zh"], + choices = ["auto","zh","en","ja"], interactive = True, - value="zh" + value="auto" ) with gr.Row(): asr_info = gr.Textbox(label=i18n("ASR进程输出信息")) - def change_lang_choices(key): #根据选择的模型修改可选的语言 - # return gr.Dropdown(choices=asr_dict[key]['lang']) - return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]} - def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 - # return gr.Dropdown(choices=asr_dict[key]['size']) - return {"__type__": "update", "choices": 
asr_dict[key]['size']} - asr_model.change(change_lang_choices, [asr_model], [asr_lang]) - asr_model.change(change_size_choices, [asr_model], [asr_size]) gr.Markdown(value=i18n("0d-语音文本校对标注工具")) with gr.Row():