From 2d0db0bac5a8a1f36fb848c77fdff24468b63cbc Mon Sep 17 00:00:00 2001 From: XXXXRT666 Date: Wed, 10 Jul 2024 16:10:24 +0800 Subject: [PATCH] roll back more features --- README.md | 28 ++++- docs/cn/README.md | 26 +++- docs/ja/README.md | 3 +- docs/ko/README.md | 4 +- docs/tr/README.md | 1 + requirements.txt | 1 + tools/asr/config.py | 39 ++++++ tools/asr/fasterwhisper_asr.py | 114 ++++++++++++++++++ tools/asr/funasr_asr.py | 77 ++++++++++++ .../asr/{sensevoice.py => sensevoice_asr.py} | 5 + webui.py | 99 +++++++++------ 11 files changed, 355 insertions(+), 42 deletions(-) create mode 100644 tools/asr/config.py create mode 100644 tools/asr/fasterwhisper_asr.py create mode 100644 tools/asr/funasr_asr.py rename tools/asr/{sensevoice.py => sensevoice_asr.py} (90%) diff --git a/README.md b/README.md index 79387d5c..1b61ce18 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,17 @@ Users in China region can download these two models by entering the links below - [UVR5 Weights](https://www.icloud.com.cn/iclouddrive/0bekRKDiJXboFhbfm3lM2fVbA#UVR5_Weights) -For Multilingual ASR, download models from [FunAudioLLM/SenseVoiceSmall](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main) or [iic/SenseVoiceSmall](https://modelscope.cn/models/iic/SenseVoiceSmall/files) and place them in `tools/asr/models`. +For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`. + +For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint. + +Users in China region can download this model by entering the links below + +- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/0c4pQxFs7oWyVU1iMTq2DbmLA#faster-whisper-large-v3) (clicking "Download a copy") + +- [Faster Whisper Large V3](https://hf-mirror.com/Systran/faster-whisper-large-v3) (HuggingFace mirror site) + +For SenseVoice Multilingual ASR, download models from [FunAudioLLM/SenseVoiceSmall](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main) or [iic/SenseVoiceSmall](https://modelscope.cn/models/iic/SenseVoiceSmall/files) and place them in `tools/asr/models`. ## Dataset Format @@ -208,13 +218,23 @@ python audio_slicer.py \ --min_interval --hop_size ``` -This is how dataset ASR processing is done using the command line +This is how dataset ASR processing is done using the command line(Only Chinese) +``` +python tools/asr/funasr_asr.py -i -o +``` +ASR processing is performed through Faster_Whisper(ASR marking except Chinese) + +(No progress bars, GPU performance may cause time delays) +``` +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p +``` +SenseVoice Multilingual ASR ``` python tools/asr/sensevoice.py -i -o -l -d ``` - A custom list save path is enabled + ## Credits Special thanks to the following projects and contributors: @@ -239,6 +259,8 @@ Special thanks to the following projects and contributors: - [SubFix](https://github.com/cronrpc/SubFix) - [FFmpeg](https://github.com/FFmpeg/FFmpeg) - [gradio](https://github.com/gradio-app/gradio) +- [faster-whisper](https://github.com/SYSTRAN/faster-whisper) +- [FunASR](https://github.com/alibaba-damo-academy/FunASR) - [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) ## Thanks to all contributors for their efforts diff --git a/docs/cn/README.md b/docs/cn/README.md index 642bab80..75a4ff90 100644 --- a/docs/cn/README.md +++ b/docs/cn/README.md @@ -147,7 +147,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker - [UVR5 Weights](https://www.icloud.com.cn/iclouddrive/0bekRKDiJXboFhbfm3lM2fVbA#UVR5_Weights) -对于多语言自动语音识别(附加),从 [FunAudioLLM/SenseVoiceSmall](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main) 或 [iic/SenseVoiceSmall](https://modelscope.cn/models/iic/SenseVoiceSmall/files) 下载模型,并将它们放置在 `tools/asr/models` 中。 +对于中文自动语音识别(附加),从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 中。 + +对于SenseVoice多语言自动语音识别(附加),从 [FunAudioLLM/SenseVoiceSmall](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main) 或 [iic/SenseVoiceSmall](https://modelscope.cn/models/iic/SenseVoiceSmall/files) 下载模型,并将它们放置在 `tools/asr/models` 中。 + +对于英语与日语自动语音识别(附加),从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型,并将它们放置在 `tools/asr/models` 中。 此外,[其他模型](https://huggingface.co/Systran)可能具有类似效果,但占用更小的磁盘空间。 + +中国地区用户可以通过以下链接下载: + +- [Faster Whisper Large V3](https://www.icloud.com/iclouddrive/0c4pQxFs7oWyVU1iMTq2DbmLA#faster-whisper-large-v3)(点击“下载副本”) + +- [Faster Whisper Large V3](https://hf-mirror.com/Systran/faster-whisper-large-v3)(Hugging Face镜像站) @@ -210,7 +220,17 @@ python audio_slicer.py \ --min_interval --hop_size ```` -这是使用命令行完成数据集ASR处理的方式 +这是使用命令行完成数据集ASR处理的方式(仅限中文) +```` +python tools/asr/funasr_asr.py -i -o +```` +通过Faster_Whisper进行ASR处理(除中文之外的ASR标记) + +(没有进度条,GPU性能可能会导致时间延迟) +```` +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p +```` +使用SenseVoice进行多语言ASR ```` python tools/asr/sensevoice.py -i -o -l -d ```` @@ -241,6 +261,8 @@ python tools/asr/sensevoice.py -i -o -l -d - [SubFix](https://github.com/cronrpc/SubFix) - [FFmpeg](https://github.com/FFmpeg/FFmpeg) - [gradio](https://github.com/gradio-app/gradio) +- [faster-whisper](https://github.com/SYSTRAN/faster-whisper) +- [FunASR](https://github.com/alibaba-damo-academy/FunASR) - [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) ## 感谢所有贡献者的努力 diff --git a/docs/ja/README.md b/docs/ja/README.md index d7cd8b51..9477c919 100644 --- a/docs/ja/README.md +++ b/docs/ja/README.md @@ -206,7 +206,7 @@ ASR処理はFaster_Whisperを通じて実行されます(中国語を除くASR (進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります) ``` -python ./tools/asr/fasterwhisper_asr.py -i -o -l +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` カスタムリストの保存パスが有効になっています @@ -236,6 +236,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l - [gradio](https://github.com/gradio-app/gradio) - [faster-whisper](https://github.com/SYSTRAN/faster-whisper) - [FunASR](https://github.com/alibaba-damo-academy/FunASR) +- [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) ## すべてのコントリビューターに感謝します diff --git a/docs/ko/README.md b/docs/ko/README.md index 9e2bd1ca..a0746b11 100644 --- a/docs/ko/README.md +++ b/docs/ko/README.md @@ -210,7 +210,7 @@ ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행 (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음) ``` -python ./tools/asr/fasterwhisper_asr.py -i -o -l +python ./tools/asr/fasterwhisper_asr.py -i -o -l -p ``` 사용자 정의 목록 저장 경로가 활성화되었습니다. @@ -240,7 +240,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o -l - [gradio](https://github.com/gradio-app/gradio) - [faster-whisper](https://github.com/SYSTRAN/faster-whisper) - [FunASR](https://github.com/alibaba-damo-academy/FunASR) - +- [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) ## 모든 기여자들에게 감사드립니다 ;) diff --git a/docs/tr/README.md b/docs/tr/README.md index b9da738f..1db94b7b 100644 --- a/docs/tr/README.md +++ b/docs/tr/README.md @@ -250,6 +250,7 @@ python ./tools/asr/fasterwhisper_asr.py -i -o <çıktı> -l - [gradio](https://github.com/gradio-app/gradio) - [faster-whisper](https://github.com/SYSTRAN/faster-whisper) - [FunASR](https://github.com/alibaba-damo-academy/FunASR) +- [SenseVoice](https://github.com/FunAudioLLM/SenseVoice) ## Tüm katkıda bulunanlara çabaları için teşekkürler diff --git a/requirements.txt b/requirements.txt index da600c58..5961b81f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,5 @@ psutil jieba_fast jieba LangSegment>=0.2.0 +faster_whisper wordsegment \ No newline at end of file diff --git a/tools/asr/config.py b/tools/asr/config.py new file mode 100644 index 00000000..53efdbd1 --- /dev/null +++ b/tools/asr/config.py @@ -0,0 +1,39 @@ +import os + +def check_fw_local_models(): + ''' + 启动时检查本地是否有 Faster Whisper 模型. + ''' + model_size_list = [ + "tiny", "tiny.en", + "base", "base.en", + "small", "small.en", + "medium", "medium.en", + "large", "large-v1", + "large-v2", "large-v3"] + for i, size in enumerate(model_size_list): + if os.path.exists(f'tools/asr/models/faster-whisper-{size}'): + model_size_list[i] = size + '-local' + return model_size_list + +asr_dict = { + "达摩 ASR (中文)": { + 'lang': ['zh'], + 'size': ['large'], + 'path': 'funasr_asr.py', + 'precision': 'float32' + }, + "Faster Whisper (多语种)": { + 'lang': ['auto', 'zh', 'en', 'ja'], + 'size': check_fw_local_models(), + 'path': 'fasterwhisper_asr.py', + 'precision': ['float32', 'float16', 'int8'] + }, + "Sense Voice": { + 'lang': ['auto', 'zh', 'en', 'ja'], + 'size': ['small'], + 'path': 'sensevoice_asr.py', + 'precision': 'float32' + } +} + diff --git a/tools/asr/fasterwhisper_asr.py b/tools/asr/fasterwhisper_asr.py new file mode 100644 index 00000000..da8eadfb --- /dev/null +++ b/tools/asr/fasterwhisper_asr.py @@ -0,0 +1,114 @@ +import argparse +import os +import traceback + +os.environ["HF_ENDPOINT"] = "https://hf-mirror.com" +os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" + +import torch +from faster_whisper import WhisperModel +from tqdm import tqdm + +from tools.asr.config import check_fw_local_models + +language_code_list = [ + "af", "am", "ar", "as", "az", + "ba", "be", "bg", "bn", "bo", + "br", "bs", "ca", "cs", "cy", + "da", "de", "el", "en", "es", + "et", "eu", "fa", "fi", "fo", + "fr", "gl", "gu", "ha", "haw", + "he", "hi", "hr", "ht", "hu", + "hy", "id", "is", "it", "ja", + "jw", "ka", "kk", "km", "kn", + "ko", "la", "lb", "ln", "lo", + "lt", "lv", "mg", "mi", "mk", + "ml", "mn", "mr", "ms", "mt", + "my", "ne", "nl", "nn", "no", + "oc", "pa", "pl", "ps", "pt", + "ro", "ru", "sa", "sd", "si", + "sk", "sl", "sn", "so", "sq", + "sr", "su", "sv", "sw", "ta", + "te", "tg", "th", "tk", "tl", + "tr", "tt", "uk", "ur", "uz", + "vi", "yi", "yo", "zh", "yue", + "auto"] + +def execute_asr(input_folder, output_folder, model_size, language, precision): + if '-local' in model_size: + model_size = model_size[:-6] + model_path = f'tools/asr/models/faster-whisper-{model_size}' + else: + model_path = model_size + if language == 'auto': + language = None #不设置语种由模型自动输出概率最高的语种 + print("loading faster whisper model:",model_size,model_path) + device = 'cuda' if torch.cuda.is_available() else 'cpu' + try: + model = WhisperModel(model_path, device=device, compute_type=precision) + except: + return print(traceback.format_exc()) + + input_file_names = os.listdir(input_folder) + input_file_names.sort() + + output = [] + output_file_name = os.path.basename(input_folder) + + for file_name in tqdm(input_file_names): + try: + file_path = os.path.join(input_folder, file_name) + segments, info = model.transcribe( + audio = file_path, + beam_size = 5, + vad_filter = True, + vad_parameters = dict(min_silence_duration_ms=700), + language = language) + text = '' + + if info.language == "zh": + print("检测为中文文本, 转 FunASR 处理") + if("only_asr"not in globals()): + from tools.asr.funasr_asr import \ + only_asr # #如果用英文就不需要导入下载模型 + text = only_asr(file_path) + + if text == '': + for segment in segments: + text += segment.text + output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}") + except: + print(traceback.format_exc()) + + output_folder = output_folder or "output/asr_opt" + os.makedirs(output_folder, exist_ok=True) + output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + + with open(output_file_path, "w", encoding="utf-8") as f: + f.write("\n".join(output)) + print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") + return output_file_path + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_folder", type=str, required=True, + help="Path to the folder containing WAV files.") + parser.add_argument("-o", "--output_folder", type=str, required=True, + help="Output folder to store transcriptions.") + parser.add_argument("-s", "--model_size", type=str, default='large-v3', + choices=check_fw_local_models(), + help="Model Size of Faster Whisper") + parser.add_argument("-l", "--language", type=str, default='ja', + choices=language_code_list, + help="Language of the audio files.") + parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32','int8'], + help="fp16, int8 or fp32") + + cmd = parser.parse_args() + output_file_path = execute_asr( + input_folder = cmd.input_folder, + output_folder = cmd.output_folder, + model_size = cmd.model_size, + language = cmd.language, + precision = cmd.precision, + ) diff --git a/tools/asr/funasr_asr.py b/tools/asr/funasr_asr.py new file mode 100644 index 00000000..831da6c1 --- /dev/null +++ b/tools/asr/funasr_asr.py @@ -0,0 +1,77 @@ +# -*- coding:utf-8 -*- + +import argparse +import os +import traceback +from tqdm import tqdm + +from funasr import AutoModel + +path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' +path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch' +path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' +path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" +path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch" +path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" + +model = AutoModel( + model = path_asr, + model_revision = "v2.0.4", + vad_model = path_vad, + vad_model_revision = "v2.0.4", + punc_model = path_punc, + punc_model_revision = "v2.0.4", +) + +def only_asr(input_file): + try: + text = model.generate(input=input_file)[0]["text"] + except: + text = '' + print(traceback.format_exc()) + return text + +def execute_asr(input_folder, output_folder, model_size, language): + input_file_names = os.listdir(input_folder) + input_file_names.sort() + + output = [] + output_file_name = os.path.basename(input_folder) + + for file_name in tqdm(input_file_names): + try: + file_path = os.path.join(input_folder, file_name) + text = model.generate(input=file_path)[0]["text"] + output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}") + except: + print(traceback.format_exc()) + + output_folder = output_folder or "output/asr_opt" + os.makedirs(output_folder, exist_ok=True) + output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') + + with open(output_file_path, "w", encoding="utf-8") as f: + f.write("\n".join(output)) + print(f"ASR 任务完成->标注文件路径: {output_file_path}\n") + return output_file_path + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_folder", type=str, required=True, + help="Path to the folder containing WAV files.") + parser.add_argument("-o", "--output_folder", type=str, required=True, + help="Output folder to store transcriptions.") + parser.add_argument("-s", "--model_size", type=str, default='large', + help="Model Size of FunASR is Large") + parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], + help="Language of the audio files.") + parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'], + help="fp16 or fp32")#还没接入 + + cmd = parser.parse_args() + execute_asr( + input_folder = cmd.input_folder, + output_folder = cmd.output_folder, + model_size = cmd.model_size, + language = cmd.language, + ) diff --git a/tools/asr/sensevoice.py b/tools/asr/sensevoice_asr.py similarity index 90% rename from tools/asr/sensevoice.py rename to tools/asr/sensevoice_asr.py index 66f381b6..0b095630 100644 --- a/tools/asr/sensevoice.py +++ b/tools/asr/sensevoice_asr.py @@ -63,6 +63,11 @@ if __name__ == '__main__': help="Language of the audio files.") parser.add_argument("-d", "--device", type=str, default=None, choices=['cpu','cuda'], help="CPU or CUDA") + parser.add_argument("-p", "--precision", type=str, default='float32', choices=['float32'], + help="fp16 or fp32") + parser.add_argument("-s", "--model_size", type=str, default='small', + choices=['small'], + help="Model Size of Faster Whisper") cmd = parser.parse_args() output_file_path = execute_asr( diff --git a/webui.py b/webui.py index 5baee0d8..94ef505b 100644 --- a/webui.py +++ b/webui.py @@ -194,25 +194,28 @@ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path p_tts_inference=None yield i18n("TTS推理进程已关闭") -def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang): +from tools.asr.config import asr_dict +def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang, asr_precision): global p_asr if(p_asr==None): asr_inp_dir=my_utils.clean_path(asr_inp_dir) - cmd = f'"{python_exec}" tools/asr/sensevoice.py' + cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}' cmd += f' -i "{asr_inp_dir}"' cmd += f' -o "{asr_opt_dir}"' + cmd += f' -s {asr_model_size}' cmd += f' -l {asr_lang}' + cmd += f" -p {asr_precision}" output_file_name = os.path.basename(asr_inp_dir) output_folder = asr_opt_dir or "output/asr_opt" output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list') - yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True},{"__type__":"update"} + yield "ASR任务开启:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"} print(cmd) p_asr = Popen(cmd, shell=True) p_asr.wait() p_asr=None - yield f"ASR任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False},{"__type__":"update","value":output_file_path} + yield f"ASR任务完成, 查看终端进行下一步", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__":"update","value":output_file_path} else: - yield "已有正在进行的ASR任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True},{"__type__":"update"} + yield "已有正在进行的ASR任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"} # return None def close_asr(): @@ -220,7 +223,7 @@ def close_asr(): if(p_asr!=None): kill_process(p_asr.pid) p_asr=None - return "已终止ASR进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + return "已终止ASR进程", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} def open_denoise(denoise_inp_dir, denoise_opt_dir): global p_denoise if(p_denoise==None): @@ -228,14 +231,14 @@ def open_denoise(denoise_inp_dir, denoise_opt_dir): denoise_opt_dir=my_utils.clean_path(denoise_opt_dir) cmd = '"%s" tools/cmd-denoise.py -i "%s" -o "%s" -p %s'%(python_exec,denoise_inp_dir,denoise_opt_dir,"float16"if is_half==True else "float32") - yield "语音降噪任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "语音降噪任务开启:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"} print(cmd) p_denoise = Popen(cmd, shell=True) p_denoise.wait() p_denoise=None - yield f"语音降噪任务完成, 查看终端进行下一步",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield f"语音降噪任务完成, 查看终端进行下一步", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__":"update","value":denoise_opt_dir} else: - yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "已有正在进行的语音降噪任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True}, {"__type__":"update"} # return None def close_denoise(): @@ -243,7 +246,7 @@ def close_denoise(): if(p_denoise!=None): kill_process(p_denoise.pid) p_denoise=None - return "已终止语音降噪进程",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + return "已终止语音降噪进程", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} p_train_SoVITS=None def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers1Ba,pretrained_s2G,pretrained_s2D): @@ -273,21 +276,21 @@ def open1Ba(batch_size,total_epoch,exp_name,text_low_lr_rate,if_save_latest,if_s with open(tmp_config_path,"w")as f:f.write(json.dumps(data)) cmd = '"%s" GPT_SoVITS/s2_train.py --config "%s"'%(python_exec,tmp_config_path) - yield "SoVITS训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "SoVITS训练开始:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True} print(cmd) p_train_SoVITS = Popen(cmd, shell=True) p_train_SoVITS.wait() p_train_SoVITS=None - yield "SoVITS训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "SoVITS训练完成", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} else: - yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "已有正在进行的SoVITS训练任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True} def close1Ba(): global p_train_SoVITS if(p_train_SoVITS!=None): kill_process(p_train_SoVITS.pid) p_train_SoVITS=None - return "已终止SoVITS训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + return "已终止SoVITS训练", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} p_train_GPT=None def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_weights,save_every_epoch,gpu_numbers,pretrained_s1): @@ -320,21 +323,21 @@ def open1Bb(batch_size,total_epoch,exp_name,if_dpo,if_save_latest,if_save_every_ with open(tmp_config_path, "w") as f:f.write(yaml.dump(data, default_flow_style=False)) # cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" --train_semantic_path "%s/6-name2semantic.tsv" --train_phoneme_path "%s/2-name2text.txt" --output_dir "%s/logs_s1"'%(python_exec,tmp_config_path,s1_dir,s1_dir,s1_dir) cmd = '"%s" GPT_SoVITS/s1_train.py --config_file "%s" '%(python_exec,tmp_config_path) - yield "GPT训练开始:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "GPT训练开始:%s"%cmd, {"__type__":"update","visible":False}, {"__type__":"update","visible":True} print(cmd) p_train_GPT = Popen(cmd, shell=True) p_train_GPT.wait() p_train_GPT=None - yield "GPT训练完成",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "GPT训练完成", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} else: - yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务",{"__type__":"update","visible":False},{"__type__":"update","visible":True} + yield "已有正在进行的GPT训练任务,需先终止才能开启下一次任务", {"__type__":"update","visible":False}, {"__type__":"update","visible":True} def close1Bb(): global p_train_GPT if(p_train_GPT!=None): kill_process(p_train_GPT.pid) p_train_GPT=None - return "已终止GPT训练",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + return "已终止GPT训练", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} ps_slice=[] def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_parts): @@ -342,12 +345,12 @@ def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_k inp = my_utils.clean_path(inp) opt_root = my_utils.clean_path(opt_root) if(os.path.exists(inp)==False): - yield "输入路径不存在",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "输入路径不存在", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__": "update"}, {"__type__": "update"} return if os.path.isfile(inp):n_parts=1 elif os.path.isdir(inp):pass else: - yield "输入路径存在但既不是文件也不是文件夹",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "输入路径存在但既不是文件也不是文件夹", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__": "update"}, {"__type__": "update"} return if (ps_slice == []): for i_part in range(n_parts): @@ -355,13 +358,13 @@ def open_slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_k print(cmd) p = Popen(cmd, shell=True) ps_slice.append(p) - yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "切割执行中", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"} for p in ps_slice: p.wait() ps_slice=[] - yield "切割结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "切割结束", {"__type__":"update","visible":True}, {"__type__":"update","visible":False}, {"__type__": "update", "value":opt_root}, {"__type__": "update", "value":opt_root} else: - yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} + yield "已有正在进行的切割任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True}, {"__type__": "update"}, {"__type__": "update"} def close_slice(): global ps_slice @@ -468,7 +471,7 @@ def open1b(inp_text,inp_wav_dir,exp_name,gpu_numbers,ssl_pretrained_dir): for p in ps1b: p.wait() ps1b=[] - yield "SSL提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "SSL提取进程结束", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} else: yield "已有正在进行的SSL提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} @@ -525,7 +528,7 @@ def open1c(inp_text,exp_name,gpu_numbers,pretrained_s2G_path): with open(path_semantic, "w", encoding="utf8") as f: f.write("\n".join(opt) + "\n") ps1c=[] - yield "语义token提取进程结束",{"__type__":"update","visible":True},{"__type__":"update","visible":False} + yield "语义token提取进程结束", {"__type__":"update","visible":True}, {"__type__":"update","visible":False} else: yield "已有正在进行的语义token提取任务,需先终止才能开启下一次任务", {"__type__": "update", "visible": False}, {"__type__": "update", "visible": True} @@ -731,25 +734,53 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: with gr.Row(): asr_model = gr.Dropdown( label = i18n("ASR 模型"), - choices = ['SenseVoice'], + choices = list(asr_dict.keys()), interactive = True, - value="SenseVoice" + value="达摩 ASR (中文)" ) asr_size = gr.Dropdown( label = i18n("ASR 模型尺寸"), - choices = ["small"], + choices = ["large"], interactive = True, - value="small" + value="large" ) asr_lang = gr.Dropdown( label = i18n("ASR 语言设置"), - choices = ["auto","zh","en","ja"], + choices = ["zh"], interactive = True, - value="auto" + value="zh" + ) + asr_precision = gr.Dropdown( + label = i18n("ASR 语言设置"), + choices = ["zh"], + interactive = True, + value="zh" ) with gr.Row(): asr_info = gr.Textbox(label=i18n("ASR进程输出信息")) + def change_lang_choices(key): #根据选择的模型修改可选的语言 + # return gr.Dropdown(choices=asr_dict[key]['lang']) + return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]} + def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 + # return gr.Dropdown(choices=asr_dict[key]['size']) + return {"__type__": "update", "choices": asr_dict[key]['size'],"value":asr_dict[key]['size'][-1]} + def change_precision_choices(key): #根据选择的模型修改可选的语言 + if key =="Faster Whisper (多语种)": + if default_batch_size <= 4: + precision = 'int8' + elif is_half: + precision = 'float16' + else: + precision = 'float32' + else: + precision = 'float32' + # return gr.Dropdown(choices=asr_dict[key]['lang']) + return {"__type__": "update", "choices": asr_dict[key]['precision'],"value":precision} + asr_model.change(change_lang_choices, [asr_model], [asr_lang]) + asr_model.change(change_size_choices, [asr_model], [asr_size]) + asr_model.change(change_size_choices, [asr_model], [asr_precision]) + gr.Markdown(value=i18n("0d-语音文本校对标注工具")) with gr.Row(): @@ -762,11 +793,11 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app: label_info = gr.Textbox(label=i18n("打标工具进程输出信息")) if_label.change(change_label, [if_label,path_list], [label_info]) if_uvr5.change(change_uvr5, [if_uvr5], [uvr5_info]) - open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang], [asr_info,open_asr_button,close_asr_button,path_list]) + open_asr_button.click(open_asr, [asr_inp_dir, asr_opt_dir, asr_model, asr_size, asr_lang, asr_precision], [asr_info,open_asr_button,close_asr_button,path_list]) close_asr_button.click(close_asr, [], [asr_info,open_asr_button,close_asr_button]) - open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button]) + open_slicer_button.click(open_slice, [slice_inp_path,slice_opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,n_process], [slicer_info,open_slicer_button,close_slicer_button,asr_inp_dir,denoise_input_dir]) close_slicer_button.click(close_slice, [], [slicer_info,open_slicer_button,close_slicer_button]) - open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button]) + open_denoise_button.click(open_denoise, [denoise_input_dir,denoise_output_dir], [denoise_info,open_denoise_button,close_denoise_button,asr_inp_dir]) close_denoise_button.click(close_denoise, [], [denoise_info,open_denoise_button,close_denoise_button]) with gr.TabItem(i18n("1-GPT-SoVITS-TTS")):