Reconstructed patch: the line breaks and indentation of this unified diff had
been collapsed, so it could not be applied. Blank context lines and leading
whitespace were restored from the hunk counts and Python syntax and should be
checked against the target tree before applying. The "index" hashes are kept
from the original patch and no longer describe the corrected post-images.

Fixes folded into the added lines:
* webui sample(): unpack the 3-tuple now returned by start_similarity_analysis.
* audio_asr: define module-level p_asr, launch the multi-level ASR scripts from
  Ref_Audio_Selector/tool/asr, always reset p_asr.
* audio_similarity: point at the new location of voice_similarity.py.
* text_comparison: add the missing "import os".
* funasr_asr_multi_level_dir: catch Exception instead of using a bare except.

diff --git a/Ref_Audio_Selector/ref_audio_selector_webui.py b/Ref_Audio_Selector/ref_audio_selector_webui.py
index 74f2f2a..c0ba953 100644
--- a/Ref_Audio_Selector/ref_audio_selector_webui.py
+++ b/Ref_Audio_Selector/ref_audio_selector_webui.py
@@ -3,6 +3,8 @@ import os.path
 import gradio as gr
 import Ref_Audio_Selector.tool.audio_similarity as audio_similarity
 import Ref_Audio_Selector.tool.audio_inference as audio_inference
+import Ref_Audio_Selector.tool.audio_asr as audio_asr
+import Ref_Audio_Selector.tool.audio_config as audio_config
 import Ref_Audio_Selector.common.common as common
 from tools.i18n.i18n import I18nAuto
 
@@ -49,13 +51,14 @@ def sample(text_work_space_dir, text_character, text_sample_dir, text_base_voice
         if text_sample_num is None or text_sample_num == '':
             raise Exception(i18n("每段随机抽样个数不能为空"))
 
-        similarity_list = audio_similarity.start_similarity_analysis(text_work_space_dir, text_sample_dir, text_base_voice_path, checkbox_similarity_output)
-        
+        similarity_list, _, _ = audio_similarity.start_similarity_analysis(
+            text_work_space_dir, text_sample_dir, text_base_voice_path, checkbox_similarity_output)
+
         if similarity_list is None:
             raise Exception(i18n("相似度分析失败"))
-        
+
         audio_similarity.sample(ref_audio_dir, similarity_list, text_subsection_num, text_sample_num)
-        
+
     except Exception as e:
         text_sample_info = f"发生异常:{e}"
         ref_audio_dir = ''
@@ -94,7 +97,8 @@ def model_inference(text_work_space_dir, text_character, text_model_inference_vo
         ref_audio_manager = common.RefAudioListManager(text_model_inference_voice_dir)
         if len(ref_audio_manager.get_audio_list()) == 0:
             raise Exception(i18n("待推理的参考音频不能为空"))
-        audio_inference.generate_audio_files(url_composer, text_list, ref_audio_manager.get_ref_audio_list(), inference_dir)
+        audio_inference.generate_audio_files(url_composer, text_list, ref_audio_manager.get_ref_audio_list(),
+                                             inference_dir)
     except Exception as e:
         text_model_inference_info = f"发生异常:{e}"
         text_asr_audio_dir = ''
@@ -104,9 +108,9 @@ def model_inference(text_work_space_dir, text_character, text_model_inference_vo
 
 # 对推理生成音频执行asr
 def asr(text_work_space_dir, text_character, text_asr_audio_dir, dropdown_asr_model, dropdown_asr_size, dropdown_asr_lang):
-    asr_file = os.path.join(text_work_space_dir, 'asr.list')
-    text_text_similarity_analysis_path = asr_file
-    text_asr_info = f"asr成功:生成文件asr.list"
+    asr_file = None
+    text_text_similarity_analysis_path = None
+    text_asr_info = None
     try:
         check_base_info(text_work_space_dir, text_character)
         if text_asr_audio_dir is None or text_asr_audio_dir == '':
@@ -117,7 +121,10 @@ def asr(text_work_space_dir, text_character, text_asr_audio_dir, dropdown_asr_mo
             raise Exception(i18n("asr模型大小不能为空"))
         if dropdown_asr_lang is None or dropdown_asr_lang == '':
             raise Exception(i18n("asr语言不能为空"))
-        pass
+        asr_file = audio_asr.open_asr(text_asr_audio_dir, text_work_space_dir, dropdown_asr_model, dropdown_asr_size,
+                                      dropdown_asr_lang)
+        text_text_similarity_analysis_path = asr_file
+        text_asr_info = f"asr成功:生成文件{asr_file}"
     except Exception as e:
         text_asr_info = f"发生异常:{e}"
         text_text_similarity_analysis_path = ''
@@ -149,7 +156,14 @@ def similarity_audio_output(text_work_space_dir, text_character, text_base_audio
             raise Exception(i18n("基准音频路径不能为空"))
         if text_compare_audio_dir is None or text_compare_audio_dir == '':
             raise Exception(i18n("待分析的音频所在目录不能为空"))
-        pass
+        similarity_list, similarity_file, similarity_file_dir = audio_similarity.start_similarity_analysis(
+            text_work_space_dir, text_compare_audio_dir, text_base_audio_path, True)
+
+        if similarity_list is None:
+            raise Exception(i18n("相似度分析失败"))
+
+        text_similarity_audio_output_info = f'相似度分析成功:生成目录{similarity_file_dir},文件{similarity_file}'
+
     except Exception as e:
         text_similarity_audio_output_info = f"发生异常:{e}"
     return text_similarity_audio_output_info
@@ -181,7 +195,8 @@ def create_config(text_work_space_dir, text_character, text_template, text_sync_
             raise Exception(i18n("参考音频抽样目录不能为空"))
         if text_sync_ref_audio_dir2 is None or text_sync_ref_audio_dir2 == '':
             raise Exception(i18n("参考音频目录不能为空"))
-        pass
+        ref_audio_manager = common.RefAudioListManager(text_sync_ref_audio_dir2)
+        audio_config.generate_audio_config(text_template, ref_audio_manager.get_ref_audio_list(), config_file)
     except Exception as e:
         text_create_config_info = f"发生异常:{e}"
     return text_create_config_info
@@ -191,9 +206,9 @@ def create_config(text_work_space_dir, text_character, text_template, text_sync_
 def whole_url(text_url, text_text, text_ref_path, text_ref_text, text_emotion):
     url_composer = audio_inference.URLComposer(text_url, text_emotion, text_text, text_ref_path, text_ref_text)
     if url_composer.is_emotion():
-        text_whole_url = url_composer.build_url_with_emotion('测试内容','情绪类型')
+        text_whole_url = url_composer.build_url_with_emotion('测试内容', '情绪类型')
     else:
-        text_whole_url = url_composer.build_url_with_ref('测试内容','参考路径','参考文本')
+        text_whole_url = url_composer.build_url_with_ref('测试内容', '参考路径', '参考文本')
     return text_whole_url
 
 
diff --git a/tools/speaker_verification/__init__.py b/Ref_Audio_Selector/tool/asr/__init__.py
similarity index 100%
rename from tools/speaker_verification/__init__.py
rename to Ref_Audio_Selector/tool/asr/__init__.py
diff --git a/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py b/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
new file mode 100644
index 0000000..ab94b4a
--- /dev/null
+++ b/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
@@ -0,0 +1,124 @@
+# -*- coding:utf-8 -*-
+
+import argparse
+import os
+import traceback
+from tqdm import tqdm
+
+from funasr import AutoModel
+
+# Prefer locally downloaded models; fall back to the hub ids when they are absent.
+path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
+path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
+path_asr = path_asr if os.path.exists(path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+
+model = AutoModel(
+    model = path_asr,
+    model_revision = "v2.0.4",
+    vad_model = path_vad,
+    vad_model_revision = "v2.0.4",
+    punc_model = path_punc,
+    punc_model_revision = "v2.0.4",
+)
+
+def only_asr(input_file):
+    """Transcribe a single audio file; return '' when recognition fails."""
+    try:
+        text = model.generate(input=input_file)[0]["text"]
+    except Exception:
+        text = ''
+        print(traceback.format_exc())
+    return text
+
+def execute_asr(input_folder, output_folder, model_size, language):
+    """Transcribe every file directly inside input_folder into a .list file.
+
+    Each output line is "<audio path>|<folder name>|<LANGUAGE>|<text>"; files that
+    fail are skipped after printing the traceback. model_size is not used here.
+    Returns the absolute path of the written list file.
+    """
+    input_file_names = os.listdir(input_folder)
+    input_file_names.sort()
+
+    output = []
+    output_file_name = os.path.basename(input_folder)
+
+    for name in tqdm(input_file_names):
+        try:
+            text = model.generate(input="%s/%s"%(input_folder, name))[0]["text"]
+            output.append(f"{input_folder}/{name}|{output_file_name}|{language.upper()}|{text}")
+        except Exception:
+            print(traceback.format_exc())
+
+    output_folder = output_folder or "output/asr_opt"
+    os.makedirs(output_folder, exist_ok=True)
+    output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')
+
+    with open(output_file_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(output))
+        print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
+    return output_file_path
+
+
+def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language):
+    """Recursively transcribe every .wav file under input_folder into a .list file.
+
+    Same line format and return value as execute_asr, with audio paths normalised
+    by os.path.normpath. model_size is not used here.
+    """
+    output = []
+    output_file_name = os.path.basename(input_folder)
+    # Walk the input directory and all of its sub-directories.
+    for root, dirs, files in os.walk(input_folder):
+        for name in sorted(files):
+            # Only .wav files are processed.
+            if name.endswith(".wav"):
+                try:
+                    # Build the full path of the input audio file.
+                    input_file_path = os.path.join(root, name)
+                    input_file_path = os.path.normpath(input_file_path)  # normalise mixed slashes first
+                    text = model.generate(input=input_file_path)[0]["text"]
+
+                    output.append(f"{input_file_path}|{output_file_name}|{language.upper()}|{text}")
+
+                except Exception:
+                    print(traceback.format_exc())
+
+    # Create the output directory if it does not exist yet.
+    output_folder = output_folder or "output/asr_opt"
+    output_dir_abs = os.path.abspath(output_folder)
+    os.makedirs(output_dir_abs, exist_ok=True)
+
+    # Build the output file path.
+    output_file_path = os.path.join(output_dir_abs, f'{output_file_name}.list')
+
+    # Write the transcriptions to the file.
+    with open(output_file_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(output))
+        print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
+
+    return output_file_path
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_folder", type=str, required=True,
+                        help="Path to the folder containing WAV files.")
+    parser.add_argument("-o", "--output_folder", type=str, required=True,
+                        help="Output folder to store transcriptions.")
+    parser.add_argument("-s", "--model_size", type=str, default='large',
+                        help="Model Size of FunASR is Large")
+    parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'],
+                        help="Language of the audio files.")
+    parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
+                        help="fp16 or fp32")  # not wired in yet
+
+    cmd = parser.parse_args()
+    execute_asr_multi_level_dir(
+        input_folder = cmd.input_folder,
+        output_folder = cmd.output_folder,
+        model_size = cmd.model_size,
+        language = cmd.language,
+    )
diff --git a/Ref_Audio_Selector/tool/audio_asr.py b/Ref_Audio_Selector/tool/audio_asr.py
new file mode 100644
index 0000000..126d3f9
--- /dev/null
+++ b/Ref_Audio_Selector/tool/audio_asr.py
@@ -0,0 +1,55 @@
+import os
+from config import python_exec,is_half
+from tools import my_utils
+from tools.asr.config import asr_dict
+from subprocess import Popen
+
+# Handle of the ASR subprocess currently running, or None when idle.
+p_asr = None
+
+# Single-directory ASR scripts mapped to their multi-level-directory variants.
+# NOTE(review): only the funasr variant is added by this patch, and the key
+# 'fasterwhisper.py' should be checked against the names used in asr_dict.
+MULTI_LEVEL_ASR_SCRIPTS = {
+    'funasr_asr.py': 'funasr_asr_multi_level_dir.py',
+    'fasterwhisper.py': 'fasterwhisper_asr_multi_level_dir.py',
+}
+MULTI_LEVEL_ASR_DIR = 'Ref_Audio_Selector/tool/asr'
+
+
+def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
+    """Run the ASR script selected by asr_model over asr_inp_dir and wait for it.
+
+    Returns the path "<abs asr_opt_dir>/<basename of asr_inp_dir>.list" that the
+    script is expected to write, or None when another ASR run is in progress.
+    """
+    global p_asr
+    if p_asr is not None:
+        return None
+
+    asr_inp_dir = my_utils.clean_path(asr_inp_dir)
+    asr_py_path = asr_dict[asr_model]["path"]
+    if asr_py_path in MULTI_LEVEL_ASR_SCRIPTS:
+        script_path = f'{MULTI_LEVEL_ASR_DIR}/{MULTI_LEVEL_ASR_SCRIPTS[asr_py_path]}'
+    else:
+        script_path = f'tools/asr/{asr_py_path}'
+
+    cmd = f'"{python_exec}" {script_path}'
+    cmd += f' -i "{asr_inp_dir}"'
+    cmd += f' -o "{asr_opt_dir}"'
+    cmd += f' -s {asr_model_size}'
+    cmd += f' -l {asr_lang}'
+    cmd += " -p %s" % ("float16" if is_half == True else "float32")
+
+    print(cmd)
+    try:
+        p_asr = Popen(cmd, shell=True)
+        p_asr.wait()
+    finally:
+        # Always release the slot, otherwise a failed launch blocks every later run.
+        p_asr = None
+
+    output_dir_abs = os.path.abspath(asr_opt_dir)
+    output_file_name = os.path.basename(asr_inp_dir)
+    # Build the path of the list file written by the ASR script.
+    return os.path.join(output_dir_abs, f'{output_file_name}.list')
diff --git a/Ref_Audio_Selector/tool/audio_config.py b/Ref_Audio_Selector/tool/audio_config.py
new file mode 100644
index 0000000..7ea9a9b
--- /dev/null
+++ b/Ref_Audio_Selector/tool/audio_config.py
@@ -0,0 +1,19 @@
+import os
+
+
+def generate_audio_config(template_str, audio_list, output_file_path):
+    """Render template_str once per reference audio and write the result.
+
+    The placeholders ${emotion}, ${ref_path} and ${ref_text} are replaced with the
+    matching keys of each audio_list entry; entries are joined by a comma and a newline.
+    """
+    formatted_lines = []
+    for audio_info in audio_list:
+        formatted_lines.append(
+            template_str.replace('${emotion}', audio_info['emotion'])
+            .replace('${ref_path}', audio_info['ref_path'])
+            .replace('${ref_text}', audio_info['ref_text'])
+        )
+
+    with open(output_file_path, 'w', encoding='utf-8') as output_file:
+        output_file.write(",\n".join(formatted_lines))
diff --git a/Ref_Audio_Selector/tool/audio_similarity.py b/Ref_Audio_Selector/tool/audio_similarity.py
index 9eb7172..30ae5d1 100644
--- a/Ref_Audio_Selector/tool/audio_similarity.py
+++ b/Ref_Audio_Selector/tool/audio_similarity.py
@@ -95,7 +95,7 @@ def start_similarity_analysis(work_space_dir, sample_dir, base_voice_path, need_
 
     global p_similarity
     if(p_similarity==None):
-        cmd = f'"{python_exec}" tools/speaker_verification/audio_similarity.py '
+        cmd = f'"{python_exec}" Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py '
         cmd += f' -r "{base_voice_path}"'
         cmd += f' -c "{sample_dir}"'
         cmd += f' -o {similarity_file}'
@@ -110,9 +110,9 @@ def start_similarity_analysis(work_space_dir, sample_dir, base_voice_path, need_
             ref_audio_opt.copy_and_move(similarity_file_dir, similarity_list)
 
         p_similarity=None
-        return similarity_list
+        return similarity_list, similarity_file, similarity_file_dir
     else:
-        return similarity_list
+        return similarity_list, None, None
 
 
 def parse_similarity_file(file_path):
diff --git a/Ref_Audio_Selector/tool/speaker_verification/__init__.py b/Ref_Audio_Selector/tool/speaker_verification/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tools/speaker_verification/voice_similarity.py b/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
similarity index 100%
rename from tools/speaker_verification/voice_similarity.py
rename to Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
diff --git a/Ref_Audio_Selector/tool/text_comparison/__init__.py b/Ref_Audio_Selector/tool/text_comparison/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/Ref_Audio_Selector/tool/text_comparison/text_comparison.py b/Ref_Audio_Selector/tool/text_comparison/text_comparison.py
new file mode 100644
index 0000000..dab00b6
--- /dev/null
+++ b/Ref_Audio_Selector/tool/text_comparison/text_comparison.py
@@ -0,0 +1,53 @@
+import os
+
+import torch
+from transformers import AutoTokenizer, AutoModel
+from scipy.spatial.distance import cosine
+import math
+
+bert_path = os.environ.get(
+    "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+)
+
+
+tokenizer = AutoTokenizer.from_pretrained(bert_path)
+model = AutoModel.from_pretrained(bert_path)
+
+
+def calculate_similarity(text1, text2, max_length=512):
+    """Return the cosine similarity of the first-token embeddings of two texts."""
+    # Tokenize both texts, truncating to max_length.
+    inputs1 = tokenizer(text1, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
+    inputs2 = tokenizer(text2, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
+
+    # Sentence vector: hidden state of the first (CLS) token, flattened to 1-D.
+    with torch.no_grad():
+        encoded_text1 = model(**inputs1)[0][:, 0, :].flatten()
+        encoded_text2 = model(**inputs2)[0][:, 0, :].flatten()
+
+    # scipy's cosine() is a distance, so the similarity is 1 - distance.
+    similarity = 1 - cosine(encoded_text1.cpu().numpy().flatten(), encoded_text2.cpu().numpy().flatten())
+
+    return similarity
+
+
+def adjusted_similarity(similarity_score2, boundary=0.8):
+    """Linearly rescale scores in [boundary, 1] to [0, 1]; scores below boundary become 0."""
+    if similarity_score2 < boundary:
+        return 0
+
+    # Scale factor that stretches the interval above boundary to length 1.
+    multiple = 1/(1 - boundary)
+
+    adjusted_score = (similarity_score2 - boundary)*multiple
+
+    return adjusted_score
+
+
+def calculate_result(t1, t2):
+    """Return (raw similarity, adjusted similarity) for two texts."""
+    similarity_score2 = calculate_similarity(t1, t2)
+
+    adjusted_similarity_score2 = adjusted_similarity(similarity_score2)
+
+    return similarity_score2, adjusted_similarity_score2