GPT-SoVITS/Ref_Audio_Selector/ref_audio_selector_webui.py
2024-04-23 23:31:02 +08:00

301 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import gradio as gr
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
# 校验基础信息
def check_base_info(text_work_space_dir, text_character):
if text_work_space_dir is None or text_work_space_dir == '':
raise Exception(i18n("工作目录不能为空"))
if text_character is None or text_character == '':
raise Exception(i18n("角色名称不能为空"))
# 从list文件提取参考音频
def convert_from_list(text_work_space_dir, text_character, text_list_input):
text_convert_from_list_info = "转换成功生成目录XXX"
text_sample_dir = "D//tt"
try:
check_base_info(text_work_space_dir, text_character)
if text_list_input is None or text_list_input == '':
raise Exception(i18n("list文件路径不能为空"))
pass
except Exception as e:
text_convert_from_list_info = f"发生异常:{e}"
text_sample_dir = ''
return [text_convert_from_list_info, text_sample_dir]
# 基于一个基准音频,从参考音频目录中进行分段抽样
def sample(text_work_space_dir, text_character, text_sample_dir, text_base_voice_path,
text_subsection_num, text_sample_num, checkbox_similarity_output):
text_sample_info = "抽样成功生成目录XXX"
ref_audio_dir = "D//tt"
try:
check_base_info(text_work_space_dir, text_character)
if text_sample_dir is None or text_sample_dir == '':
raise Exception(i18n("参考音频抽样目录不能为空,请先完成上一步操作"))
if text_base_voice_path is None or text_base_voice_path == '':
raise Exception(i18n("基准音频路径不能为空"))
if text_subsection_num is None or text_subsection_num == '':
raise Exception(i18n("分段数不能为空"))
if text_sample_num is None or text_sample_num == '':
raise Exception(i18n("每段随机抽样个数不能为空"))
pass
except Exception as e:
text_sample_info = f"发生异常:{e}"
ref_audio_dir = ''
text_model_inference_voice_dir = ref_audio_dir
text_sync_ref_audio_dir = ref_audio_dir
text_sync_ref_audio_dir2 = ref_audio_dir
return [text_sample_info, text_model_inference_voice_dir, text_sync_ref_audio_dir,
text_sync_ref_audio_dir2]
# 根据参考音频和测试文本,执行批量推理
def model_inference(text_work_space_dir, text_character, text_model_inference_voice_dir, text_url,
text_text, text_ref_path, text_ref_text, text_emotion,
text_test_content):
text_model_inference_info = "推理成功生成目录XXX"
text_asr_audio_dir = "D//tt"
try:
check_base_info(text_work_space_dir, text_character)
if text_model_inference_voice_dir is None or text_model_inference_voice_dir == '':
raise Exception(i18n("待推理的参考音频所在目录不能为空,请先完成上一步操作"))
if text_url is None or text_url == '':
raise Exception(i18n("推理服务请求地址不能为空"))
if text_text is None or text_text == '':
raise Exception(i18n("文本参数名不能为空"))
if text_test_content is None or text_test_content == '':
raise Exception(i18n("待推理文本路径不能为空"))
if (text_ref_path is None or text_ref_path == '') and (text_ref_text is None or text_ref_text == '') and (text_emotion is None or text_emotion == ''):
raise Exception(i18n("参考音频路径/文本和角色情绪二选一填写,不能全部为空"))
pass
except Exception as e:
text_model_inference_info = f"发生异常:{e}"
text_asr_audio_dir = ''
return [text_model_inference_info, text_asr_audio_dir]
# 对推理生成音频执行asr
def asr(text_work_space_dir, text_character, text_asr_audio_dir, dropdown_asr_model,
dropdown_asr_size, dropdown_asr_lang):
text_asr_info = "asr成功生成目录XXX"
text_text_similarity_analysis_path = "D//tt"
try:
check_base_info(text_work_space_dir, text_character)
if text_asr_audio_dir is None or text_asr_audio_dir == '':
raise Exception(i18n("待asr的音频所在目录不能为空请先完成上一步操作"))
if dropdown_asr_model is None or dropdown_asr_model == '':
raise Exception(i18n("asr模型不能为空"))
if dropdown_asr_size is None or dropdown_asr_size == '':
raise Exception(i18n("asr模型大小不能为空"))
if dropdown_asr_lang is None or dropdown_asr_lang == '':
raise Exception(i18n("asr语言不能为空"))
pass
except Exception as e:
text_asr_info = f"发生异常:{e}"
text_text_similarity_analysis_path = ''
return [text_asr_info, text_text_similarity_analysis_path]
# 对asr生成的文件与原本的文本内容进行相似度分析
def text_similarity_analysis(text_work_space_dir, text_character,
text_text_similarity_analysis_path):
text_text_similarity_analysis_info = "相似度分析成功生成目录XXX"
try:
check_base_info(text_work_space_dir, text_character)
if text_text_similarity_analysis_path is None or text_text_similarity_analysis_path == '':
raise Exception(i18n("asr生成的文件路径不能为空请先完成上一步操作"))
pass
except Exception as e:
text_text_similarity_analysis_info = f"发生异常:{e}"
return text_text_similarity_analysis_info
# 根据一个参考音频,对指定目录下的音频进行相似度分析,并输出到另一个目录
def similarity_audio_output(text_work_space_dir, text_character, text_base_audio_path,
text_compare_audio_dir):
text_similarity_audio_output_info = "相似度分析成功生成目录XXX"
try:
check_base_info(text_work_space_dir, text_character)
if text_base_audio_path is None or text_base_audio_path == '':
raise Exception(i18n("基准音频路径不能为空"))
if text_compare_audio_dir is None or text_compare_audio_dir == '':
raise Exception(i18n("待分析的音频所在目录不能为空"))
pass
except Exception as e:
text_similarity_audio_output_info = f"发生异常:{e}"
return text_similarity_audio_output_info
# 根据参考音频目录的删除情况,将其同步到推理生成的音频目录中,即参考音频目录下,删除了几个参考音频,就在推理目录下,将这些参考音频生成的音频文件移除
def sync_ref_audio(text_work_space_dir, text_character, text_sync_ref_audio_dir,
text_sync_inference_audio_dir):
text_sync_ref_audio_info = "同步参考音频成功生成目录XXX"
try:
check_base_info(text_work_space_dir, text_character)
if text_sync_ref_audio_dir is None or text_sync_ref_audio_dir == '':
raise Exception(i18n("参考音频目录不能为空"))
if text_sync_inference_audio_dir is None or text_sync_inference_audio_dir == '':
raise Exception(i18n("推理生成的音频目录不能为空"))
pass
except Exception as e:
text_sync_ref_audio_info = f"发生异常:{e}"
return text_sync_ref_audio_info
# 根据模板和参考音频目录,生成参考音频配置内容
def create_config(text_work_space_dir, text_character, text_template, text_sync_ref_audio_dir2):
text_create_config_info = "配置生成成功生成目录XXX"
try:
check_base_info(text_work_space_dir, text_character)
if text_template is None or text_template == '':
raise Exception(i18n("参考音频抽样目录不能为空"))
if text_sync_ref_audio_dir2 is None or text_sync_ref_audio_dir2 == '':
raise Exception(i18n("参考音频目录不能为空"))
pass
except Exception as e:
text_create_config_info = f"发生异常:{e}"
return text_create_config_info
# 基于请求路径和参数,合成完整的请求路径
def whole_url(text_url, text_text, text_ref_path, text_ref_text, text_emotion):
text_whole_url = f'{text_url}?{text_text}=文本内容&{text_ref_path}=参考音频路径&{text_ref_text}=参考文本&{text_emotion}=情绪类型'
return [text_whole_url]
with gr.Blocks() as app:
gr.Markdown(value=i18n("基本介绍:这是一个从训练素材中,批量提取参考音频,并进行效果评估与配置生成的工具"))
with gr.Row():
text_work_space_dir = gr.Text(label=i18n("工作目录,后续操作所生成文件都会保存在此目录下"), value="")
text_character = gr.Text(label=i18n("请输入角色名称"), value="")
with gr.Accordion(label=i18n("第一步:基于训练素材,生成待选参考音频列表"), open=False):
gr.Markdown(value=i18n("1.1选择list文件并提取3-10秒的素材作为参考候选"))
text_list_input = gr.Text(label=i18n("请输入list文件路径"), value="")
with gr.Row():
button_convert_from_list = gr.Button(i18n("开始生成待参考列表"), variant="primary")
text_convert_from_list_info = gr.Text(label=i18n("参考列表生成结果"), value="", interactive=False)
gr.Markdown(value=i18n("1.2:选择基准音频,执行相似度匹配,并分段随机抽样"))
text_sample_dir = gr.Text(label=i18n("参考音频抽样目录"), value="", interactive=False)
button_convert_from_list.click(convert_from_list, [text_work_space_dir, text_character, text_list_input],
[text_convert_from_list_info, text_sample_dir])
with gr.Row():
text_base_voice_path = gr.Text(label=i18n("请输入基准音频路径"), value="")
text_subsection_num = gr.Text(label=i18n("请输入分段数"), value="10")
text_sample_num = gr.Text(label=i18n("请输入每段随机抽样个数"), value="4")
checkbox_similarity_output = gr.Checkbox(label=i18n("是否将相似度匹配结果输出到临时目录?"), show_label=True)
with gr.Row():
button_sample = gr.Button(i18n("开始分段随机抽样"), variant="primary")
text_sample_info = gr.Text(label=i18n("分段随机抽样结果"), value="", interactive=False)
with gr.Accordion(label=i18n("第二步:基于参考音频和测试文本,执行批量推理"), open=False):
gr.Markdown(value=i18n("2.1:配置推理服务参数信息,参考音频路径/文本和角色情绪二选一,如果是角色情绪,需要先执行第四步,"
"将参考音频打包配置到推理服务下,在推理前,请确认完整请求地址是否与正常使用时的一致,包括角色名称,尤其是文本分隔符是否正确"))
text_model_inference_voice_dir = gr.Text(label=i18n("待推理的参考音频所在目录"), value="", interactive=False)
text_url = gr.Text(label=i18n("请输入推理服务请求地址与参数"), value="")
with gr.Row():
text_text = gr.Text(label=i18n("请输入文本参数名"), value="text")
text_ref_path = gr.Text(label=i18n("请输入参考音频路径参数名"), value="text")
text_ref_text = gr.Text(label=i18n("请输入参考音频文本参数名"), value="text")
text_emotion = gr.Text(label=i18n("请输入角色情绪参数名"), value="text")
text_whole_url = gr.Text(label=i18n("完整地址"), value="5555555555555555", interactive=False)
text_url.input(whole_url, [text_url, text_text, text_ref_path, text_ref_text, text_emotion],
[text_whole_url])
text_text.input(whole_url, [text_url, text_text, text_ref_path, text_ref_text, text_emotion],
[text_whole_url])
text_ref_path.input(whole_url, [text_url, text_text, text_ref_path, text_ref_text, text_emotion],
[text_whole_url])
text_ref_text.input(whole_url, [text_url, text_text, text_ref_path, text_ref_text, text_emotion],
[text_whole_url])
text_emotion.input(whole_url, [text_url, text_text, text_ref_path, text_ref_text, text_emotion],
[text_whole_url])
gr.Markdown(value=i18n("2.2配置待推理文本一句一行不要太多10条即可"))
text_test_content = gr.Text(label=i18n("请输入待推理文本路径"), value="text")
gr.Markdown(value=i18n("2.3:启动推理服务,如果还没启动的话"))
gr.Markdown(value=i18n("2.4:开始批量推理,这个过程比较耗时,可以去干点别的"))
with gr.Row():
button_model_inference = gr.Button(i18n("开启批量推理"), variant="primary")
text_model_inference_info = gr.Text(label=i18n("批量推理结果"), value="", interactive=False)
with gr.Accordion(label=i18n("第三步:进行参考音频效果校验与筛选"), open=False):
gr.Markdown(value=i18n("3.1启动asr获取推理音频文本"))
text_asr_audio_dir = gr.Text(label=i18n("待asr的音频所在目录"), value="", interactive=False)
button_model_inference.click(model_inference,
[text_work_space_dir, text_character, text_model_inference_voice_dir, text_url,
text_text, text_ref_path, text_ref_text, text_emotion,
text_test_content], [text_model_inference_info, text_asr_audio_dir])
with gr.Row():
dropdown_asr_model = gr.Dropdown(
label=i18n("ASR 模型"),
choices=[],
interactive=True,
value="达摩 ASR (中文)"
)
dropdown_asr_size = gr.Dropdown(
label=i18n("ASR 模型尺寸"),
choices=["large"],
interactive=True,
value="large"
)
dropdown_asr_lang = gr.Dropdown(
label=i18n("ASR 语言设置"),
choices=["zh"],
interactive=True,
value="zh"
)
with gr.Row():
button_asr = gr.Button(i18n("启动asr"), variant="primary")
text_asr_info = gr.Text(label=i18n("asr结果"), value="", interactive=False)
gr.Markdown(value=i18n("3.2:启动文本相似度分析"))
text_text_similarity_analysis_path = gr.Text(label=i18n("待分析的文件路径"), value="", interactive=False)
button_asr.click(asr, [text_work_space_dir, text_character, text_asr_audio_dir, dropdown_asr_model,
dropdown_asr_size, dropdown_asr_lang],
[text_asr_info, text_text_similarity_analysis_path])
with gr.Row():
button_text_similarity_analysis = gr.Button(i18n("启动文本相似度分析"), variant="primary")
text_text_similarity_analysis_info = gr.Text(label=i18n("文本相似度分析结果"), value="", interactive=False)
button_text_similarity_analysis.click(text_similarity_analysis, [text_work_space_dir, text_character,
text_text_similarity_analysis_path],
[text_text_similarity_analysis_info])
gr.Markdown(value=i18n("3.3:根据相似度分析结果,重点检查最后几条是否存在复读等问题"))
gr.Markdown(value=i18n("3.4:对结果按音频相似度排序,筛选低音质音频"))
with gr.Row():
text_base_audio_path = gr.Text(label=i18n("请输入基准音频"), value="text")
text_compare_audio_dir = gr.Text(label=i18n("请输入待比较的音频文件目录"), value="text")
with gr.Row():
button_similarity_audio_output = gr.Button(i18n("输出相似度-参考音频到临时目录"), variant="primary")
text_similarity_audio_output_info = gr.Text(label=i18n("输出结果"), value="", interactive=False)
button_similarity_audio_output.click(similarity_audio_output,
[text_work_space_dir, text_character, text_base_audio_path,
text_compare_audio_dir], [text_similarity_audio_output_info])
with gr.Row():
text_sync_ref_audio_dir = gr.Text(label=i18n("参考音频路径"), value="", interactive=False)
text_sync_inference_audio_dir = gr.Text(label=i18n("被同步的推理音频路径"), value="", interactive=False)
with gr.Row():
button_sync_ref_audio = gr.Button(i18n("将参考音频的删除情况,同步到推理音频目录"), variant="primary")
text_sync_ref_info = gr.Text(label=i18n("同步结果"), value="", interactive=False)
button_sync_ref_audio.click(sync_ref_audio, [text_work_space_dir, text_character, text_sync_ref_audio_dir,
text_sync_inference_audio_dir], [text_sync_ref_info])
with gr.Accordion("第四步:生成参考音频配置文本", open=False):
gr.Markdown(value=i18n("4.1:编辑模板"))
text_template_path = gr.Text(label=i18n("模板文件路径"), value="", interactive=False)
text_template = gr.Text(label=i18n("模板内容"), value="text", lines=10)
gr.Markdown(value=i18n("4.2:生成配置"))
text_sync_ref_audio_dir2 = gr.Text(label=i18n("参考音频路径"), value="", interactive=False)
with gr.Row():
button_create_config = gr.Button(i18n("生成配置"), variant="primary")
text_create_config_info = gr.Text(label=i18n("生成结果"), value="", interactive=False)
button_create_config.click(create_config,
[text_work_space_dir, text_character, text_template, text_sync_ref_audio_dir2],
[text_create_config_info])
button_sample.click(sample, [text_work_space_dir, text_character, text_sample_dir, text_base_voice_path,
text_subsection_num, text_sample_num, checkbox_similarity_output],
[text_sample_info, text_model_inference_voice_dir, text_sync_ref_audio_dir,
text_sync_ref_audio_dir2])
app.launch(
server_port=9423,
quiet=True,
)