Update inference_webui.py

添加引导音频转写功能
This commit is contained in:
刘悦 2024-01-28 16:01:49 +08:00 committed by GitHub
parent b4a9a7cb42
commit 4e56fb271c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -4,7 +4,7 @@ logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR) logging.getLogger("httpcore").setLevel(logging.ERROR)
logging.getLogger("httpx").setLevel(logging.ERROR) logging.getLogger("httpx").setLevel(logging.ERROR)
logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR)
from faster_whisper import WhisperModel
logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
import pdb import pdb
@ -67,6 +67,28 @@ elif torch.backends.mps.is_available():
else: else:
device = "cpu" device = "cpu"
# 音频转写
def get_whisper(audio_path):
    """Transcribe an audio file to text with faster-whisper.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (``None`` or ``""``) means no audio was supplied.

    Returns:
        The left-stripped text of every recognised segment joined with
        ``","``, with trailing commas removed. Returns ``""`` when no
        audio was supplied.
    """
    # Nothing to transcribe: return early instead of loading the model and
    # failing inside transcribe() on a missing path.
    if not audio_path:
        return ""

    # float16 on CUDA; any other value of the module-level ``device``
    # falls back to int8 on the CPU.
    if device == "cuda":
        target, precision = "cuda", "float16"
    else:
        target, precision = "cpu", "int8"

    model = WhisperModel(
        "small",
        device=target,
        compute_type=precision,
        download_root="./model_from_whisper",
        local_files_only=False,
    )

    segments, info = model.transcribe(audio_path, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    # The final rstrip also removes commas that end the last segment's own
    # text, which matches what the previous "+= ... ," loop produced.
    return ",".join(segment.text.lstrip() for segment in segments).rstrip(",")
# 操作记忆功能 # 操作记忆功能
file_path = './audio_log.txt' file_path = './audio_log.txt'
@ -512,10 +534,12 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
gr.Markdown(value=i18n("*请上传并填写参考信息")) gr.Markdown(value=i18n("*请上传并填写参考信息"))
with gr.Row(): with gr.Row():
inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath",value=upload_audio_path) inp_ref = gr.Audio(label=i18n("请上传参考音频"), type="filepath",value=upload_audio_path)
whisper_button = gr.Button(i18n("faster_whisper转写音频内容到文本"))
prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value=upload_audio_text) prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value=upload_audio_text)
prompt_language = gr.Dropdown( prompt_language = gr.Dropdown(
label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n(upload_audio_lanuage) label=i18n("参考音频的语种"),choices=[i18n("中文"),i18n("英文"),i18n("日文")],value=i18n(upload_audio_lanuage)
) )
whisper_button.click(get_whisper,[inp_ref],[prompt_text])
gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。")) gr.Markdown(value=i18n("*请填写需要合成的目标文本。中英混合选中文,日英混合选日文,中日混合暂不支持,非目标语言文本自动遗弃。"))
with gr.Row(): with gr.Row():
text = gr.Textbox(label=i18n("需要合成的文本"), value="") text = gr.Textbox(label=i18n("需要合成的文本"), value="")