diff --git a/GPT_SoVITS/api_simple.py b/GPT_SoVITS/api_simple.py
new file mode 100644
index 00000000..fa5f3c86
--- /dev/null
+++ b/GPT_SoVITS/api_simple.py
@@ -0,0 +1,363 @@
+"""
+# api_simple.py usage
+
+` python api_simple.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+## 执行参数:
+
+`-s` - `SoVITS模型路径, 可在 config.py 中指定`
+`-g` - `GPT模型路径, 可在 config.py 中指定`
+
+调用请求缺少参考音频时使用
+`-dr` - `默认参考音频路径`
+`-dt` - `默认参考音频文本`
+`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"`
+
+`-d` - `推理设备, "cuda","cpu"`
+`-a` - `绑定地址, 默认"0.0.0.0"`
+`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`
+`-fp` - `覆盖 config.py 使用全精度`
+`-hp` - `覆盖 config.py 使用半精度`
+
+`-hb` - `cnhubert路径`
+`-b` - `bert路径`
+
+## 调用:
+
+### 推理
+
+endpoint: `/`
+
+使用执行参数指定的参考音频:
+GET:
+    `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+    "text_language": "zh"
+}
+```
+
+手动指定当次推理所使用的参考音频:
+GET:
+    `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+    "refer_wav_path": "123.wav",
+    "prompt_text": "一二三。",
+    "prompt_language": "zh",
+    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+    "text_language": "zh"
+}
+```
+
+RESP:
+成功: 直接返回 wav 音频流, http code 200
+失败: 返回包含错误信息的 json, http code 400
+
+
+### 更换默认参考音频
+
+endpoint: `/change_refer`
+
+key与推理端一样
+
+GET:
+    `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+POST:
+```json
+{
+    "refer_wav_path": "123.wav",
+    "prompt_text": "一二三。",
+    "prompt_language": "zh"
+}
+```
+
+RESP:
+成功: json, http code 200
+失败: json, 400
+
+
+### 命令控制
+
+endpoint: `/control`
+
+command:
+"restart": 重新运行
+"exit": 结束运行
+
+GET:
+    `http://127.0.0.1:9880/control?command=restart`
+POST:
+```json
+{
+    "command": "restart"
+}
+```
+
+RESP: 无
+
+"""
+
+
+import argparse
+import os
+import sys
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+import soundfile as sf
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
+import uvicorn
+from io import BytesIO
+import inference_webui
+from inference_webui import inference as get_tts_wav
+import signal
+import config as global_config
+g_config = global_config.Config()
+# AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
+
+parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+
+parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
+parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
+
+parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径")
+parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
+parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
+
+parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
+parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
+parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
+#parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
+#parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
+# bool值的用法为 `python ./api.py -fp ...`
+# 此时 full_precision==True, half_precision==False
+
+parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
+parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
+
+args = parser.parse_args()
+
+sovits_path = args.sovits_path
+gpt_path = args.gpt_path
+
+
+def change_sovits_weights(sovits_path):
+    if sovits_path is not None and sovits_path != "":
+        inference_webui.tts_pipline.init_vits_weights(sovits_path)
+def change_gpt_weights(gpt_path):
+    if gpt_path is not None and gpt_path != "":
+        inference_webui.tts_pipline.init_t2s_weights(gpt_path)
+
+change_sovits_weights(sovits_path)
+change_gpt_weights(gpt_path)
+
+
+class DefaultRefer:
+    def __init__(self, path, text, language):
+        self.path = path
+        self.text = text
+        self.language = language
+
+    def is_ready(self) -> bool:
+        return is_full(self.path, self.text, self.language)
+
+
+default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
+
+device = args.device
+port = args.port
+host = args.bind_addr
+
+
+def is_empty(*items):  # 任意一项不为空返回False
+    for item in items:
+        if item is not None and item != "":
+            return False
+    return True
+
+
+def is_full(*items):  # 任意一项为空返回False
+    for item in items:
+        if item is None or item == "":
+            return False
+    return True
+
+
+dict_language = {
+    "中文": "zh",
+    "英文": "en",
+    "日文": "ja",
+    "ZH": "zh",
+    "EN": "en",
+    "JA": "ja",
+    "zh": "zh",
+    "en": "en",
+    "ja": "ja"
+}
+
+
+def handle_control(command):
+    if command == "restart":
+        os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
+    elif command == "exit":
+        os.kill(os.getpid(), signal.SIGTERM)
+        exit(0)
+
+
+def handle_change(path, text, language):
+    if is_empty(path, text, language):
+        return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400)
+
+    if path is not None and path != "":
+        default_refer.path = path
+    if text is not None and text != "":
+        default_refer.text = text
+    if language is not None and language != "":
+        default_refer.language = language
+
+    print(f"[INFO] 当前默认参考音频路径: {default_refer.path}")
+    print(f"[INFO] 当前默认参考音频文本: {default_refer.text}")
+    print(f"[INFO] 当前默认参考音频语种: {default_refer.language}")
+    print(f"[INFO] is_ready: {default_refer.is_ready()}")
+
+    return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
+
+
+def handle(text, text_language,
+           refer_wav_path, prompt_text,
+           prompt_language, top_k,
+           top_p, temperature,
+           text_split_method, batch_size,
+           speed_factor, ref_text_free,
+           split_bucket, fragment_interval,
+           seed):
+    if (
+        refer_wav_path == "" or refer_wav_path is None
+        or prompt_text == "" or prompt_text is None
+        or prompt_language == "" or prompt_language is None
+    ):
+        refer_wav_path, prompt_text, prompt_language = (
+            default_refer.path,
+            default_refer.text,
+            default_refer.language,
+        )
+        if not default_refer.is_ready():
+            return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
+    prompt_text = prompt_text.strip("\n")
+    prompt_language, text = prompt_language, text.strip("\n")
+    gen = get_tts_wav(text, text_language,
+                      refer_wav_path, prompt_text,
+                      prompt_language, top_k,
+                      top_p, temperature,
+                      text_split_method, batch_size,
+                      speed_factor, ref_text_free,
+                      split_bucket, fragment_interval,
+                      seed
+                      )
+    audio, _ = next(gen)
+    sampling_rate, audio_data = audio
+
+    wav = BytesIO()
+    sf.write(wav, audio_data, sampling_rate, format="wav")
+    wav.seek(0)
+    return StreamingResponse(wav, media_type="audio/wav")
+
+
+app = FastAPI()
+
+#clark新增-----2024-02-21
+#可在启动后动态修改模型,以此满足同一个api不同的朗读者请求
+@app.post("/set_model")
+async def set_model(request: Request):
+    json_post_raw = await request.json()
+    global gpt_path
+    gpt_path = json_post_raw.get("gpt_model_path")
+    global sovits_path
+    sovits_path = json_post_raw.get("sovits_model_path")
+    print("gptpath" + gpt_path + ";vitspath" + sovits_path)
+    change_sovits_weights(sovits_path)
+    change_gpt_weights(gpt_path)
+    return "ok"
+# 新增-----end------
+
+@app.post("/control")
+async def control(request: Request):
+    json_post_raw = await request.json()
+    return handle_control(json_post_raw.get("command"))
+
+
+@app.get("/control")
+async def control(command: str = None):
+    return handle_control(command)
+
+
+@app.post("/change_refer")
+async def change_refer(request: Request):
+    json_post_raw = await request.json()
+    return handle_change(
+        json_post_raw.get("refer_wav_path"),
+        json_post_raw.get("prompt_text"),
+        json_post_raw.get("prompt_language")
+    )
+
+
+@app.get("/change_refer")
+async def change_refer(
+    refer_wav_path: str = None,
+    prompt_text: str = None,
+    prompt_language: str = None
+):
+    return handle_change(refer_wav_path, prompt_text, prompt_language)
+
+'''
+@app.post("/")
+async def tts_endpoint(request: Request):
+    json_post_raw = await request.json()
+    return handle(
+        json_post_raw.get("refer_wav_path"),
+        json_post_raw.get("prompt_text"),
+        json_post_raw.get("prompt_language"),
+        json_post_raw.get("text"),
+        json_post_raw.get("text_language"),
+    )
+'''
+
+@app.get("/")
+async def tts_endpoint(
+    refer_wav_path: str = None,
+    prompt_text: str = None,
+    prompt_language: str = None,
+    text: str = None,
+    text_language: str = None,
+    top_k: int = 5,
+    top_p: float = 1,
+    temperature: float = 1,
+    text_split_method: str = "凑四句一切",
+    batch_size: int = 20,
+    speed_factor: float = 1,
+    ref_text_free: bool = False,
+    split_bucket: bool = True,
+    fragment_interval: float = 0.3,
+    seed: int = -1,
+):
+    return handle(text, text_language,
+                  refer_wav_path, prompt_text,
+                  prompt_language, top_k,
+                  top_p, temperature,
+                  text_split_method, batch_size,
+                  speed_factor, ref_text_free,
+                  split_bucket, fragment_interval,
+                  seed)
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host=host, port=port, workers=1)
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 199948c6..378ca287 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -1,17 +1,20 @@
-'''
+"""
 按中英混合识别
 按日英混合识别
 多语种启动切分识别语种
 全部按中文识别
 全部按英文识别
 全部按日文识别
-'''
+"""
+
 import random
 import os, sys
+
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 import os, re, logging
+
 logging.getLogger("markdown_it").setLevel(logging.ERROR)
 logging.getLogger("urllib3").setLevel(logging.ERROR)
 logging.getLogger("httpcore").setLevel(logging.ERROR)
@@ -34,7 +37,7 @@ gpt_path = os.environ.get("gpt_path", None)
 sovits_path = os.environ.get("sovits_path", None)
 cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
 bert_path = os.environ.get("bert_path", None)
-
+
 import gradio as gr
 from TTS_infer_pack.TTS import TTS, TTS_Config
 from TTS_infer_pack.text_segmentation_method import get_method
@@ -50,18 +53,18 @@ if torch.cuda.is_available():
 #     device = "mps"
 else:
     device = "cpu"
-
+
 dict_language = {
-    i18n("中文"): "all_zh",#全部按中文识别
-    i18n("英文"): "en",#全部按英文识别#######不变
-    i18n("日文"): "all_ja",#全部按日文识别
-    i18n("中英混合"): "zh",#按中英混合识别####不变
-    i18n("日英混合"): "ja",#按日英混合识别####不变
-    i18n("多语种混合"): "auto",#多语种启动切分识别语种
+    i18n("中文"): "all_zh",  # 
全部按中文识别 + i18n("英文"): "en", # 全部按英文识别#######不变 + i18n("日文"): "all_ja", # 全部按日文识别 + i18n("中英混合"): "zh", # 按中英混合识别####不变 + i18n("日英混合"): "ja", # 按日英混合识别####不变 + i18n("多语种混合"): "auto", # 多语种启动切分识别语种 } cut_method = { - i18n("不切"):"cut0", + i18n("不切"): "cut0", i18n("凑四句一切"): "cut1", i18n("凑50字一切"): "cut2", i18n("按中文句号。切"): "cut3", @@ -80,23 +83,32 @@ if cnhubert_base_path is not None: tts_config.cnhuhbert_base_path = cnhubert_base_path if bert_path is not None: tts_config.bert_base_path = bert_path - + print(tts_config) tts_pipline = TTS(tts_config) gpt_path = tts_config.t2s_weights_path sovits_path = tts_config.vits_weights_path -def inference(text, text_lang, - ref_audio_path, prompt_text, - prompt_lang, top_k, - top_p, temperature, - text_split_method, batch_size, - speed_factor, ref_text_free, - split_bucket,fragment_interval, - seed, - ): + +def inference( + text, + text_lang, + ref_audio_path, + prompt_text, + prompt_lang, + top_k, + top_p, + temperature, + text_split_method, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, +): actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32) - inputs={ + inputs = { "text": text, "text_lang": dict_language[text_lang], "ref_audio_path": ref_audio_path, @@ -106,19 +118,20 @@ def inference(text, text_lang, "top_p": top_p, "temperature": temperature, "text_split_method": cut_method[text_split_method], - "batch_size":int(batch_size), - "speed_factor":float(speed_factor), - "split_bucket":split_bucket, - "return_fragment":False, - "fragment_interval":fragment_interval, - "seed":actual_seed, + "batch_size": int(batch_size), + "speed_factor": float(speed_factor), + "split_bucket": split_bucket, + "return_fragment": False, + "fragment_interval": fragment_interval, + "seed": actual_seed, } for item in tts_pipline.run(inputs): yield item, actual_seed - + + def custom_sort_key(s): # 使用正则表达式提取字符串中的数字部分和非数字部分 - parts = re.split('(\d+)', s) + parts = re.split("(\d+)", s) # 将数字部分转换为整数,非数字部分保持不变 parts = [int(part) if part.isdigit() else part for part in parts] return parts @@ -126,11 +139,16 @@ def custom_sort_key(s): def change_choices(): SoVITS_names, GPT_names = get_weights_names() - return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} + return { + "choices": sorted(SoVITS_names, key=custom_sort_key), + "__type__": "update", + }, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"} pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth" -pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +pretrained_gpt_name = ( + "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +) SoVITS_weight_root = "SoVITS_weights" GPT_weight_root = "GPT_weights" os.makedirs(SoVITS_weight_root, exist_ok=True) @@ -140,122 +158,255 @@ os.makedirs(GPT_weight_root, exist_ok=True) def get_weights_names(): SoVITS_names = [pretrained_sovits_name] for name in os.listdir(SoVITS_weight_root): - if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name)) + if name.endswith(".pth"): + SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name)) GPT_names = [pretrained_gpt_name] for name in os.listdir(GPT_weight_root): - if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name)) + if name.endswith(".ckpt"): + GPT_names.append("%s/%s" % (GPT_weight_root, name)) return SoVITS_names, 
GPT_names SoVITS_names, GPT_names = get_weights_names() -with gr.Blocks(title="GPT-SoVITS WebUI") as app: - gr.Markdown( - value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.") - ) - - with gr.Column(): - # with gr.Group(): - gr.Markdown(value=i18n("模型切换")) - with gr.Row(): - GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True) - SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True) - refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") - refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown]) - SoVITS_dropdown.change(tts_pipline.init_vits_weights, [SoVITS_dropdown], []) - GPT_dropdown.change(tts_pipline.init_t2s_weights, [GPT_dropdown], []) - - with gr.Row(): - with gr.Column(): - gr.Markdown(value=i18n("*请上传并填写参考信息")) - inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath") - prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=2) - with gr.Row(): - prompt_language = gr.Dropdown( - label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") - ) - with gr.Column(): - ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True) - gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。")) - - with gr.Column(): - gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) - text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=16, max_lines=16) - text_language = gr.Dropdown( - label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文") +if __name__ == "__main__": + with gr.Blocks(title="GPT-SoVITS WebUI") as app: + gr.Markdown( + value=i18n( + "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE." ) + ) - - with gr.Group(): - gr.Markdown(value=i18n("推理设置")) - with gr.Row(): - - with gr.Column(): - batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True) - fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True) - speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="speed_factor",value=1.0,interactive=True) - top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True) - top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True) - temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True) - with gr.Column(): - how_to_cut = gr.Radio( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), + with gr.Column(): + # with gr.Group(): + gr.Markdown(value=i18n("模型切换")) + with gr.Row(): + GPT_dropdown = gr.Dropdown( + label=i18n("GPT模型列表"), + choices=sorted(GPT_names, key=custom_sort_key), + value=gpt_path, interactive=True, ) - with gr.Row(): - split_bucket = gr.Checkbox(label=i18n("数据分桶(可能会降低一点计算量,选就对了)"), value=True, interactive=True, show_label=True) - seed = gr.Number(label=i18n("随机种子"),value=-1) - # with gr.Column(): - output = gr.Audio(label=i18n("输出的语音")) - with gr.Row(): - inference_button = gr.Button(i18n("合成语音"), variant="primary") - stop_infer = gr.Button(i18n("终止合成"), variant="primary") - - - inference_button.click( - inference, - [ - text,text_language, inp_ref, - prompt_text, prompt_language, - top_k, top_p, temperature, - how_to_cut, batch_size, - speed_factor, ref_text_free, - split_bucket,fragment_interval, - seed - ], - [output, seed], - ) - stop_infer.click(tts_pipline.stop, [], []) + SoVITS_dropdown = gr.Dropdown( + label=i18n("SoVITS模型列表"), + choices=sorted(SoVITS_names, key=custom_sort_key), + value=sovits_path, + interactive=True, + ) + refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary") + refresh_button.click( + fn=change_choices, + inputs=[], + outputs=[SoVITS_dropdown, GPT_dropdown], + ) + SoVITS_dropdown.change( + tts_pipline.init_vits_weights, [SoVITS_dropdown], [] + ) + GPT_dropdown.change(tts_pipline.init_t2s_weights, [GPT_dropdown], []) - with gr.Group(): - gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。")) with gr.Row(): - text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="", lines=4) with gr.Column(): - _how_to_cut = gr.Radio( - label=i18n("怎么切"), - choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ], - value=i18n("凑四句一切"), + gr.Markdown(value=i18n("*请上传并填写参考信息")) + inp_ref = gr.Audio( + label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath" + ) + prompt_text = gr.Textbox( + label=i18n("参考音频的文本"), value="", lines=2 + ) + with gr.Row(): + prompt_language = gr.Dropdown( + label=i18n("参考音频的语种"), + choices=[ + i18n("中文"), + i18n("英文"), + i18n("日文"), + i18n("中英混合"), + i18n("日英混合"), + i18n("多语种混合"), + ], + value=i18n("中文"), + ) + with gr.Column(): + ref_text_free = gr.Checkbox( + label=i18n( + "开启无参考文本模式。不填参考文本亦相当于开启。" + ), + value=False, interactive=True, + show_label=True, + ) + gr.Markdown( + i18n( + "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。" + ) ) - cut_text= gr.Button(i18n("切分"), variant="primary") - - def to_cut(text_inp, how_to_cut): - if len(text_inp.strip()) == 0 or text_inp==[]: - return 
"" - method = get_method(cut_method[how_to_cut]) - return method(text_inp) - - text_opt = gr.Textbox(label=i18n("切分后文本"), value="", lines=4) - cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt]) - gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")) -app.queue(concurrency_count=511, max_size=1022).launch( - server_name="0.0.0.0", - inbrowser=True, - share=is_share, - server_port=infer_ttswebui, - quiet=True, -) + with gr.Column(): + gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式")) + text = gr.Textbox( + label=i18n("需要合成的文本"), value="", lines=16, max_lines=16 + ) + text_language = gr.Dropdown( + label=i18n("需要合成的语种"), + choices=[ + i18n("中文"), + i18n("英文"), + i18n("日文"), + i18n("中英混合"), + i18n("日英混合"), + i18n("多语种混合"), + ], + value=i18n("中文"), + ) + + with gr.Group(): + gr.Markdown(value=i18n("推理设置")) + with gr.Row(): + + with gr.Column(): + batch_size = gr.Slider( + minimum=1, + maximum=200, + step=1, + label=i18n("batch_size"), + value=20, + interactive=True, + ) + fragment_interval = gr.Slider( + minimum=0.01, + maximum=1, + step=0.01, + label=i18n("分段间隔(秒)"), + value=0.3, + interactive=True, + ) + speed_factor = gr.Slider( + minimum=0.25, + maximum=4, + step=0.05, + label="speed_factor", + value=1.0, + interactive=True, + ) + top_k = gr.Slider( + minimum=1, + maximum=100, + step=1, + label=i18n("top_k"), + value=5, + interactive=True, + ) + top_p = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("top_p"), + value=1, + interactive=True, + ) + temperature = gr.Slider( + minimum=0, + maximum=1, + step=0.05, + label=i18n("temperature"), + value=1, + interactive=True, + ) + with gr.Column(): + how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + ) + with gr.Row(): + split_bucket = gr.Checkbox( + label=i18n("数据分桶(可能会降低一点计算量,选就对了)"), + value=True, + interactive=True, + show_label=True, + ) + seed = gr.Number(label=i18n("随机种子"), value=-1) + # with gr.Column(): + output = gr.Audio(label=i18n("输出的语音")) + with gr.Row(): + inference_button = gr.Button( + i18n("合成语音"), variant="primary" + ) + stop_infer = gr.Button(i18n("终止合成"), variant="primary") + + inference_button.click( + inference, + [ + text, + text_language, + inp_ref, + prompt_text, + prompt_language, + top_k, + top_p, + temperature, + how_to_cut, + batch_size, + speed_factor, + ref_text_free, + split_bucket, + fragment_interval, + seed, + ], + [output, seed], + ) + stop_infer.click(tts_pipline.stop, [], []) + + with gr.Group(): + gr.Markdown( + value=i18n( + "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。" + ) + ) + with gr.Row(): + text_inp = gr.Textbox( + label=i18n("需要合成的切分前文本"), value="", lines=4 + ) + with gr.Column(): + _how_to_cut = gr.Radio( + label=i18n("怎么切"), + choices=[ + i18n("不切"), + i18n("凑四句一切"), + i18n("凑50字一切"), + i18n("按中文句号。切"), + i18n("按英文句号.切"), + i18n("按标点符号切"), + ], + value=i18n("凑四句一切"), + interactive=True, + ) + cut_text = gr.Button(i18n("切分"), variant="primary") + + def to_cut(text_inp, how_to_cut): + if len(text_inp.strip()) == 0 or text_inp == []: + return "" + method = get_method(cut_method[how_to_cut]) + return method(text_inp) + + text_opt = gr.Textbox(label=i18n("切分后文本"), value="", lines=4) + cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt]) + gr.Markdown( + value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。") + ) + + app.queue(concurrency_count=511, max_size=1022).launch( + server_name="0.0.0.0", + inbrowser=True, + 
share=is_share, + server_port=infer_ttswebui, + quiet=True, + )
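
For reference, a minimal client sketch for the `/` endpoint added in `api_simple.py` above (not part of the patch). It assumes the server is already running on the default `0.0.0.0:9880` with a default reference audio configured via `-dr`/`-dt`/`-dl`, and it uses `requests`, which is not a dependency introduced by this diff.

```python
# Hypothetical client for api_simple.py's "/" endpoint; illustration only.
import requests

BASE_URL = "http://127.0.0.1:9880"  # default port documented in the docstring above

params = {
    "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
    "text_language": "zh",
    # Optional per-request reference audio, mirroring the GET parameters above:
    # "refer_wav_path": "123.wav", "prompt_text": "一二三。", "prompt_language": "zh",
}

resp = requests.get(BASE_URL + "/", params=params, timeout=600)
if resp.status_code == 200:
    # Success: the response body is a wav stream.
    with open("output.wav", "wb") as f:
        f.write(resp.content)
else:
    # Failure: the response body is JSON with an error message (HTTP 400).
    print(resp.status_code, resp.json())
```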