diff --git a/GPT_SoVITS/api_simple.py b/GPT_SoVITS/api_simple.py
new file mode 100644
index 00000000..fa5f3c86
--- /dev/null
+++ b/GPT_SoVITS/api_simple.py
@@ -0,0 +1,363 @@
+"""
+# api.py usage
+
+` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" `
+
+## 执行参数:
+
+`-s` - `SoVITS模型路径, 可在 config.py 中指定`
+`-g` - `GPT模型路径, 可在 config.py 中指定`
+
+调用请求缺少参考音频时使用
+`-dr` - `默认参考音频路径`
+`-dt` - `默认参考音频文本`
+`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"`
+
+`-d` - `推理设备, "cuda","cpu"`
+`-a` - `绑定地址, 默认"127.0.0.1"`
+`-p` - `绑定端口, 默认9880, 可在 config.py 中指定`
+`-fp` - `覆盖 config.py 使用全精度`
+`-hp` - `覆盖 config.py 使用半精度`
+
+`-hb` - `cnhubert路径`
+`-b` - `bert路径`
+
+## Endpoints:
+
+### Inference
+
+endpoint: `/`
+
+Using the reference audio specified by the command-line arguments:
+GET:
+ `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+ "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+ "text_language": "zh"
+}
+```
+
+Specifying the reference audio for a single request:
+GET:
+ `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh`
+POST:
+```json
+{
+ "refer_wav_path": "123.wav",
+ "prompt_text": "一二三。",
+ "prompt_language": "zh",
+ "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+ "text_language": "zh"
+}
+```
+
+RESP:
+Success: returns the wav audio stream directly, HTTP code 200
+Failure: returns JSON with an error message, HTTP code 400
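+
+A minimal client sketch for this endpoint (illustration only; it assumes the `requests` package, the default bind address, and a hypothetical output file name):
+```python
+import requests
+
+# Synthesize using the default reference audio configured via -dr / -dt / -dl.
+resp = requests.get(
+    "http://127.0.0.1:9880",
+    params={
+        "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。",
+        "text_language": "zh",
+    },
+)
+if resp.status_code == 200:
+    with open("output.wav", "wb") as f:
+        f.write(resp.content)  # raw wav bytes
+else:
+    print(resp.json())  # {"code": 400, "message": ...}
+```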
+
+
+### Change the default reference audio
+
+endpoint: `/change_refer`
+
+The keys are the same as for the inference endpoint.
+
+GET:
+ `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh`
+POST:
+```json
+{
+ "refer_wav_path": "123.wav",
+ "prompt_text": "一二三。",
+ "prompt_language": "zh"
+}
+```
+
+RESP:
+Success: JSON, HTTP code 200
+Failure: JSON, HTTP code 400
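+
+A matching POST sketch, again assuming the `requests` package:
+```python
+import requests
+
+# Point the server at a new default reference audio (example values from above).
+resp = requests.post(
+    "http://127.0.0.1:9880/change_refer",
+    json={
+        "refer_wav_path": "123.wav",
+        "prompt_text": "一二三。",
+        "prompt_language": "zh",
+    },
+)
+print(resp.json())  # {"code": 0, "message": "Success"} on success
+```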
+
+
+### Command control
+
+endpoint: `/control`
+
+command:
+"restart": restart the service
+"exit": shut the service down
+
+GET:
+ `http://127.0.0.1:9880/control?command=restart`
+POST:
+```json
+{
+ "command": "restart"
+}
+```
+
+RESP: none
+
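+### Switch models
+
+endpoint: `/set_model`
+
+Swaps the GPT/SoVITS weights at runtime so one API instance can serve different speakers; the keys match the `/set_model` handler in this file, and the example paths are placeholders.
+
+POST:
+```json
+{
+    "gpt_model_path": "GPT_weights/your_model.ckpt",
+    "sovits_model_path": "SoVITS_weights/your_model.pth"
+}
+```
+
+RESP: "ok"
+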
+"""
+
+
+import argparse
+import os
+import sys
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append("%s/GPT_SoVITS" % (now_dir))
+
+import soundfile as sf
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import StreamingResponse, JSONResponse
+import uvicorn
+from io import BytesIO
+import inference_webui
+from inference_webui import inference as get_tts_wav
+import signal
+import config as global_config
+g_config = global_config.Config()
+# AVAILABLE_COMPUTE = "cuda" if torch.cuda.is_available() else "cpu"
+
+parser = argparse.ArgumentParser(description="GPT-SoVITS api")
+
+parser.add_argument("-s", "--sovits_path", type=str, default=g_config.sovits_path, help="SoVITS模型路径")
+parser.add_argument("-g", "--gpt_path", type=str, default=g_config.gpt_path, help="GPT模型路径")
+
+parser.add_argument("-dr", "--default_refer_path", type=str, default="", help="默认参考音频路径")
+parser.add_argument("-dt", "--default_refer_text", type=str, default="", help="默认参考音频文本")
+parser.add_argument("-dl", "--default_refer_language", type=str, default="", help="默认参考音频语种")
+
+parser.add_argument("-d", "--device", type=str, default=g_config.infer_device, help="cuda / cpu")
+parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
+parser.add_argument("-p", "--port", type=int, default=g_config.api_port, help="default: 9880")
+#parser.add_argument("-fp", "--full_precision", action="store_true", default=False, help="覆盖config.is_half为False, 使用全精度")
+#parser.add_argument("-hp", "--half_precision", action="store_true", default=False, help="覆盖config.is_half为True, 使用半精度")
+# bool值的用法为 `python ./api.py -fp ...`
+# 此时 full_precision==True, half_precision==False
+
+parser.add_argument("-hb", "--hubert_path", type=str, default=g_config.cnhubert_path, help="覆盖config.cnhubert_path")
+parser.add_argument("-b", "--bert_path", type=str, default=g_config.bert_path, help="覆盖config.bert_path")
+
+args = parser.parse_args()
+
+sovits_path = args.sovits_path
+gpt_path = args.gpt_path
+
+
+def change_sovits_weights(sovits_path):
+    if sovits_path is not None and sovits_path != "":
+        inference_webui.tts_pipline.init_vits_weights(sovits_path)
+
+
+def change_gpt_weights(gpt_path):
+    if gpt_path is not None and gpt_path != "":
+        inference_webui.tts_pipline.init_t2s_weights(gpt_path)
+
+change_sovits_weights(sovits_path)
+change_gpt_weights(gpt_path)
+
+
+class DefaultRefer:
+ def __init__(self, path, text, language):
+        self.path = path
+        self.text = text
+        self.language = language
+
+ def is_ready(self) -> bool:
+ return is_full(self.path, self.text, self.language)
+
+
+default_refer = DefaultRefer(args.default_refer_path, args.default_refer_text, args.default_refer_language)
+
+device = args.device
+port = args.port
+host = args.bind_addr
+
+
+
+def is_empty(*items):  # returns False if any item is non-empty
+ for item in items:
+ if item is not None and item != "":
+ return False
+ return True
+
+
+def is_full(*items):  # returns False if any item is empty
+ for item in items:
+ if item is None or item == "":
+ return False
+ return True
+
+
+
+dict_language = {
+ "中文": "zh",
+ "英文": "en",
+ "日文": "ja",
+ "ZH": "zh",
+ "EN": "en",
+ "JA": "ja",
+ "zh": "zh",
+ "en": "en",
+ "ja": "ja"
+}
+
+
+def handle_control(command):
+ if command == "restart":
+ os.execl(g_config.python_exec, g_config.python_exec, *sys.argv)
+ elif command == "exit":
+ os.kill(os.getpid(), signal.SIGTERM)
+ exit(0)
+
+
+def handle_change(path, text, language):
+    if is_empty(path, text, language):
+        return JSONResponse(
+            {"code": 400, "message": "at least one of refer_wav_path, prompt_text, prompt_language is required"},
+            status_code=400,
+        )
+
+    if path is not None and path != "":
+        default_refer.path = path
+    if text is not None and text != "":
+        default_refer.text = text
+    if language is not None and language != "":
+        default_refer.language = language
+
+    print(f"[INFO] current default reference audio path: {default_refer.path}")
+    print(f"[INFO] current default reference audio text: {default_refer.text}")
+    print(f"[INFO] current default reference audio language: {default_refer.language}")
+    print(f"[INFO] is_ready: {default_refer.is_ready()}")
+
+    return JSONResponse({"code": 0, "message": "Success"}, status_code=200)
+
+
+def handle(text, text_language,
+ refer_wav_path, prompt_text,
+ prompt_language, top_k,
+ top_p, temperature,
+ text_split_method, batch_size,
+ speed_factor, ref_text_free,
+           split_bucket, fragment_interval,
+ seed):
+ if (
+ refer_wav_path == "" or refer_wav_path is None
+ or prompt_text == "" or prompt_text is None
+ or prompt_language == "" or prompt_language is None
+ ):
+ refer_wav_path, prompt_text, prompt_language = (
+ default_refer.path,
+ default_refer.text,
+ default_refer.language,
+ )
+ if not default_refer.is_ready():
+ return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
+    prompt_text = prompt_text.strip("\n")
+    text = text.strip("\n")
+ gen = get_tts_wav(text, text_language,
+ refer_wav_path, prompt_text,
+ prompt_language, top_k,
+ top_p, temperature,
+ text_split_method, batch_size,
+ speed_factor, ref_text_free,
+ split_bucket,fragment_interval,
+ seed
+ )
+    audio, _ = next(gen)
+    sampling_rate, audio_data = audio
+
+ wav = BytesIO()
+ sf.write(wav, audio_data, sampling_rate, format="wav")
+ wav.seek(0)
+ return StreamingResponse(wav, media_type="audio/wav")
+
+
+app = FastAPI()
+
+# Added by clark (2024-02-21): the GPT/SoVITS weights can be swapped after startup,
+# so a single API instance can serve requests for different speakers.
+@app.post("/set_model")
+async def set_model(request: Request):
+    json_post_raw = await request.json()
+    global gpt_path
+    gpt_path = json_post_raw.get("gpt_model_path")
+    global sovits_path
+    sovits_path = json_post_raw.get("sovits_model_path")
+    print(f"gpt_path: {gpt_path}; sovits_path: {sovits_path}")
+    change_sovits_weights(sovits_path)
+    change_gpt_weights(gpt_path)
+    return "ok"
+
+@app.post("/control")
+async def control(request: Request):
+ json_post_raw = await request.json()
+ return handle_control(json_post_raw.get("command"))
+
+
+@app.get("/control")
+async def control(command: str = None):
+ return handle_control(command)
+
+
+@app.post("/change_refer")
+async def change_refer(request: Request):
+ json_post_raw = await request.json()
+ return handle_change(
+ json_post_raw.get("refer_wav_path"),
+ json_post_raw.get("prompt_text"),
+ json_post_raw.get("prompt_language")
+ )
+
+
+@app.get("/change_refer")
+async def change_refer(
+ refer_wav_path: str = None,
+ prompt_text: str = None,
+ prompt_language: str = None
+):
+ return handle_change(refer_wav_path, prompt_text, prompt_language)
+
+'''
+@app.post("/")
+async def tts_endpoint(request: Request):
+    # Disabled POST variant; argument order and defaults mirror handle() and the GET endpoint below.
+    json_post_raw = await request.json()
+    return handle(
+        json_post_raw.get("text"),
+        json_post_raw.get("text_language"),
+        json_post_raw.get("refer_wav_path"),
+        json_post_raw.get("prompt_text"),
+        json_post_raw.get("prompt_language"),
+        json_post_raw.get("top_k", 5),
+        json_post_raw.get("top_p", 1),
+        json_post_raw.get("temperature", 1),
+        json_post_raw.get("text_split_method", "凑四句一切"),
+        json_post_raw.get("batch_size", 20),
+        json_post_raw.get("speed_factor", 1),
+        json_post_raw.get("ref_text_free", False),
+        json_post_raw.get("split_bucket", True),
+        json_post_raw.get("fragment_interval", 0.3),
+        json_post_raw.get("seed", -1),
+    )
+'''
+
+@app.get("/")
+async def tts_endpoint(
+ refer_wav_path: str = None,
+ prompt_text: str = None,
+ prompt_language: str = None,
+ text: str = None,
+ text_language: str = None,
+    top_k: int = 5,
+    top_p: float = 1,
+    temperature: float = 1,
+    text_split_method: str = "凑四句一切",
+    batch_size: int = 20,
+    speed_factor: float = 1,
+    ref_text_free: bool = False,
+    split_bucket: bool = True,
+    fragment_interval: float = 0.3,
+    seed: int = -1,
+):
+    return handle(text, text_language,
+                  refer_wav_path, prompt_text,
+                  prompt_language, top_k,
+                  top_p, temperature,
+                  text_split_method, batch_size,
+                  speed_factor, ref_text_free,
+                  split_bucket, fragment_interval,
+                  seed)
+
+
+if __name__ == "__main__":
+ uvicorn.run(app, host=host, port=port, workers=1)
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 199948c6..378ca287 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -1,17 +1,20 @@
-'''
+"""
按中英混合识别
按日英混合识别
多语种启动切分识别语种
全部按中文识别
全部按英文识别
全部按日文识别
-'''
+"""
+
import random
import os, sys
+
now_dir = os.getcwd()
sys.path.append(now_dir)
import os, re, logging
+
logging.getLogger("markdown_it").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.getLogger("httpcore").setLevel(logging.ERROR)
@@ -34,7 +37,7 @@ gpt_path = os.environ.get("gpt_path", None)
sovits_path = os.environ.get("sovits_path", None)
cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
bert_path = os.environ.get("bert_path", None)
-
+
import gradio as gr
from TTS_infer_pack.TTS import TTS, TTS_Config
from TTS_infer_pack.text_segmentation_method import get_method
@@ -50,18 +53,18 @@ if torch.cuda.is_available():
# device = "mps"
else:
device = "cpu"
-
+
dict_language = {
- i18n("中文"): "all_zh",#全部按中文识别
- i18n("英文"): "en",#全部按英文识别#######不变
- i18n("日文"): "all_ja",#全部按日文识别
- i18n("中英混合"): "zh",#按中英混合识别####不变
- i18n("日英混合"): "ja",#按日英混合识别####不变
- i18n("多语种混合"): "auto",#多语种启动切分识别语种
+ i18n("中文"): "all_zh", # 全部按中文识别
+ i18n("英文"): "en", # 全部按英文识别#######不变
+ i18n("日文"): "all_ja", # 全部按日文识别
+ i18n("中英混合"): "zh", # 按中英混合识别####不变
+ i18n("日英混合"): "ja", # 按日英混合识别####不变
+ i18n("多语种混合"): "auto", # 多语种启动切分识别语种
}
cut_method = {
- i18n("不切"):"cut0",
+ i18n("不切"): "cut0",
i18n("凑四句一切"): "cut1",
i18n("凑50字一切"): "cut2",
i18n("按中文句号。切"): "cut3",
@@ -80,23 +83,32 @@ if cnhubert_base_path is not None:
tts_config.cnhuhbert_base_path = cnhubert_base_path
if bert_path is not None:
tts_config.bert_base_path = bert_path
-
+
print(tts_config)
tts_pipline = TTS(tts_config)
gpt_path = tts_config.t2s_weights_path
sovits_path = tts_config.vits_weights_path
-def inference(text, text_lang,
- ref_audio_path, prompt_text,
- prompt_lang, top_k,
- top_p, temperature,
- text_split_method, batch_size,
- speed_factor, ref_text_free,
- split_bucket,fragment_interval,
- seed,
- ):
+
+def inference(
+ text,
+ text_lang,
+ ref_audio_path,
+ prompt_text,
+ prompt_lang,
+ top_k,
+ top_p,
+ temperature,
+ text_split_method,
+ batch_size,
+ speed_factor,
+ ref_text_free,
+ split_bucket,
+ fragment_interval,
+ seed,
+):
actual_seed = seed if seed not in [-1, "", None] else random.randrange(1 << 32)
- inputs={
+ inputs = {
"text": text,
"text_lang": dict_language[text_lang],
"ref_audio_path": ref_audio_path,
@@ -106,19 +118,20 @@ def inference(text, text_lang,
"top_p": top_p,
"temperature": temperature,
"text_split_method": cut_method[text_split_method],
- "batch_size":int(batch_size),
- "speed_factor":float(speed_factor),
- "split_bucket":split_bucket,
- "return_fragment":False,
- "fragment_interval":fragment_interval,
- "seed":actual_seed,
+ "batch_size": int(batch_size),
+ "speed_factor": float(speed_factor),
+ "split_bucket": split_bucket,
+ "return_fragment": False,
+ "fragment_interval": fragment_interval,
+ "seed": actual_seed,
}
for item in tts_pipline.run(inputs):
yield item, actual_seed
-
+
+
def custom_sort_key(s):
# 使用正则表达式提取字符串中的数字部分和非数字部分
- parts = re.split('(\d+)', s)
+ parts = re.split("(\d+)", s)
# 将数字部分转换为整数,非数字部分保持不变
parts = [int(part) if part.isdigit() else part for part in parts]
return parts
@@ -126,11 +139,16 @@ def custom_sort_key(s):
def change_choices():
SoVITS_names, GPT_names = get_weights_names()
- return {"choices": sorted(SoVITS_names, key=custom_sort_key), "__type__": "update"}, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
+ return {
+ "choices": sorted(SoVITS_names, key=custom_sort_key),
+ "__type__": "update",
+ }, {"choices": sorted(GPT_names, key=custom_sort_key), "__type__": "update"}
pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
-pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+pretrained_gpt_name = (
+ "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
+)
SoVITS_weight_root = "SoVITS_weights"
GPT_weight_root = "GPT_weights"
os.makedirs(SoVITS_weight_root, exist_ok=True)
@@ -140,122 +158,255 @@ os.makedirs(GPT_weight_root, exist_ok=True)
def get_weights_names():
SoVITS_names = [pretrained_sovits_name]
for name in os.listdir(SoVITS_weight_root):
- if name.endswith(".pth"): SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name))
+ if name.endswith(".pth"):
+ SoVITS_names.append("%s/%s" % (SoVITS_weight_root, name))
GPT_names = [pretrained_gpt_name]
for name in os.listdir(GPT_weight_root):
- if name.endswith(".ckpt"): GPT_names.append("%s/%s" % (GPT_weight_root, name))
+ if name.endswith(".ckpt"):
+ GPT_names.append("%s/%s" % (GPT_weight_root, name))
return SoVITS_names, GPT_names
SoVITS_names, GPT_names = get_weights_names()
-with gr.Blocks(title="GPT-SoVITS WebUI") as app:
- gr.Markdown(
- value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
- )
-
- with gr.Column():
- # with gr.Group():
- gr.Markdown(value=i18n("模型切换"))
- with gr.Row():
- GPT_dropdown = gr.Dropdown(label=i18n("GPT模型列表"), choices=sorted(GPT_names, key=custom_sort_key), value=gpt_path, interactive=True)
- SoVITS_dropdown = gr.Dropdown(label=i18n("SoVITS模型列表"), choices=sorted(SoVITS_names, key=custom_sort_key), value=sovits_path, interactive=True)
- refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
- refresh_button.click(fn=change_choices, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
- SoVITS_dropdown.change(tts_pipline.init_vits_weights, [SoVITS_dropdown], [])
- GPT_dropdown.change(tts_pipline.init_t2s_weights, [GPT_dropdown], [])
-
- with gr.Row():
- with gr.Column():
- gr.Markdown(value=i18n("*请上传并填写参考信息"))
- inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath")
- prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=2)
- with gr.Row():
- prompt_language = gr.Dropdown(
- label=i18n("参考音频的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
- )
- with gr.Column():
- ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
- gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"))
-
- with gr.Column():
- gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
- text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=16, max_lines=16)
- text_language = gr.Dropdown(
- label=i18n("需要合成的语种"), choices=[i18n("中文"), i18n("英文"), i18n("日文"), i18n("中英混合"), i18n("日英混合"), i18n("多语种混合")], value=i18n("中文")
+if __name__ == "__main__":
+ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
+ gr.Markdown(
+ value=i18n(
+ "本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.
如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE."
)
+ )
-
- with gr.Group():
- gr.Markdown(value=i18n("推理设置"))
- with gr.Row():
-
- with gr.Column():
- batch_size = gr.Slider(minimum=1,maximum=200,step=1,label=i18n("batch_size"),value=20,interactive=True)
- fragment_interval = gr.Slider(minimum=0.01,maximum=1,step=0.01,label=i18n("分段间隔(秒)"),value=0.3,interactive=True)
- speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="speed_factor",value=1.0,interactive=True)
- top_k = gr.Slider(minimum=1,maximum=100,step=1,label=i18n("top_k"),value=5,interactive=True)
- top_p = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("top_p"),value=1,interactive=True)
- temperature = gr.Slider(minimum=0,maximum=1,step=0.05,label=i18n("temperature"),value=1,interactive=True)
- with gr.Column():
- how_to_cut = gr.Radio(
- label=i18n("怎么切"),
- choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
- value=i18n("凑四句一切"),
+ with gr.Column():
+ # with gr.Group():
+ gr.Markdown(value=i18n("模型切换"))
+ with gr.Row():
+ GPT_dropdown = gr.Dropdown(
+ label=i18n("GPT模型列表"),
+ choices=sorted(GPT_names, key=custom_sort_key),
+ value=gpt_path,
interactive=True,
)
- with gr.Row():
- split_bucket = gr.Checkbox(label=i18n("数据分桶(可能会降低一点计算量,选就对了)"), value=True, interactive=True, show_label=True)
- seed = gr.Number(label=i18n("随机种子"),value=-1)
- # with gr.Column():
- output = gr.Audio(label=i18n("输出的语音"))
- with gr.Row():
- inference_button = gr.Button(i18n("合成语音"), variant="primary")
- stop_infer = gr.Button(i18n("终止合成"), variant="primary")
-
-
- inference_button.click(
- inference,
- [
- text,text_language, inp_ref,
- prompt_text, prompt_language,
- top_k, top_p, temperature,
- how_to_cut, batch_size,
- speed_factor, ref_text_free,
- split_bucket,fragment_interval,
- seed
- ],
- [output, seed],
- )
- stop_infer.click(tts_pipline.stop, [], [])
+ SoVITS_dropdown = gr.Dropdown(
+ label=i18n("SoVITS模型列表"),
+ choices=sorted(SoVITS_names, key=custom_sort_key),
+ value=sovits_path,
+ interactive=True,
+ )
+ refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary")
+ refresh_button.click(
+ fn=change_choices,
+ inputs=[],
+ outputs=[SoVITS_dropdown, GPT_dropdown],
+ )
+ SoVITS_dropdown.change(
+ tts_pipline.init_vits_weights, [SoVITS_dropdown], []
+ )
+ GPT_dropdown.change(tts_pipline.init_t2s_weights, [GPT_dropdown], [])
- with gr.Group():
- gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
with gr.Row():
- text_inp = gr.Textbox(label=i18n("需要合成的切分前文本"), value="", lines=4)
with gr.Column():
- _how_to_cut = gr.Radio(
- label=i18n("怎么切"),
- choices=[i18n("不切"), i18n("凑四句一切"), i18n("凑50字一切"), i18n("按中文句号。切"), i18n("按英文句号.切"), i18n("按标点符号切"), ],
- value=i18n("凑四句一切"),
+ gr.Markdown(value=i18n("*请上传并填写参考信息"))
+ inp_ref = gr.Audio(
+ label=i18n("请上传3~10秒内参考音频,超过会报错!"), type="filepath"
+ )
+ prompt_text = gr.Textbox(
+ label=i18n("参考音频的文本"), value="", lines=2
+ )
+ with gr.Row():
+ prompt_language = gr.Dropdown(
+ label=i18n("参考音频的语种"),
+ choices=[
+ i18n("中文"),
+ i18n("英文"),
+ i18n("日文"),
+ i18n("中英混合"),
+ i18n("日英混合"),
+ i18n("多语种混合"),
+ ],
+ value=i18n("中文"),
+ )
+ with gr.Column():
+ ref_text_free = gr.Checkbox(
+ label=i18n(
+ "开启无参考文本模式。不填参考文本亦相当于开启。"
+ ),
+ value=False,
interactive=True,
+ show_label=True,
+ )
+ gr.Markdown(
+ i18n(
+ "使用无参考文本模式时建议使用微调的GPT,听不清参考音频说的啥(不晓得写啥)可以开,开启后无视填写的参考文本。"
+ )
)
- cut_text= gr.Button(i18n("切分"), variant="primary")
-
- def to_cut(text_inp, how_to_cut):
- if len(text_inp.strip()) == 0 or text_inp==[]:
- return ""
- method = get_method(cut_method[how_to_cut])
- return method(text_inp)
-
- text_opt = gr.Textbox(label=i18n("切分后文本"), value="", lines=4)
- cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt])
- gr.Markdown(value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。"))
-app.queue(concurrency_count=511, max_size=1022).launch(
- server_name="0.0.0.0",
- inbrowser=True,
- share=is_share,
- server_port=infer_ttswebui,
- quiet=True,
-)
+ with gr.Column():
+ gr.Markdown(value=i18n("*请填写需要合成的目标文本和语种模式"))
+ text = gr.Textbox(
+ label=i18n("需要合成的文本"), value="", lines=16, max_lines=16
+ )
+ text_language = gr.Dropdown(
+ label=i18n("需要合成的语种"),
+ choices=[
+ i18n("中文"),
+ i18n("英文"),
+ i18n("日文"),
+ i18n("中英混合"),
+ i18n("日英混合"),
+ i18n("多语种混合"),
+ ],
+ value=i18n("中文"),
+ )
+
+ with gr.Group():
+ gr.Markdown(value=i18n("推理设置"))
+ with gr.Row():
+
+ with gr.Column():
+ batch_size = gr.Slider(
+ minimum=1,
+ maximum=200,
+ step=1,
+ label=i18n("batch_size"),
+ value=20,
+ interactive=True,
+ )
+ fragment_interval = gr.Slider(
+ minimum=0.01,
+ maximum=1,
+ step=0.01,
+ label=i18n("分段间隔(秒)"),
+ value=0.3,
+ interactive=True,
+ )
+ speed_factor = gr.Slider(
+ minimum=0.25,
+ maximum=4,
+ step=0.05,
+ label="speed_factor",
+ value=1.0,
+ interactive=True,
+ )
+ top_k = gr.Slider(
+ minimum=1,
+ maximum=100,
+ step=1,
+ label=i18n("top_k"),
+ value=5,
+ interactive=True,
+ )
+ top_p = gr.Slider(
+ minimum=0,
+ maximum=1,
+ step=0.05,
+ label=i18n("top_p"),
+ value=1,
+ interactive=True,
+ )
+ temperature = gr.Slider(
+ minimum=0,
+ maximum=1,
+ step=0.05,
+ label=i18n("temperature"),
+ value=1,
+ interactive=True,
+ )
+ with gr.Column():
+ how_to_cut = gr.Radio(
+ label=i18n("怎么切"),
+ choices=[
+ i18n("不切"),
+ i18n("凑四句一切"),
+ i18n("凑50字一切"),
+ i18n("按中文句号。切"),
+ i18n("按英文句号.切"),
+ i18n("按标点符号切"),
+ ],
+ value=i18n("凑四句一切"),
+ interactive=True,
+ )
+ with gr.Row():
+ split_bucket = gr.Checkbox(
+ label=i18n("数据分桶(可能会降低一点计算量,选就对了)"),
+ value=True,
+ interactive=True,
+ show_label=True,
+ )
+ seed = gr.Number(label=i18n("随机种子"), value=-1)
+ # with gr.Column():
+ output = gr.Audio(label=i18n("输出的语音"))
+ with gr.Row():
+ inference_button = gr.Button(
+ i18n("合成语音"), variant="primary"
+ )
+ stop_infer = gr.Button(i18n("终止合成"), variant="primary")
+
+ inference_button.click(
+ inference,
+ [
+ text,
+ text_language,
+ inp_ref,
+ prompt_text,
+ prompt_language,
+ top_k,
+ top_p,
+ temperature,
+ how_to_cut,
+ batch_size,
+ speed_factor,
+ ref_text_free,
+ split_bucket,
+ fragment_interval,
+ seed,
+ ],
+ [output, seed],
+ )
+ stop_infer.click(tts_pipline.stop, [], [])
+
+ with gr.Group():
+ gr.Markdown(
+ value=i18n(
+ "文本切分工具。太长的文本合成出来效果不一定好,所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"
+ )
+ )
+ with gr.Row():
+ text_inp = gr.Textbox(
+ label=i18n("需要合成的切分前文本"), value="", lines=4
+ )
+ with gr.Column():
+ _how_to_cut = gr.Radio(
+ label=i18n("怎么切"),
+ choices=[
+ i18n("不切"),
+ i18n("凑四句一切"),
+ i18n("凑50字一切"),
+ i18n("按中文句号。切"),
+ i18n("按英文句号.切"),
+ i18n("按标点符号切"),
+ ],
+ value=i18n("凑四句一切"),
+ interactive=True,
+ )
+ cut_text = gr.Button(i18n("切分"), variant="primary")
+
+ def to_cut(text_inp, how_to_cut):
+ if len(text_inp.strip()) == 0 or text_inp == []:
+ return ""
+ method = get_method(cut_method[how_to_cut])
+ return method(text_inp)
+
+ text_opt = gr.Textbox(label=i18n("切分后文本"), value="", lines=4)
+ cut_text.click(to_cut, [text_inp, _how_to_cut], [text_opt])
+ gr.Markdown(
+ value=i18n("后续将支持转音素、手工修改音素、语音合成分步执行。")
+ )
+
+ app.queue(concurrency_count=511, max_size=1022).launch(
+ server_name="0.0.0.0",
+ inbrowser=True,
+ share=is_share,
+ server_port=infer_ttswebui,
+ quiet=True,
+ )