""" # WebAPI文档 ` python api_v2.py -a 127.0.0.1 -p 9880 -c GPT_SoVITS/configs/tts_infer.yaml ` ## 执行参数: `-a` - `绑定地址, 默认"127.0.0.1"` `-p` - `绑定端口, 默认9880` `-c` - `TTS配置文件路径, 默认"GPT_SoVITS/configs/tts_infer.yaml"` ## 调用: ### 推理 endpoint: `/tts` GET: ``` http://127.0.0.1:9880/tts?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_lang=zh&ref_audio_path=archive_jingyuan_1.wav&prompt_lang=zh&prompt_text=我是「罗浮」云骑将军景元。不必拘谨,「将军」只是一时的身份,你称呼我景元便可&text_split_method=cut5&batch_size=1&media_type=wav&streaming_mode=true ``` POST: ```json { "text": "", # str.(required) text to be synthesized "text_lang: "", # str.(required) language of the text to be synthesized "ref_audio_path": "", # str.(required) reference audio path "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion "prompt_text": "", # str.(optional) prompt text for the reference audio "prompt_lang": "", # str.(required) language of the prompt text for the reference audio "top_k": 15, # int. top k sampling "top_p": 1, # float. top p sampling "temperature": 1, # float. temperature for sampling "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details. "batch_size": 1, # int. batch size for inference "batch_threshold": 0.75, # float. threshold for batch splitting. "split_bucket": True, # bool. whether to split the batch into multiple buckets. "speed_factor":1.0, # float. control the speed of the synthesized audio. "fragment_interval":0.3, # float. to control the interval of the audio fragment. "seed": -1, # int. random seed for reproducibility. "parallel_infer": True, # bool. whether to use parallel inference. "repetition_penalty": 1.35, # float. repetition penalty for T2S model. "sample_steps": 32, # int. 仅 v3/v4 vocoder 路径使用;当前 v2/v2ProPlus 主线可忽略。 "super_sampling": False, # bool. 仅 v3/v4 路径使用;不属于当前 v2/v2ProPlus 正式支持目标。 "streaming_mode": False, # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed ) "overlap_length": 2, # int. overlap length of semantic tokens for streaming mode. "min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size) } ``` RESP: 成功: 直接返回 wav 音频流, http code 200 失败: 返回包含错误信息的 json, http code 400 ### 命令控制 endpoint: `/control` command: "restart": 重新运行 "exit": 结束运行 GET: ``` http://127.0.0.1:9880/control?command=restart ``` POST: ```json { "command": "restart" } ``` RESP: 无 ### 切换GPT模型 endpoint: `/set_gpt_weights` GET: ``` http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1v3.ckpt ``` RESP: 成功: 返回"success", http code 200 失败: 返回包含错误信息的 json, http code 400 ### 切换Sovits模型 endpoint: `/set_sovits_weights` GET: ``` http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth ``` RESP: 成功: 返回"success", http code 200 失败: 返回包含错误信息的 json, http code 400 """ import os import sys import traceback from typing import Union now_dir = os.getcwd() sys.path.append(now_dir) sys.path.append("%s/GPT_SoVITS" % (now_dir)) import argparse import signal from fastapi import FastAPI, Response from fastapi.responses import StreamingResponse, JSONResponse import uvicorn from tools.i18n.i18n import I18nAuto from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names from GPT_SoVITS.TTS_infer_pack.unified_engine import RuntimeControlCallbacks, UnifiedTTSEngine from pydantic import BaseModel # print(sys.path) i18n = I18nAuto() cut_method_names = get_cut_method_names() parser = argparse.ArgumentParser(description="GPT-SoVITS api") parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径") parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1") parser.add_argument("-p", "--port", type=int, default="9880", help="default: 9880") args = parser.parse_args() config_path = args.tts_config # device = args.device port = args.port host = args.bind_addr argv = sys.argv if config_path in [None, ""]: config_path = "GPT-SoVITS/configs/tts_infer.yaml" tts_config = TTS_Config(config_path) print(tts_config) tts_pipeline = TTS(tts_config) tts_engine = UnifiedTTSEngine( tts_pipeline, cut_method_names=cut_method_names, control_callbacks=RuntimeControlCallbacks( restart=lambda: os.execl(sys.executable, sys.executable, *argv), exit=lambda: os.kill(os.getpid(), signal.SIGTERM), ), ) APP = FastAPI() class TTS_Request(BaseModel): text: str = None text_lang: str = None ref_audio_path: str = None aux_ref_audio_paths: list = None prompt_lang: str = None prompt_text: str = "" top_k: int = 15 top_p: float = 1 temperature: float = 1 text_split_method: str = "cut5" batch_size: int = 1 batch_threshold: float = 0.75 split_bucket: bool = True speed_factor: float = 1.0 fragment_interval: float = 0.3 seed: int = -1 media_type: str = "wav" streaming_mode: Union[bool, int] = False parallel_infer: bool = True repetition_penalty: float = 1.35 sample_steps: int = 32 super_sampling: bool = False overlap_length: int = 2 min_chunk_length: int = 16 def _lower_or_none(value: str | None) -> str | None: return value.lower() if isinstance(value, str) else value async def tts_handle(req: dict): """ Text to speech handler. Args: req (dict): { "text": "", # str.(required) text to be synthesized "text_lang: "", # str.(required) language of the text to be synthesized "ref_audio_path": "", # str.(required) reference audio path "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion "prompt_text": "", # str.(optional) prompt text for the reference audio "prompt_lang": "", # str.(required) language of the prompt text for the reference audio "top_k": 15, # int. top k sampling "top_p": 1, # float. top p sampling "temperature": 1, # float. temperature for sampling "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details. "batch_size": 1, # int. batch size for inference "batch_threshold": 0.75, # float. threshold for batch splitting. "split_bucket": True, # bool. whether to split the batch into multiple buckets. "speed_factor":1.0, # float. control the speed of the synthesized audio. "fragment_interval":0.3, # float. to control the interval of the audio fragment. "seed": -1, # int. random seed for reproducibility. "parallel_infer": True, # bool. whether to use parallel inference. "repetition_penalty": 1.35, # float. repetition penalty for T2S model. "sample_steps": 32, # int. number of sampling steps for VITS model V3. "super_sampling": False, # bool. only for v3/v4; not part of current v2/v2ProPlus mainline. "streaming_mode": False, # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed ) "overlap_length": 2, # int. overlap length of semantic tokens for streaming mode. "min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size) } returns: StreamingResponse: audio stream response. """ try: result = await tts_engine.run_direct_tts_async(req) if result.streaming: return StreamingResponse(result.audio_generator, media_type=f"audio/{result.media_type}") return Response(result.audio_bytes, media_type=f"audio/{result.media_type}") except Exception as e: return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)}) @APP.get("/control") async def control(command: str = None): if command is None: return JSONResponse(status_code=400, content={"message": "command is required"}) try: tts_engine.handle_control(command) return JSONResponse(status_code=200, content={"message": "success"}) except Exception as e: return JSONResponse(status_code=400, content={"message": "control failed", "Exception": str(e)}) @APP.get("/tts") async def tts_get_endpoint( text: str = None, text_lang: str = None, ref_audio_path: str = None, aux_ref_audio_paths: list = None, prompt_lang: str = None, prompt_text: str = "", top_k: int = 15, top_p: float = 1, temperature: float = 1, text_split_method: str = "cut5", batch_size: int = 1, batch_threshold: float = 0.75, split_bucket: bool = True, speed_factor: float = 1.0, fragment_interval: float = 0.3, seed: int = -1, media_type: str = "wav", parallel_infer: bool = True, repetition_penalty: float = 1.35, sample_steps: int = 32, super_sampling: bool = False, streaming_mode: Union[bool, int] = False, overlap_length: int = 2, min_chunk_length: int = 16, ): req = { "text": text, "text_lang": _lower_or_none(text_lang), "ref_audio_path": ref_audio_path, "aux_ref_audio_paths": aux_ref_audio_paths, "prompt_text": prompt_text, "prompt_lang": _lower_or_none(prompt_lang), "top_k": top_k, "top_p": top_p, "temperature": temperature, "text_split_method": text_split_method, "batch_size": int(batch_size), "batch_threshold": float(batch_threshold), "speed_factor": float(speed_factor), "split_bucket": split_bucket, "fragment_interval": fragment_interval, "seed": seed, "media_type": media_type, "streaming_mode": streaming_mode, "parallel_infer": parallel_infer, "repetition_penalty": float(repetition_penalty), "sample_steps": int(sample_steps), "super_sampling": super_sampling, "overlap_length": int(overlap_length), "min_chunk_length": int(min_chunk_length), } return await tts_handle(req) @APP.post("/tts") async def tts_post_endpoint(request: TTS_Request): req = request.dict() return await tts_handle(req) @APP.get("/set_refer_audio") async def set_refer_aduio(refer_audio_path: str = None): try: payload = tts_engine.set_refer_audio(refer_audio_path) except Exception as e: return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)}) return JSONResponse(status_code=200, content=payload) # @APP.post("/set_refer_audio") # async def set_refer_aduio_post(audio_file: UploadFile = File(...)): # try: # # 检查文件类型,确保是音频文件 # if not audio_file.content_type.startswith("audio/"): # return JSONResponse(status_code=400, content={"message": "file type is not supported"}) # os.makedirs("uploaded_audio", exist_ok=True) # save_path = os.path.join("uploaded_audio", audio_file.filename) # # 保存音频文件到服务器上的一个目录 # with open(save_path , "wb") as buffer: # buffer.write(await audio_file.read()) # tts_pipeline.set_ref_audio(save_path) # except Exception as e: # return JSONResponse(status_code=400, content={"message": f"set refer audio failed", "Exception": str(e)}) # return JSONResponse(status_code=200, content={"message": "success"}) @APP.get("/set_gpt_weights") async def set_gpt_weights(weights_path: str = None): try: payload = tts_engine.set_gpt_weights(weights_path) except Exception as e: return JSONResponse(status_code=400, content={"message": "change gpt weight failed", "Exception": str(e)}) return JSONResponse(status_code=200, content=payload) @APP.get("/set_sovits_weights") async def set_sovits_weights(weights_path: str = None): try: payload = tts_engine.set_sovits_weights(weights_path) except Exception as e: return JSONResponse(status_code=400, content={"message": "change sovits weight failed", "Exception": str(e)}) return JSONResponse(status_code=200, content=payload) if __name__ == "__main__": try: if host == "None": # 在调用时使用 -a None 参数,可以让api监听双栈 host = None uvicorn.run(app=APP, host=host, port=port, workers=1) except Exception: traceback.print_exc() os.kill(os.getpid(), signal.SIGTERM) exit(0)