diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index 4fa00e4..c677d77 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -642,7 +642,7 @@ class TTS:
         "text": "", # str.(required) text to be synthesized
         "text_lang: "", # str.(required) language of the text to be synthesized
         "ref_audio_path": "", # str.(required) reference audio path
-        "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker synthesis
+        "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
         "prompt_text": "", # str.(optional) prompt text for the reference audio
         "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
         "top_k": 5, # int. top k sampling
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index e6ed87a..5aff4ae 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -397,7 +397,8 @@ def merge_short_text_in_array(texts, threshold):
 ##ref_wav_path+prompt_text+prompt_language+text(单个)+text_language+top_k+top_p+temperature
 # cache_tokens={}#暂未实现清理机制
 cache= {}
-def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free = False,speed=1,if_freeze=False,inp_refs=123):
+def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language, how_to_cut=i18n("不切"), top_k=20, top_p=0.6, temperature=0.6, ref_free
+                =False,speed=1,if_freeze=False,inp_refs=None):
     global cache
     if ref_wav_path:pass
     else:gr.Warning(i18n('请上传参考音频'))
diff --git a/api_v2.py b/api_v2.py
index 2414af0..a9faaeb 100644
--- a/api_v2.py
+++ b/api_v2.py
@@ -24,7 +24,7 @@ POST:
     "text": "", # str.(required) text to be synthesized
     "text_lang: "", # str.(required) language of the text to be synthesized
     "ref_audio_path": "", # str.(required) reference audio path
-    "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker synthesis
+    "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
     "prompt_text": "", # str.(optional) prompt text for the reference audio
     "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
     "top_k": 5, # int. top k sampling
@@ -34,7 +34,6 @@ POST:
     "batch_size": 1, # int. batch size for inference
     "batch_threshold": 0.75, # float. threshold for batch splitting.
     "split_bucket: True, # bool. whether to split the batch into multiple buckets.
-    "return_fragment": False, # bool. step by step return the audio fragment.
     "speed_factor":1.0, # float. control the speed of the synthesized audio.
     "streaming_mode": False, # bool. whether to return a streaming response.
     "seed": -1, # int. random seed for reproducibility.
@@ -302,13 +301,14 @@ async def tts_handle(req:dict):
     """
 
     streaming_mode = req.get("streaming_mode", False)
+    return_fragment = req.get("return_fragment", False)
     media_type = req.get("media_type", "wav")
 
     check_res = check_params(req)
     if check_res is not None:
         return check_res
 
-    if streaming_mode:
+    if streaming_mode or return_fragment:
         req["return_fragment"] = True
 
     try:
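
For context, a minimal client-side sketch of exercising the `return_fragment` flag that the patched tts_handle now reads from the request body. The host, port, file paths, and text values below are assumptions for illustration (api_v2.py listens on port 9880 by default), not part of this patch:

# Minimal sketch: POST to the api_v2.py /tts endpoint with the
# return_fragment flag that tts_handle now reads from the request.
# Assumptions: the server is running locally on its default port 9880,
# and ref.wav exists next to this client script.
import requests

payload = {
    "text": "Hello, world.",                # required
    "text_lang": "en",                      # required
    "ref_audio_path": "ref.wav",            # required
    "prompt_text": "reference transcript",  # optional
    "prompt_lang": "en",                    # required
    "return_fragment": True,                # now honored even when streaming_mode is False
}

resp = requests.post("http://127.0.0.1:9880/tts", json=payload)
resp.raise_for_status()
with open("output.wav", "wb") as f:
    f.write(resp.content)  # write whatever audio the server returned

Before this patch, fragment-by-fragment generation was forced only by streaming_mode; with the change, either flag switches the synthesis pipeline into fragment mode.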