mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-12-01 05:53:38 +08:00
更友好的流模式选项 (#2678)
This commit is contained in:
parent
c85c54eca9
commit
6fb441f65e
48
api_v2.py
48
api_v2.py
@ -41,11 +41,9 @@ POST:
|
||||
"repetition_penalty": 1.35, # float. repetition penalty for T2S model.
|
||||
"sample_steps": 32, # int. number of sampling steps for VITS model V3.
|
||||
"super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
|
||||
"return_fragment": False, # bool. step by step return the audio fragment. (Best Quality, Slowest response speed. old version of streaming mode)
|
||||
"streaming_mode": False, # bool. return audio chunk by chunk. (Medium quality, Slow response speed)
|
||||
"streaming_mode": False, # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed )
|
||||
"overlap_length": 2, # int. overlap length of semantic tokens for streaming mode.
|
||||
"min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
|
||||
"fixed_length_chunk": False, # bool. When turned on, it can achieve faster streaming response, but with lower quality. (lower quality, faster response speed)
|
||||
"min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
|
||||
}
|
||||
```
|
||||
|
||||
@ -106,7 +104,7 @@ RESP:
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from typing import Generator
|
||||
from typing import Generator, Union
|
||||
|
||||
now_dir = os.getcwd()
|
||||
sys.path.append(now_dir)
|
||||
@ -171,15 +169,13 @@ class TTS_Request(BaseModel):
|
||||
fragment_interval: float = 0.3
|
||||
seed: int = -1
|
||||
media_type: str = "wav"
|
||||
streaming_mode: bool = False
|
||||
streaming_mode: Union[bool, int] = False
|
||||
parallel_infer: bool = True
|
||||
repetition_penalty: float = 1.35
|
||||
sample_steps: int = 32
|
||||
super_sampling: bool = False
|
||||
overlap_length: int = 2
|
||||
min_chunk_length: int = 16
|
||||
return_fragment: bool = False
|
||||
fixed_length_chunk: bool = False
|
||||
|
||||
|
||||
def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
|
||||
@ -373,11 +369,9 @@ async def tts_handle(req: dict):
|
||||
"repetition_penalty": 1.35, # float. repetition penalty for T2S model.
|
||||
"sample_steps": 32, # int. number of sampling steps for VITS model V3.
|
||||
"super_sampling": False, # bool. whether to use super-sampling for audio when using VITS model V3.
|
||||
"return_fragment": False, # bool. step by step return the audio fragment. (Best Quality, Slowest response speed. old version of streaming mode)
|
||||
"streaming_mode": False, # bool. return audio chunk by chunk. (Medium quality, Slow response speed)
|
||||
"streaming_mode": False, # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed )
|
||||
"overlap_length": 2, # int. overlap length of semantic tokens for streaming mode.
|
||||
"min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
|
||||
"fixed_length_chunk": False, # bool. When turned on, it can achieve faster streaming response, but with lower quality. (lower quality, faster response speed)
|
||||
"min_chunk_length": 16, # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
|
||||
}
|
||||
returns:
|
||||
StreamingResponse: audio stream response.
|
||||
@ -390,9 +384,33 @@ async def tts_handle(req: dict):
|
||||
check_res = check_params(req)
|
||||
if check_res is not None:
|
||||
return check_res
|
||||
|
||||
if streaming_mode == 0:
|
||||
streaming_mode = False
|
||||
return_fragment = False
|
||||
fixed_length_chunk = False
|
||||
elif streaming_mode == 1:
|
||||
streaming_mode = False
|
||||
return_fragment = True
|
||||
fixed_length_chunk = False
|
||||
elif streaming_mode == 2:
|
||||
streaming_mode = True
|
||||
return_fragment = False
|
||||
fixed_length_chunk = False
|
||||
elif streaming_mode == 3:
|
||||
streaming_mode = True
|
||||
return_fragment = False
|
||||
fixed_length_chunk = True
|
||||
|
||||
else:
|
||||
return JSONResponse(status_code=400, content={"message": f"the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"})
|
||||
|
||||
req["streaming_mode"] = streaming_mode
|
||||
req["return_fragment"] = return_fragment
|
||||
req["fixed_length_chunk"] = fixed_length_chunk
|
||||
|
||||
print(f"{streaming_mode} {return_fragment} {fixed_length_chunk}")
|
||||
|
||||
streaming_mode = streaming_mode or return_fragment
|
||||
|
||||
|
||||
@ -457,11 +475,9 @@ async def tts_get_endpoint(
|
||||
repetition_penalty: float = 1.35,
|
||||
sample_steps: int = 32,
|
||||
super_sampling: bool = False,
|
||||
return_fragment: bool = False,
|
||||
streaming_mode: bool = False,
|
||||
streaming_mode: Union[bool, int] = False,
|
||||
overlap_length: int = 2,
|
||||
min_chunk_length: int = 16,
|
||||
fixed_length_chunk: bool = False,
|
||||
):
|
||||
req = {
|
||||
"text": text,
|
||||
@ -488,8 +504,6 @@ async def tts_get_endpoint(
|
||||
"super_sampling": super_sampling,
|
||||
"overlap_length": int(overlap_length),
|
||||
"min_chunk_length": int(min_chunk_length),
|
||||
"return_fragment": return_fragment,
|
||||
"fixed_length_chunk": fixed_length_chunk
|
||||
}
|
||||
return await tts_handle(req)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user