Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git, synced 2026-01-09 11:57:01 +08:00

Compare commits: 18 commits, 216f60e39c ... e78aada32e
| SHA1 |
|---|
| e78aada32e |
| c767f0b83b |
| 9080a967d5 |
| 51df9f7384 |
| bfca0f6b2d |
| abe984395c |
| cc89c3660e |
| 539ad38d31 |
| 6df7921d56 |
| 8d212a9255 |
| 2fb92e74a2 |
| be16f387f1 |
| 7e6a607b9e |
| ab5e8dc0ae |
| b666becb19 |
| 47426d18e7 |
| b8de0ec0ac |
| f20f17c2c0 |
@@ -707,10 +707,12 @@ class Text2SemanticDecoder(nn.Module):
             if idx == 0:
                 attn_mask = F.pad(attn_mask[:, :, -1].unsqueeze(-2), (0, 1), value=False)
                 logits = logits[:, :-1]
             else:
                 attn_mask = F.pad(attn_mask, (0, 1), value=False)

+            if idx < 11:  ###至少预测出10个token不然不给停止(0.4s)
+                logits = logits[:, :-1]

             samples = sample(
                 logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
             )[0]
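The inline comment on the added guard reads, roughly, "require at least 10 predicted tokens before allowing a stop (about 0.4 s)": while idx < 11, the last logit column, which holds the stop token, is sliced off so sampling cannot terminate early. Below is a minimal standalone sketch of the same idea; the function name, the vocabulary size, and the EOS-in-last-column convention are illustrative assumptions, not code from this change.

```python
import torch


def apply_min_length_guard(logits: torch.Tensor, step: int, min_tokens: int = 10) -> torch.Tensor:
    """Drop the stop-token column until at least `min_tokens` tokens have been generated.

    Assumes `logits` has shape (batch, vocab) with the stop/EOS token in the last
    column, mirroring the `if idx < 11: logits = logits[:, :-1]` check in the diff above.
    """
    if step < min_tokens + 1:
        return logits[:, :-1]  # stop token removed, so it can never be sampled
    return logits


# Toy usage with a made-up vocabulary of 1024 tokens plus one EOS column.
fake_logits = torch.randn(1, 1025)
print(apply_min_length_guard(fake_logits, step=3).shape)   # torch.Size([1, 1024])
print(apply_min_length_guard(fake_logits, step=20).shape)  # torch.Size([1, 1025])
```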
@@ -1008,7 +1008,7 @@ class TTS:
             "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
             "prompt_text": "", # str.(optional) prompt text for the reference audio
             "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-            "top_k": 5, # int. top k sampling
+            "top_k": 15, # int. top k sampling
             "top_p": 1, # float. top p sampling
             "temperature": 1, # float. temperature for sampling
             "text_split_method": "cut1", # str. text split method, see text_segmentation_method.py for details.
@@ -1039,7 +1039,7 @@ class TTS:
         aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
         prompt_text: str = inputs.get("prompt_text", "")
         prompt_lang: str = inputs.get("prompt_lang", "")
-        top_k: int = inputs.get("top_k", 5)
+        top_k: int = inputs.get("top_k", 15)
         top_p: float = inputs.get("top_p", 1)
         temperature: float = inputs.get("temperature", 1)
         text_split_method: str = inputs.get("text_split_method", "cut1")
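Both hunks raise the default top_k used by TTS.run from 5 to 15, once in the parameter docstring and once in the inputs.get fallback; requests that pass top_k explicitly behave exactly as before, since the new value only applies when the key is absent. A small illustration of that lookup (the dicts below are stand-ins, not real TTS.run inputs):

```python
# Stand-in request dicts; only the "top_k" lookup is being illustrated.
defaulted = {"text": "hello", "prompt_lang": "en"}             # no "top_k" key
explicit = {"text": "hello", "prompt_lang": "en", "top_k": 5}  # caller-chosen value

print(defaulted.get("top_k", 15))  # 15 -> the new default applies
print(explicit.get("top_k", 15))   # 5  -> an explicit value still wins
```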
@@ -385,7 +385,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
                     minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
                 )
                 with gr.Row():
-                    top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
+                    top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True)
                     top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
                 with gr.Row():
                     temperature = gr.Slider(
@@ -347,7 +347,7 @@ Use v4 from v1/v2/v3 environment:

 2. Clone the latest codes from github.

-3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
+3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.

 ## V2Pro Release Notes
@@ -27,7 +27,7 @@ POST:
     "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
     "prompt_text": "", # str.(optional) prompt text for the reference audio
     "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-    "top_k": 5, # int. top k sampling
+    "top_k": 15, # int. top k sampling
     "top_p": 1, # float. top p sampling
     "temperature": 1, # float. temperature for sampling
     "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
@@ -158,7 +158,7 @@ class TTS_Request(BaseModel):
     aux_ref_audio_paths: list = None
     prompt_lang: str = None
     prompt_text: str = ""
-    top_k: int = 5
+    top_k: int = 15
     top_p: float = 1
     temperature: float = 1
     text_split_method: str = "cut5"
@@ -355,7 +355,7 @@ async def tts_handle(req: dict):
             "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
             "prompt_text": "", # str.(optional) prompt text for the reference audio
             "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-            "top_k": 5, # int. top k sampling
+            "top_k": 15, # int. top k sampling
             "top_p": 1, # float. top p sampling
             "temperature": 1, # float. temperature for sampling
             "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
@@ -460,7 +460,7 @@ async def tts_get_endpoint(
     aux_ref_audio_paths: list = None,
     prompt_lang: str = None,
     prompt_text: str = "",
-    top_k: int = 5,
+    top_k: int = 15,
     top_p: float = 1,
     temperature: float = 1,
     text_split_method: str = "cut5",
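All four hunks make the same change in the API layer: the documented and default top_k moves from 5 to 15 for both the POST body and the GET query parameters. A hypothetical client call follows; the /tts route, the 127.0.0.1:9880 address, and the text/text_lang/ref_audio_path fields are assumptions drawn from the wider project, not shown in this diff.

```python
# Hypothetical POST against the api_v2 service; route, host, port, and the
# fields not shown in the excerpt above (text, text_lang, ref_audio_path) are assumptions.
import requests

payload = {
    "text": "Hello there.",
    "text_lang": "en",
    "ref_audio_path": "ref.wav",   # placeholder reference clip
    "prompt_text": "",
    "prompt_lang": "en",
    # "top_k" omitted on purpose: the server-side default is now 15 instead of 5
    "top_p": 1,
    "temperature": 1,
    "text_split_method": "cut5",
}

resp = requests.post("http://127.0.0.1:9880/tts", json=payload)
resp.raise_for_status()
with open("output.wav", "wb") as f:   # assumes the response body is the rendered audio
    f.write(resp.content)
```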
go-webui-simple-mode.bat (new file, 2 lines)

@@ -0,0 +1,2 @@
+runtime\python.exe -I webui_simple.py zh_CN
+pause
@@ -19,7 +19,7 @@ torchaudio
 modelscope
 sentencepiece
 transformers>=4.43,<=4.50
-peft
+peft<0.18.0
 chardet
 PyYAML
 psutil
webui_simple.py (new file, 2083 lines; diff suppressed because it is too large)