mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2026-01-10 04:17:01 +08:00
Compare commits
18 Commits
216f60e39c
...
e78aada32e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e78aada32e | ||
|
|
c767f0b83b | ||
|
|
9080a967d5 | ||
|
|
51df9f7384 | ||
|
|
bfca0f6b2d | ||
|
|
abe984395c | ||
|
|
cc89c3660e | ||
|
|
539ad38d31 | ||
|
|
6df7921d56 | ||
|
|
8d212a9255 | ||
|
|
2fb92e74a2 | ||
|
|
be16f387f1 | ||
|
|
7e6a607b9e | ||
|
|
ab5e8dc0ae | ||
|
|
b666becb19 | ||
|
|
47426d18e7 | ||
|
|
b8de0ec0ac | ||
|
|
f20f17c2c0 |
@ -707,10 +707,12 @@ class Text2SemanticDecoder(nn.Module):
|
|||||||
|
|
||||||
if idx == 0:
|
if idx == 0:
|
||||||
attn_mask = F.pad(attn_mask[:, :, -1].unsqueeze(-2), (0, 1), value=False)
|
attn_mask = F.pad(attn_mask[:, :, -1].unsqueeze(-2), (0, 1), value=False)
|
||||||
logits = logits[:, :-1]
|
|
||||||
else:
|
else:
|
||||||
attn_mask = F.pad(attn_mask, (0, 1), value=False)
|
attn_mask = F.pad(attn_mask, (0, 1), value=False)
|
||||||
|
|
||||||
|
if idx < 11: ###至少预测出10个token不然不给停止(0.4s)
|
||||||
|
logits = logits[:, :-1]
|
||||||
|
|
||||||
samples = sample(
|
samples = sample(
|
||||||
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
|
logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
|
||||||
)[0]
|
)[0]
|
||||||
|
|||||||
@ -1008,7 +1008,7 @@ class TTS:
|
|||||||
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
|
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
|
||||||
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
||||||
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
||||||
"top_k": 5, # int. top k sampling
|
"top_k": 15, # int. top k sampling
|
||||||
"top_p": 1, # float. top p sampling
|
"top_p": 1, # float. top p sampling
|
||||||
"temperature": 1, # float. temperature for sampling
|
"temperature": 1, # float. temperature for sampling
|
||||||
"text_split_method": "cut1", # str. text split method, see text_segmentation_method.py for details.
|
"text_split_method": "cut1", # str. text split method, see text_segmentation_method.py for details.
|
||||||
@ -1039,7 +1039,7 @@ class TTS:
|
|||||||
aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
|
aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
|
||||||
prompt_text: str = inputs.get("prompt_text", "")
|
prompt_text: str = inputs.get("prompt_text", "")
|
||||||
prompt_lang: str = inputs.get("prompt_lang", "")
|
prompt_lang: str = inputs.get("prompt_lang", "")
|
||||||
top_k: int = inputs.get("top_k", 5)
|
top_k: int = inputs.get("top_k", 15)
|
||||||
top_p: float = inputs.get("top_p", 1)
|
top_p: float = inputs.get("top_p", 1)
|
||||||
temperature: float = inputs.get("temperature", 1)
|
temperature: float = inputs.get("temperature", 1)
|
||||||
text_split_method: str = inputs.get("text_split_method", "cut1")
|
text_split_method: str = inputs.get("text_split_method", "cut1")
|
||||||
|
|||||||
@ -385,7 +385,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
|
|||||||
minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
|
minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
|
||||||
)
|
)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
|
top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True)
|
||||||
top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
|
top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
temperature = gr.Slider(
|
temperature = gr.Slider(
|
||||||
|
|||||||
@ -347,7 +347,7 @@ Use v4 from v1/v2/v3 environment:
|
|||||||
|
|
||||||
2. Clone the latest codes from github.
|
2. Clone the latest codes from github.
|
||||||
|
|
||||||
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
|
||||||
|
|
||||||
## V2Pro Release Notes
|
## V2Pro Release Notes
|
||||||
|
|
||||||
|
|||||||
@ -27,7 +27,7 @@ POST:
|
|||||||
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
|
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
|
||||||
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
||||||
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
||||||
"top_k": 5, # int. top k sampling
|
"top_k": 15, # int. top k sampling
|
||||||
"top_p": 1, # float. top p sampling
|
"top_p": 1, # float. top p sampling
|
||||||
"temperature": 1, # float. temperature for sampling
|
"temperature": 1, # float. temperature for sampling
|
||||||
"text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
|
"text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
|
||||||
@ -158,7 +158,7 @@ class TTS_Request(BaseModel):
|
|||||||
aux_ref_audio_paths: list = None
|
aux_ref_audio_paths: list = None
|
||||||
prompt_lang: str = None
|
prompt_lang: str = None
|
||||||
prompt_text: str = ""
|
prompt_text: str = ""
|
||||||
top_k: int = 5
|
top_k: int = 15
|
||||||
top_p: float = 1
|
top_p: float = 1
|
||||||
temperature: float = 1
|
temperature: float = 1
|
||||||
text_split_method: str = "cut5"
|
text_split_method: str = "cut5"
|
||||||
@ -355,7 +355,7 @@ async def tts_handle(req: dict):
|
|||||||
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
|
"aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
|
||||||
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
"prompt_text": "", # str.(optional) prompt text for the reference audio
|
||||||
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
"prompt_lang": "", # str.(required) language of the prompt text for the reference audio
|
||||||
"top_k": 5, # int. top k sampling
|
"top_k": 15, # int. top k sampling
|
||||||
"top_p": 1, # float. top p sampling
|
"top_p": 1, # float. top p sampling
|
||||||
"temperature": 1, # float. temperature for sampling
|
"temperature": 1, # float. temperature for sampling
|
||||||
"text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
|
"text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
|
||||||
@ -460,7 +460,7 @@ async def tts_get_endpoint(
|
|||||||
aux_ref_audio_paths: list = None,
|
aux_ref_audio_paths: list = None,
|
||||||
prompt_lang: str = None,
|
prompt_lang: str = None,
|
||||||
prompt_text: str = "",
|
prompt_text: str = "",
|
||||||
top_k: int = 5,
|
top_k: int = 15,
|
||||||
top_p: float = 1,
|
top_p: float = 1,
|
||||||
temperature: float = 1,
|
temperature: float = 1,
|
||||||
text_split_method: str = "cut5",
|
text_split_method: str = "cut5",
|
||||||
|
|||||||
2
go-webui-simple-mode.bat
Normal file
2
go-webui-simple-mode.bat
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
runtime\python.exe -I webui_simple.py zh_CN
|
||||||
|
pause
|
||||||
@ -19,7 +19,7 @@ torchaudio
|
|||||||
modelscope
|
modelscope
|
||||||
sentencepiece
|
sentencepiece
|
||||||
transformers>=4.43,<=4.50
|
transformers>=4.43,<=4.50
|
||||||
peft
|
peft<0.18.0
|
||||||
chardet
|
chardet
|
||||||
PyYAML
|
PyYAML
|
||||||
psutil
|
psutil
|
||||||
|
|||||||
2083
webui_simple.py
Normal file
2083
webui_simple.py
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user