Compare commits

7 Commits

| Author | SHA1 | Message | Date |
|---|---|---|---|
| Karasukaigan | e78aada32e | Merge 539ad38d310d452aa1a08e659d5eac22f843a01d into c767f0b83b998e996a4d230d86da575a03f54a3f | 2026-01-05 00:00:20 -08:00 |
| ChasonJiang | c767f0b83b | Fix bug (#2704): fix bug; fallback and bug fix | 2025-12-30 16:00:21 +08:00 |
| ChasonJiang | 9080a967d5 | Fix sampling error (#2703) | 2025-12-30 15:21:03 +08:00 |
| sushistack | 51df9f7384 | Fix model file name in README instructions (#2700) | 2025-12-25 16:44:21 +08:00 |
| ChasonJiang | bfca0f6b2d | Align decoding strategy with naive_infer to prevent dropped sentences (#2697) | 2025-12-19 17:37:19 +08:00 |
| ChasonJiang | abe984395c | Align GPT top_k default sampling parameters (#2696) | 2025-12-19 16:05:36 +08:00 |
| RVC-Boss | cc89c3660e | Update requirements.txt | 2025-12-19 15:54:54 +08:00 |
6 changed files with 12 additions and 10 deletions

@@ -707,10 +707,12 @@ class Text2SemanticDecoder(nn.Module):
             if idx == 0:
                 attn_mask = F.pad(attn_mask[:, :, -1].unsqueeze(-2), (0, 1), value=False)
                 logits = logits[:, :-1]
             else:
                 attn_mask = F.pad(attn_mask, (0, 1), value=False)
+            if idx < 11:  ### require at least 10 predicted tokens (~0.4s) before allowing the stop token
+                logits = logits[:, :-1]
             samples = sample(
                 logits, y, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, temperature=temperature
             )[0]
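The two added lines keep the decoder from stopping before it has produced at least ten semantic tokens (about 0.4 s of audio), which is what prevents premature cut-offs on short inputs. A minimal sketch of the same idea, assuming as the diff does that the stop token occupies the last vocabulary column; the helper name and `min_tokens` parameter are illustrative, not from the repository:

```python
import torch

def mask_early_stop(logits: torch.Tensor, step: int, min_tokens: int = 10) -> torch.Tensor:
    # Drop the stop-token column (assumed to be the last vocabulary entry,
    # matching the `logits[:, :-1]` slicing in the diff) until at least
    # `min_tokens` tokens have been generated.
    if step < min_tokens + 1:  # mirrors the `idx < 11` guard above
        return logits[:, :-1]
    return logits
```

Slicing the column out, rather than setting it to `-inf`, means the subsequent `top_k`/`top_p` filtering never sees the stop token at all.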

@@ -1008,7 +1008,7 @@ class TTS:
     "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
     "prompt_text": "", # str.(optional) prompt text for the reference audio
     "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-    "top_k": 5, # int. top k sampling
+    "top_k": 15, # int. top k sampling
     "top_p": 1, # float. top p sampling
     "temperature": 1, # float. temperature for sampling
     "text_split_method": "cut1", # str. text split method, see text_segmentation_method.py for details.
@@ -1039,7 +1039,7 @@ class TTS:
     aux_ref_audio_paths: list = inputs.get("aux_ref_audio_paths", [])
     prompt_text: str = inputs.get("prompt_text", "")
     prompt_lang: str = inputs.get("prompt_lang", "")
-    top_k: int = inputs.get("top_k", 5)
+    top_k: int = inputs.get("top_k", 15)
     top_p: float = inputs.get("top_p", 1)
     temperature: float = inputs.get("temperature", 1)
     text_split_method: str = inputs.get("text_split_method", "cut1")
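Both hunks above raise the fallback `top_k` from 5 to 15, in line with commit abe984395c ("Align GPT top_k default sampling parameters"). Because the value is read via `inputs.get("top_k", 15)`, any request dict that omits the key now samples with 15. A hedged sketch of such a request; field names other than `top_k` follow the docstring shown above, and the required fields are assumptions, not verified against the full file:

```python
# Illustrative request dict: "top_k" is deliberately omitted, so
# inputs.get("top_k", 15) now yields the new default of 15 instead of 5.
req = {
    "text": "Hello, world.",      # text to synthesize (assumed required field)
    "text_lang": "en",            # language of the text (assumed)
    "ref_audio_path": "ref.wav",  # reference audio path (assumed)
    "prompt_lang": "en",          # language of the prompt text
}
```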

@@ -385,7 +385,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
     minimum=0.6, maximum=1.65, step=0.05, label="语速", value=1.0, interactive=True
 )
 with gr.Row():
-    top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=5, interactive=True)
+    top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True)
     top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True)
 with gr.Row():
     temperature = gr.Slider(
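In Gradio, `value=` only sets a slider's initial position, so this change moves the WebUI default to 15 without narrowing the selectable 1-100 range. A standalone sketch, not the repository's actual layout:

```python
import gradio as gr

# Minimal demo: the top_k slider starts at the new default of 15,
# but users can still choose any value from 1 to 100.
with gr.Blocks() as demo:
    top_k = gr.Slider(minimum=1, maximum=100, step=1, label="top_k", value=15, interactive=True)
```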

@@ -347,7 +347,7 @@ Use v4 from v1/v2/v3 environment:
 2. Clone the latest codes from github.
-3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.ckpt, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
+3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
 ## V2Pro Release Notes
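For anyone scripting the corrected download step, a hedged sketch using `huggingface_hub` (the README itself only describes a manual download; the repo id and file paths are taken from the fixed line above):

```python
from huggingface_hub import hf_hub_download

# Download the v4 pretrained files into the folder the README expects.
for name in ["gsv-v4-pretrained/s2v4.pth", "gsv-v4-pretrained/vocoder.pth"]:
    hf_hub_download(
        repo_id="lj1995/GPT-SoVITS",
        filename=name,
        local_dir="GPT_SoVITS/pretrained_models",
    )
```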

@@ -27,7 +27,7 @@ POST:
     "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
     "prompt_text": "", # str.(optional) prompt text for the reference audio
     "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-    "top_k": 5, # int. top k sampling
+    "top_k": 15, # int. top k sampling
     "top_p": 1, # float. top p sampling
     "temperature": 1, # float. temperature for sampling
     "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
@@ -158,7 +158,7 @@ class TTS_Request(BaseModel):
     aux_ref_audio_paths: list = None
     prompt_lang: str = None
     prompt_text: str = ""
-    top_k: int = 5
+    top_k: int = 15
     top_p: float = 1
     temperature: float = 1
     text_split_method: str = "cut5"
@@ -355,7 +355,7 @@ async def tts_handle(req: dict):
     "aux_ref_audio_paths": [], # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
     "prompt_text": "", # str.(optional) prompt text for the reference audio
     "prompt_lang": "", # str.(required) language of the prompt text for the reference audio
-    "top_k": 5, # int. top k sampling
+    "top_k": 15, # int. top k sampling
     "top_p": 1, # float. top p sampling
     "temperature": 1, # float. temperature for sampling
     "text_split_method": "cut5", # str. text split method, see text_segmentation_method.py for details.
@@ -460,7 +460,7 @@ async def tts_get_endpoint(
     aux_ref_audio_paths: list = None,
     prompt_lang: str = None,
     prompt_text: str = "",
-    top_k: int = 5,
+    top_k: int = 15,
     top_p: float = 1,
     temperature: float = 1,
     text_split_method: str = "cut5",
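All four hunks move the same default, so both the POST body model and the GET query parameters now fall back to `top_k=15` when the client sends nothing. A hedged client sketch; the `/tts` path and port 9880 are the project's commonly documented defaults, treated here as assumptions:

```python
import requests

# "top_k" is omitted from the body, so the server-side default of 15 applies.
resp = requests.post(
    "http://127.0.0.1:9880/tts",  # assumed host/port
    json={
        "text": "Hello, world.",
        "text_lang": "en",
        "ref_audio_path": "ref.wav",
        "prompt_lang": "en",
        "text_split_method": "cut5",
    },
)
with open("output.wav", "wb") as f:
    f.write(resp.content)
```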

@@ -19,7 +19,7 @@ torchaudio
 modelscope
 sentencepiece
 transformers>=4.43,<=4.50
-peft
+peft<0.18.0
 chardet
 PyYAML
 psutil