diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
index d8a33f4e..62a187da 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/flash_attn_varlen_cuda_graph.py
@@ -130,7 +130,7 @@ class CUDAGraphCache(CUDAGraphCacheABC):
super().__init__(decoder)
def release_graph(self, session: T2SSession):
- if session.id != self.id:
+ if session.id == self.id:
self.assigned = False
else:
del session.graph, session.xy_pos_, session.xy_dec_, session.input_pos, session.kv_cache
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py b/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py
index 6c83fcc3..7d50dae0 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/mps_flash_attn_varlen.py
@@ -127,7 +127,7 @@ class CUDAGraphCache(CUDAGraphCacheABC):
)
def release_graph(self, session: T2SSession):
- if session.id != self.id:
+ if session.id == self.id:
self.assigned = False
else:
del (
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py
index 3212e755..05db87ee 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/sage_attn_varlen_cuda_graph.py
@@ -133,7 +133,7 @@ class CUDAGraphCache(CUDAGraphCacheABC):
self.cu_seqlens_kv = torch.cat([torch.tensor(0, dtype=torch.int32), self.input_pos]).to(self.device)
def release_graph(self, session: T2SSession):
- if session.id != self.id:
+ if session.id == self.id:
self.assigned = False
else:
del (
diff --git a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
index 8eb677f7..f9ac2cd5 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/backends/torch_static_cuda_graph.py
@@ -127,7 +127,7 @@ class CUDAGraphCache(CUDAGraphCacheABC):
)
def release_graph(self, session: T2SSession):
- if session.id != self.id:
+ if session.id == self.id:
self.assigned = False
else:
del (
diff --git a/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py b/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
index 5d5d57bb..56032753 100644
--- a/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
+++ b/GPT_SoVITS/Accelerate/PyTorch/t2s_model_abc.py
@@ -534,7 +534,7 @@ class T2SDecoderABC(nn.Module, ABC, T2SDecoderProtocol):
graph = torch.cuda.CUDAGraph()
- with torch.cuda.stream(s): # type: ignore
+ with torch.cuda.stream(s):
for _ in range(5):
self.h(input_pos, x, kv_caches, *args, **kwds)
torch.cuda.current_stream().wait_stream(s)
@@ -560,7 +560,7 @@ class CUDAGraphCacheABC(ABC):
decoder: T2SDecoderABC,
) -> None:
self.is_applicable: bool
-
+
if torch.cuda.is_available() and self.is_applicable:
self.device: torch.device = decoder.device
self.dtype = decoder.bert_proj.bias.dtype
@@ -585,7 +585,7 @@ class CUDAGraphCacheABC(ABC):
args, kwds = self.decoder.pre_forward(session)
graph = self.decoder.capture(self.input_pos, self.xy_pos, self.xy_dec, self.kv_cache, *args, **kwds)
self.graph = graph
- self.stream = torch.cuda.Stream() # type: ignore
+ self.stream = torch.cuda.Stream()
if self.assigned is False:
self.get_cache_graph(session)
diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 2f4c771a..7beb0496 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -29,6 +29,7 @@ from config import (
infer_device as default_device,
)
from GPT_SoVITS.Accelerate import MLX, PyTorch, T2SEngineProtocol, T2SRequest, backends
+from GPT_SoVITS.Accelerate.logger import console
from GPT_SoVITS.feature_extractor import cnhubert
from GPT_SoVITS.module.mel_processing import mel_spectrogram_torch, spectrogram_torch
from GPT_SoVITS.module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
@@ -329,7 +330,6 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
hps = DictToAttrRecursive(hps)
hps.model.semantic_frame_rate = "25hz"
hps.model.version = model_version
- # print("sovits版本:",hps.model.version)
if model_version not in v3v4set:
vq_model = SynthesizerTrn(
hps.data.filter_length // 2 + 1,
@@ -351,12 +351,12 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
pass
if is_lora is False:
- print(f">> loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
+ console.print(f">> loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
else:
path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
- print(f">> loading sovits_{model_version}spretrained_G")
+ console.print(f">> loading sovits_{model_version}spretrained_G")
dict_pretrain = torch.load(path_sovits)["weight"]
- print(f">> loading sovits_{model_version}_lora{model_version}")
+ console.print(f">> loading sovits_{model_version}_lora{model_version}")
state_dict = dict_pretrain.update(dict_s2["weight"])
lora_rank = dict_s2["lora_rank"]
lora_config = LoraConfig(
@@ -408,7 +408,7 @@ def change_gpt_weights(gpt_path):
)
# t2s_engine.decoder_model.compile()
total = sum(p.numel() for p in t2s_engine.decoder_model.parameters())
- print(">> Number of parameter: %.2fM" % (total / 1e6))
+ console.print(">> Number of parameter: %.2fM" % (total / 1e6))
change_gpt_weights(gpt_path)
@@ -479,7 +479,7 @@ def init_hifigan():
map_location="cpu",
weights_only=False,
)
- print(">> loading vocoder", hifigan_model.load_state_dict(state_dict_g))
+ console.print(">> loading vocoder", hifigan_model.load_state_dict(state_dict_g))
clean_bigvgan_model()
clean_sv_cn_model()
@@ -759,7 +759,6 @@ def get_tts_wav(
text = cut5(text)
while "\n\n" in text:
text = text.replace("\n\n", "\n")
- print(">>", i18n("实际输入的目标文本(切句后):"), text)
texts = text.split("\n")
texts = process_text(texts)
texts = merge_short_text_in_array(texts, 5)
@@ -940,9 +939,9 @@ def get_tts_wav(
infer_speed_avg = sum(infer_len) / sum(infer_time)
rtf_value = sum(t) / (audio_opt_n.__len__() / opt_sr)
- print(f">> Time Stamps: {t0:.3f}\t{t1:.3f}\t{t2:.3f}\t{t3:.3f}")
- print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
- print(f">> RTF: {rtf_value:.2f}")
+ console.print(f">> Time Stamps: {t0:.3f}\t{t1:.3f}\t{t2:.3f}\t{t3:.3f}")
+ console.print(f">> Infer Speed: {infer_speed_avg:.2f} Token/s")
+ console.print(f">> RTF: {rtf_value:.2f}")
gr.Info(f"{infer_speed_avg:.2f} Token/s", title="Infer Speed")
gr.Info(f"{rtf_value:.2f}", title="RTF")
@@ -1082,199 +1081,200 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
),
elem_classes="markdown",
)
- with gr.Group():
- gr.Markdown(html_center(i18n("模型切换"), "h3"))
- with gr.Row(equal_height=True):
- GPT_dropdown = gr.Dropdown(
- label=i18n("GPT模型列表"),
- choices=GPT_names,
- value=gpt_path,
- interactive=True,
- scale=14,
- )
- SoVITS_dropdown = gr.Dropdown(
- label=i18n("SoVITS模型列表"),
- choices=SoVITS_names,
- value=sovits_path,
- interactive=True,
- scale=14,
- )
+ gr.Markdown(html_center(i18n("模型切换"), "h3"))
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=2):
+ with gr.Row(equal_height=True):
+ GPT_dropdown = gr.Dropdown(
+ label=i18n("GPT模型列表"),
+ choices=GPT_names,
+ value=gpt_path,
+ interactive=True,
+ )
+ SoVITS_dropdown = gr.Dropdown(
+ label=i18n("SoVITS模型列表"),
+ choices=SoVITS_names,
+ value=sovits_path,
+ interactive=True,
+ )
+ with gr.Column(scale=1):
refresh_button = gr.Button(i18n("刷新模型路径"), variant="primary", scale=14)
- refresh_button.click(fn=change_choices_i18n, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
- gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3"))
- with gr.Row(equal_height=True):
- inp_ref = gr.Audio(
- label=i18n("请上传3~10秒内参考音频,超过会报错!"),
- type="filepath",
- sources="upload",
- scale=13,
- editable=False,
- waveform_options={"show_recording_waveform": False},
- )
- with gr.Column(scale=13):
- ref_text_free = gr.Checkbox(
- label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")
- + i18n("v3暂不支持该模式,使用了会报错。"),
- value=False,
- interactive=True if model_version not in v3v4set else False,
- show_label=True,
- scale=1,
- )
- gr.Markdown(
- html_left(
- i18n("使用无参考文本模式时建议使用微调的GPT")
- + "
"
- + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")
+ refresh_button.click(fn=change_choices_i18n, inputs=[], outputs=[SoVITS_dropdown, GPT_dropdown])
+ gr.Markdown(html_center(i18n("*请上传并填写参考信息"), "h3"))
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=2):
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=1):
+ inp_ref = gr.Audio(
+ label=i18n("请上传3~10秒内参考音频,超过会报错!"),
+ type="filepath",
+ sources="upload",
+ scale=13,
+ editable=False,
+ waveform_options={"show_recording_waveform": False},
)
- )
- prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5, scale=1)
- with gr.Column(scale=14):
- prompt_language = gr.Dropdown(
- label=i18n("参考音频的语种"),
- choices=list(dict_language.keys()),
- value=i18n("中文"),
- )
- inp_refs = (
- gr.File(
- label=i18n(
- "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"
- ),
- file_count="multiple",
+ with gr.Column(scale=1):
+ gr.Markdown(
+ html_center(
+ i18n("使用无参考文本模式时建议使用微调的GPT")
+ + "
"
+ + i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")
+ )
)
- if model_version not in v3v4set
- else gr.File(
- label=i18n(
- "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"
- ),
- file_count="multiple",
- visible=False,
- )
- )
- sample_steps = (
- gr.Radio(
- label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
- value=32 if model_version == "v3" else 8,
- choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
- visible=True,
- )
- if model_version in v3v4set
- else gr.Radio(
- label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
- choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
- visible=False,
- value=32 if model_version == "v3" else 8,
- )
- )
- if_sr_Checkbox = gr.Checkbox(
- label=i18n("v3输出如果觉得闷可以试试开超分"),
- value=False,
- interactive=True,
- show_label=True,
- visible=False if model_version != "v3" else True,
- )
- gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
- with gr.Row(equal_height=True):
- with gr.Column(scale=13):
- text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=26, max_lines=26)
- with gr.Column(scale=7):
- text_language = gr.Dropdown(
- label=i18n("需要合成的语种") + i18n(".限制范围越小判别效果越好。"),
- choices=list(dict_language.keys()),
- value=i18n("中文"),
- scale=1,
- )
- how_to_cut = gr.Dropdown(
- label=i18n("怎么切"),
- choices=[
- i18n("不切"),
- i18n("凑四句一切"),
- i18n("凑50字一切"),
- i18n("按中文句号。切"),
- i18n("按英文句号.切"),
- i18n("按标点符号切"),
- ],
- value=i18n("凑四句一切"),
- interactive=True,
- scale=1,
- )
- gr.Markdown(value=html_center(i18n("语速调整,高为更快")))
- if_freeze = gr.Checkbox(
- label=i18n("是否直接对上次合成结果调整语速和音色。防止随机性。"),
- value=False,
- interactive=True,
- show_label=True,
- scale=1,
- )
- with gr.Row(equal_height=True):
- speed = gr.Slider(
- minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1
- )
- pause_second_slider = gr.Slider(
- minimum=0.1,
- maximum=0.5,
- step=0.01,
- label=i18n("句间停顿秒数"),
- value=0.3,
- interactive=True,
+ ref_text_free = gr.Checkbox(
+ label=i18n("开启无参考文本模式"),
+ info=i18n("不填参考文本亦相当于开启") + ", " + i18n("v3暂不支持该模式,使用了会报错。"),
+ value=False,
+ interactive=True if model_version not in v3v4set else False,
+ show_label=True,
scale=1,
)
- gr.Markdown(html_center(i18n("GPT采样参数(无参考文本时不要太低。不懂就用默认):")))
- top_k = gr.Slider(
- minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True, scale=1
+ prompt_language = gr.Dropdown(
+ label="",
+ info=i18n("参考音频的语种"),
+ choices=list(dict_language.keys()),
+ value=i18n("中文"),
+ )
+ prompt_text = gr.Textbox(label="", info=i18n("参考音频的文本"), value="", lines=3, max_lines=3)
+
+ with gr.Column(scale=1):
+ inp_refs = (
+ gr.File(
+ label=i18n(
+ "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"
+ ),
+ file_count="multiple",
)
- top_p = gr.Slider(
- minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True, scale=1
+ if model_version not in v3v4set
+ else gr.File(
+ label=i18n(
+ "可选项:通过拖拽多个文件上传多个参考音频(建议同性),平均融合他们的音色。如不填写此项,音色由左侧单个参考音频控制。如是微调模型,建议参考音频全部在微调训练集音色内,底模不用管。"
+ ),
+ file_count="multiple",
+ visible=False,
)
- temperature = gr.Slider(
- minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
+ )
+ sample_steps = (
+ gr.Radio(
+ label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
+ value=32 if model_version == "v3" else 8,
+ choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
+ visible=True,
)
- with gr.Row(equal_height=True):
- inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg", scale=25)
+ if model_version in v3v4set
+ else gr.Radio(
+ label=i18n("采样步数,如果觉得电,提高试试,如果觉得慢,降低试试"),
+ choices=[4, 8, 16, 32, 64, 128] if model_version == "v3" else [4, 8, 16, 32],
+ visible=False,
+ value=32 if model_version == "v3" else 8,
+ )
+ )
+ if_sr_Checkbox = gr.Checkbox(
+ label=i18n("v3输出如果觉得闷可以试试开超分"),
+ value=False,
+ interactive=True,
+ show_label=True,
+ visible=False if model_version != "v3" else True,
+ )
+ gr.Markdown(html_center(i18n("*请填写需要合成的目标文本和语种模式"), "h3"))
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=2):
+ text = gr.Textbox(label=i18n("需要合成的文本"), value="", lines=30, max_lines=40)
+ with gr.Column(scale=1):
+ text_language = gr.Dropdown(
+ label=i18n("需要合成的语种") + i18n(".限制范围越小判别效果越好。"),
+ choices=list(dict_language.keys()),
+ value=i18n("中文"),
+ scale=1,
+ )
+ how_to_cut = gr.Dropdown(
+ label=i18n("怎么切"),
+ choices=[
+ i18n("不切"),
+ i18n("凑四句一切"),
+ i18n("凑50字一切"),
+ i18n("按中文句号。切"),
+ i18n("按英文句号.切"),
+ i18n("按标点符号切"),
+ ],
+ value=i18n("凑四句一切"),
+ interactive=True,
+ scale=1,
+ )
+ if_freeze = gr.Checkbox(
+ label=i18n("是否直接对上次合成结果调整语速和音色"),
+ value=False,
+ interactive=True,
+ show_label=True,
+ scale=1,
+ )
+ with gr.Row(equal_height=True):
+ speed = gr.Slider(
+ minimum=0.6, maximum=1.65, step=0.05, label=i18n("语速"), value=1, interactive=True, scale=1
+ )
+ pause_second_slider = gr.Slider(
+ minimum=0.1,
+ maximum=0.5,
+ step=0.01,
+ label=i18n("句间停顿秒数"),
+ value=0.3,
+ interactive=True,
+ scale=1,
+ )
+ gr.Markdown(html_center(i18n("GPT采样参数(不懂就用默认):")))
+ top_k = gr.Slider(minimum=1, maximum=100, step=1, label=i18n("top_k"), value=15, interactive=True, scale=1)
+ top_p = gr.Slider(minimum=0, maximum=1, step=0.05, label=i18n("top_p"), value=1, interactive=True, scale=1)
+ temperature = gr.Slider(
+ minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
+ )
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=2):
+ inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size="lg")
+ with gr.Column(scale=1):
output = gr.Audio(
label=i18n("输出的语音"),
- scale=14,
waveform_options={"show_recording_waveform": False},
editable=False,
)
- inference_button.click(
- get_tts_wav,
- [
- inp_ref,
- prompt_text,
- prompt_language,
- text,
- text_language,
- how_to_cut,
- top_k,
- top_p,
- temperature,
- ref_text_free,
- speed,
- if_freeze,
- inp_refs,
- sample_steps,
- if_sr_Checkbox,
- pause_second_slider,
- ],
- [output],
- )
- SoVITS_dropdown.change(
- change_sovits_weights,
- [SoVITS_dropdown, prompt_language, text_language],
- [
- prompt_text,
- prompt_language,
- text,
- text_language,
- sample_steps,
- inp_refs,
- ref_text_free,
- if_sr_Checkbox,
- inference_button,
- ],
- )
- GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
+ inference_button.click(
+ get_tts_wav,
+ [
+ inp_ref,
+ prompt_text,
+ prompt_language,
+ text,
+ text_language,
+ how_to_cut,
+ top_k,
+ top_p,
+ temperature,
+ ref_text_free,
+ speed,
+ if_freeze,
+ inp_refs,
+ sample_steps,
+ if_sr_Checkbox,
+ pause_second_slider,
+ ],
+ [output],
+ )
+ SoVITS_dropdown.change(
+ change_sovits_weights,
+ [SoVITS_dropdown, prompt_language, text_language],
+ [
+ prompt_text,
+ prompt_language,
+ text,
+ text_language,
+ sample_steps,
+ inp_refs,
+ ref_text_free,
+ if_sr_Checkbox,
+ inference_button,
+ ],
+ )
+ GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])
if __name__ == "__main__":
diff --git a/tools/assets.py b/tools/assets.py
index b2c302fe..de1b8557 100644
--- a/tools/assets.py
+++ b/tools/assets.py
@@ -51,7 +51,7 @@ footer * {
top_html = """