Merge eed2095a42927ea0a9c6bd548eaf4926b6c99f8f into 9da7e17efe05041e31d3c3f42c8730ae890397f2

Add files via upload
Update Changelog_CN.md
2025-04-05 12:38:35 +08:00 · 2025-04-02 04:19:18 +09:00 · 2025-04-01 18:44:35 +08:00 · 2025-04-01 17:21:48 +08:00 · 2025-04-01 17:15:52 +08:00 · 2025-04-01 16:50:54 +08:00
23 changed files with 878 additions and 269 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,5 +18,183 @@ TEMP
 weight.json
 ffmpeg*
 ffprobe*
+cfg.json
+speakers.json
+ref_audios
 tools/AP_BWE_main/24kto48k/*
 !tools/AP_BWE_main/24kto48k/readme.txt
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -3,7 +3,7 @@ import math
 import os, sys, gc
 import random
 import traceback
-
+import time
 import torchaudio
 from tqdm import tqdm
 now_dir = os.getcwd()
@ -462,8 +462,6 @@ class TTS:
                n_speakers=self.configs.n_speakers,
                **kwargs
            )
-            if hasattr(vits_model, "enc_q"):
-                del vits_model.enc_q
            self.configs.is_v3_synthesizer = False
        else:
            vits_model = SynthesizerTrnV3(
@ -474,7 +472,8 @@ class TTS:
            )
            self.configs.is_v3_synthesizer = True
            self.init_bigvgan()
-            
+            if "pretrained" not in weights_path and hasattr(vits_model, "enc_q"):
+                del vits_model.enc_q

        if if_lora_v3==False:
            print(f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}")
@ -908,11 +907,14 @@ class TTS:
                split_bucket = False
                print(i18n("分段返回模式不支持分桶处理，已自动关闭分桶处理"))

-        if split_bucket and speed_factor==1.0:
+        if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
            print(i18n("分桶处理模式已开启"))
        elif speed_factor!=1.0:
            print(i18n("语速调节不支持分桶处理，已自动关闭分桶处理"))
            split_bucket = False
+        elif self.configs.is_v3_synthesizer and parallel_infer:
+            print(i18n("当开启并行推理模式时，SoVits V3模型不支持分桶处理，已自动关闭分桶处理"))
+            split_bucket = False
        else:
            print(i18n("分桶处理模式已关闭"))

@ -936,7 +938,7 @@ class TTS:
            raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")

        ###### setting reference audio and prompt text preprocessing ########
-        t0 = ttime()
+        t0 = time.perf_counter()
        if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
            if not os.path.exists(ref_audio_path):
                raise ValueError(f"{ref_audio_path} not exists")
@ -975,7 +977,7 @@ class TTS:


        ###### text preprocessing ########
-        t1 = ttime()
+        t1 = time.perf_counter()
        data:list = None
        if not return_fragment:
            data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
@ -1027,7 +1029,7 @@ class TTS:
                return batch[0]


-        t2 = ttime()
+        t2 = time.perf_counter()
        try:
            print("############ 推理 ############")
            ###### inference ######
@ -1036,7 +1038,7 @@ class TTS:
            audio = []
            output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
            for item in data:
-                t3 = ttime()
+                t3 = time.perf_counter()
                if return_fragment:
                    item = make_batch(item)
                    if item is None:
@ -1071,7 +1073,7 @@ class TTS:
                    max_len=max_len,
                    repetition_penalty=repetition_penalty,
                )
-                t4 = ttime()
+                t4 = time.perf_counter()
                t_34 += t4 - t3

                refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
@ -1094,6 +1096,7 @@ class TTS:
                print(f"############ {i18n('合成音频')} ############")
                if not self.configs.is_v3_synthesizer:
                    if speed_factor == 1.0:
+                        print(f"{i18n('并行合成中')}...")
                        # ## vits并行推理 method 2
                        pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
                        upsample_rate = math.prod(self.vits_model.upsample_rates)
@ -1117,6 +1120,17 @@ class TTS:
                            batch_audio_fragment.append(
                                audio_fragment
                            )  ###试试重建不带上prompt部分
+                else:
+                    if parallel_infer:
+                        print(f"{i18n('并行合成中')}...")
+                        audio_fragments = self.v3_synthesis_batched_infer(
+                                                                        idx_list,
+                                                                        pred_semantic_list, 
+                                                                        batch_phones, 
+                                                                        speed=speed_factor, 
+                                                                        sample_steps=sample_steps
+                                                                    )
+                        batch_audio_fragment.extend(audio_fragments)
                    else:
                        for i, idx in enumerate(tqdm(idx_list)):
                            phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
@ -1128,7 +1142,7 @@ class TTS:
                                audio_fragment
                            ) 

-                t5 = ttime()
+                t5 = time.perf_counter()
                t_45 += t5 - t4
                if return_fragment:
                    print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
@ -1219,13 +1233,13 @@ class TTS:

        if super_sampling:
            print(f"############ {i18n('音频超采样')} ############")
-            t1 = ttime()
+            t1 = time.perf_counter()
            self.init_sr_model()
            if not self.sr_model_not_exist:
                audio,sr=self.sr_model(audio.unsqueeze(0),sr)
                max_audio=np.abs(audio).max()
                if max_audio > 1: audio /= max_audio
-            t2 = ttime()
+            t2 = time.perf_counter()
            print(f"超采样用时：{t2-t1:.3f}s")
        else:
            audio = audio.cpu().numpy()
@ -1260,7 +1274,7 @@ class TTS:
            ref_audio = ref_audio.mean(0).unsqueeze(0)
        if ref_sr!=24000:
            ref_audio=resample(ref_audio, ref_sr, self.configs.device)
-        # print("ref_audio",ref_audio.abs().mean())
+
        mel2 = mel_fn(ref_audio)
        mel2 = norm_spec(mel2)
        T_min = min(mel2.shape[2], fea_ref.shape[2])
@ -1285,15 +1299,156 @@ class TTS:

            cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
            cfm_res = cfm_res[:, :, mel2.shape[2]:]
-            mel2 = cfm_res[:, :, -T_min:]

+            mel2 = cfm_res[:, :, -T_min:]
            fea_ref = fea_todo_chunk[:, :, -T_min:]
+
            cfm_resss.append(cfm_res)
-        cmf_res = torch.cat(cfm_resss, 2)
-        cmf_res = denorm_spec(cmf_res)
+        cfm_res = torch.cat(cfm_resss, 2)
+        cfm_res = denorm_spec(cfm_res)
+
        
        with torch.inference_mode():
-            wav_gen = self.bigvgan_model(cmf_res)
+            wav_gen = self.bigvgan_model(cfm_res)
            audio=wav_gen[0][0]#.cpu().detach().numpy()
    
        return audio
+
+    
+
+    def v3_synthesis_batched_infer(self,  # Synthesize every sentence of a batch with the V3 model using one CFM call and one vocoder call.
+                    idx_list:List[int],  # per sentence: how many trailing semantic tokens to keep (used as [-idx:] below)
+                    semantic_tokens_list:List[torch.Tensor],  # per sentence: predicted semantic tokens
+                    batch_phones:List[torch.Tensor],  # per sentence: phoneme id tensor
+                    speed:float=1.0,  # forwarded unchanged to vits_model.decode_encp
+                    sample_steps:int=32  # forwarded unchanged to vits_model.cfm.inference
+                    )->List[torch.Tensor]:  # returns one waveform slice per entry of idx_list, in the same order
+            
+        prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)  # two leading dims are added to the cached prompt tokens
+        prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
+        refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)  # only the first cached reference spectrogram is used
+
+        fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)  # reference features; `ge` is reused for every sentence below
+        ref_audio:torch.Tensor = self.prompt_cache["raw_audio"]
+        ref_sr = self.prompt_cache["raw_sr"]
+        ref_audio=ref_audio.to(self.configs.device).float()
+        if (ref_audio.shape[0] == 2):  # presumably a stereo reference: average dim 0 down to a single row
+            ref_audio = ref_audio.mean(0).unsqueeze(0)
+        if ref_sr!=24000:  # the mel front end below is fed 24000 Hz audio
+            ref_audio=resample(ref_audio, ref_sr, self.configs.device)
+
+        mel2 = mel_fn(ref_audio)
+        mel2 = norm_spec(mel2)
+        T_min = min(mel2.shape[2], fea_ref.shape[2])  # crop mel and reference features to their common length
+        mel2 = mel2[:, :, :T_min]
+        fea_ref = fea_ref[:, :, :T_min]
+        if (T_min > 468):  # keep at most the last 468 reference frames
+            mel2 = mel2[:, :, -468:]
+            fea_ref = fea_ref[:, :, -468:]
+            T_min = 468
+        chunk_len = 934 - T_min  # frames of new features per chunk; presumably 934 is the CFM window size -- TODO confirm
+
+        mel2=mel2.to(self.precision)
+
+
+        # #### batched inference: concatenate all sentence features, cut them into overlapping chunks, run the chunks as one batch
+        overlapped_len = 12  # number of feature frames shared by consecutive chunks
+        feat_chunks = []
+        feat_lens = []
+        feat_list = []
+
+        for i, idx in enumerate(idx_list):
+            phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
+            semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0)   # .unsqueeze(0)  # mq needs one extra unsqueeze; [-idx:] keeps only the last idx tokens
+            feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
+            feat_list.append(feat)
+            feat_lens.append(feat.shape[2])  # remembered so the final waveform can be split back per sentence
+
+        feats = torch.cat(feat_list, 2)  # all sentences laid end to end along dim 2
+        feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0)  # zero frames prepended; the matching audio is trimmed off after synthesis
+        pos = 0
+        padding_len = 0
+        while True:
+            if pos ==0:
+                chunk = feats_padded[:, :, pos:pos + chunk_len]
+            else:
+                pos = pos - overlapped_len  # step back so this chunk re-reads the last overlapped_len frames of the previous one
+                chunk = feats_padded[:, :, pos:pos + chunk_len]
+            pos += chunk_len
+            if (chunk.shape[-1] == 0): break  # slice started past the end of the features
+
+            # zero-pad a short chunk on the right up to chunk_len so all chunks can be stacked
+            padding_len = chunk_len - chunk.shape[2]  # after the loop this holds the padding of the last appended chunk
+            if padding_len != 0:
+                chunk = F.pad(chunk, (0,padding_len), "constant", 0)
+            feat_chunks.append(chunk)
+            
+
+
+        feat_chunks = torch.cat(feat_chunks, 0)  # chunks become the batch dimension
+        bs = feat_chunks.shape[0]
+        fea_ref = fea_ref.repeat(bs,1,1)  # same reference features in front of every chunk
+        fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
+        pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)  # NOTE(review): the length tensor and mel2 are not expanded to bs -- confirm cfm.inference broadcasts them
+        pred_spec = pred_spec[:, :, -chunk_len:]  # keep only the last chunk_len frames of each prediction
+        dd = pred_spec.shape[1]
+        pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)  # (bs, dd, chunk_len) -> (1, dd, bs*chunk_len): chunks laid end to end
+        # pred_spec = pred_spec[..., :-padding_len]
+
+
+        pred_spec = denorm_spec(pred_spec)
+        
+        with torch.no_grad():
+            wav_gen = self.bigvgan_model(pred_spec)  # one vocoder pass over the whole concatenated spectrogram
+            audio = wav_gen[0][0]#.cpu().detach().numpy()
+
+
+        audio_fragments = []
+        upsample_rate = 256  # assumed audio samples per spectrogram frame for the vocoder -- TODO confirm
+        pos = 0
+
+        while pos < audio.shape[-1]:  # cut the waveform back into one piece per chunk
+            audio_fragment = audio[pos:pos+chunk_len*upsample_rate]
+            audio_fragments.append(audio_fragment)
+            pos += chunk_len*upsample_rate
+
+        audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate)  # re-join the pieces across their overlapping regions
+        audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate]  # trim the leading pad and the last chunk's padding; NOTE(review): padding_len == 0 would make this slice empty -- confirm it cannot occur
+
+        audio_fragments = []
+        for feat_len in feat_lens:  # hand out feat_len*upsample_rate samples to each sentence in order
+            audio_fragment = audio[:feat_len*upsample_rate]
+            audio_fragments.append(audio_fragment)
+            audio = audio[feat_len*upsample_rate:]
+
+
+        return audio_fragments
+    
+
+
+    def sola_algorithm(self,  # Join consecutive fragments: align each neighbouring pair by correlation, cross-fade the overlap, then concatenate.
+                    audio_fragments:List[torch.Tensor],  # fragments in playback order; the list entries are replaced in place
+                    overlap_len:int,  # number of samples at each boundary that are compared and blended
+                    ):  # returns all processed fragments concatenated along dim 0
+        
+        for i in range(len(audio_fragments)-1):
+            f1 = audio_fragments[i]
+            f2 = audio_fragments[i+1]
+            w1 = f1[-overlap_len:]  # tail of the current fragment
+            w2 = f2[:overlap_len]  # head of the next fragment
+            assert w1.shape == w2.shape  # fails if either fragment is shorter than overlap_len
+            corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1]  # correlation of the two windows over a range of shifts; last value dropped
+            idx = corr.argmax()  # shift with the highest correlation
+            f1_ = f1[:-(overlap_len-idx)]  # drop the last (overlap_len - idx) samples of the current fragment
+            audio_fragments[i] = f1_
+
+            f2_ = f2[idx:]  # drop the first idx samples of the next fragment
+            window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype)
+            f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):]  # cross-fade: first window half weights the next fragment, second half weights the removed tail of f1; written in place into a slice of f2
+            audio_fragments[i+1] = f2_
+
+
+        return torch.cat(audio_fragments, 0)
+            
+                
+
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -238,7 +238,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
        else:
            visible_sample_steps=False
            visible_inp_refs=True
-        yield  {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
+        yield  {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("模型加载中，请等待"),"interactive":False}

    dict_s2 = load_sovits_new(sovits_path)
    hps = dict_s2["config"]
@ -294,6 +294,7 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
        # torch.save(vq_model.state_dict(),"merge_win.pth")
        vq_model.eval()

+    yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
    with open("./weight.json")as f:
        data=f.read()
        data=json.loads(data)
@ -877,7 +878,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
        with gr.Row():
            inp_ref = gr.Audio(label=i18n("请上传3~10秒内参考音频，超过会报错！"), type="filepath", scale=13)
            with gr.Column(scale=13):
-                ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式，使用了会报错。"), value=False, interactive=True, show_label=True,scale=1)
+                ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。")+i18n("v3暂不支持该模式，使用了会报错。"), value=False, interactive=True if model_version!="v3"else False, show_label=True,scale=1)
                gr.Markdown(html_left(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。")))
                prompt_text = gr.Textbox(label=i18n("参考音频的文本"), value="", lines=5, max_lines=5,scale=1)
            with gr.Column(scale=14):
@ -915,7 +916,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
            #     phoneme=gr.Textbox(label=i18n("音素框"), value="")
            #     get_phoneme_button = gr.Button(i18n("目标文本转音素"), variant="primary")
        with gr.Row():
-            inference_button = gr.Button(i18n("合成语音"), variant="primary", size='lg', scale=25)
+            inference_button = gr.Button(value=i18n("合成语音"), variant="primary", size='lg', scale=25)
            output = gr.Audio(label=i18n("输出的语音"), scale=14)

        inference_button.click(
@ -923,7 +924,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
            [inp_ref, prompt_text, prompt_language, text, text_language, how_to_cut, top_k, top_p, temperature, ref_text_free,speed,if_freeze,inp_refs,sample_steps,if_sr_Checkbox,pause_second_slider],
            [output],
        )
-        SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox])
+        SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,if_sr_Checkbox,inference_button])
        GPT_dropdown.change(change_gpt_weights, [GPT_dropdown], [])

        # gr.Markdown(value=i18n("文本切分工具。太长的文本合成出来效果不一定好，所以太长建议先切。合成会根据文本的换行分开合成再拼起来。"))
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@ -41,12 +41,13 @@ gpt_path = os.environ.get("gpt_path", None)
 sovits_path = os.environ.get("sovits_path", None)
 cnhubert_base_path = os.environ.get("cnhubert_base_path", None)
 bert_path = os.environ.get("bert_path", None)
-version=os.environ.get("version","v2")
+version=model_version=os.environ.get("version","v2")

 import gradio as gr
 from TTS_infer_pack.TTS import TTS, TTS_Config, NO_PROMPT_ERROR
 from TTS_infer_pack.text_segmentation_method import get_method
 from tools.i18n.i18n import I18nAuto, scan_language_list
+from inference_webui import DictToAttrRecursive

 language=os.environ.get("language","Auto")
 language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -221,19 +222,16 @@ def get_weights_names(GPT_weight_root, SoVITS_weight_root):
 SoVITS_names, GPT_names = get_weights_names(GPT_weight_root, SoVITS_weight_root)


-from process_ckpt import get_sovits_version_from_path_fast
+from process_ckpt import get_sovits_version_from_path_fast,load_sovits_new
 def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
-    global version, dict_language
+    global version, model_version, dict_language,if_lora_v3
    version, model_version, if_lora_v3=get_sovits_version_from_path_fast(sovits_path)
-
+    # print(sovits_path,version, model_version, if_lora_v3)
    if if_lora_v3 and not os.path.exists(path_sovits_v3):
        info= path_sovits_v3 + i18n("SoVITS V3 底模缺失，无法加载相应 LoRA 权重")
        gr.Warning(info)
        raise FileExistsError(info)
-
-    tts_pipeline.init_vits_weights(sovits_path)
-
-    dict_language = dict_language_v1 if tts_pipeline.configs.version =='v1' else dict_language_v2
+    dict_language = dict_language_v1 if version =='v1' else dict_language_v2
    if prompt_language is not None and text_language is not None:
        if prompt_language in list(dict_language.keys()):
            prompt_text_update, prompt_language_update = {'__type__':'update'}, {'__type__':'update', 'value':prompt_language}
@ -251,8 +249,11 @@ def change_sovits_weights(sovits_path,prompt_language=None,text_language=None):
        else:
            visible_sample_steps=False
            visible_inp_refs=True
-        yield  {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "visible": visible_sample_steps},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "value": False,"interactive":True if model_version!="v3"else False},{"__type__": "update", "visible":True if model_version=="v3"else False}
+        #prompt_language,text_language,prompt_text,prompt_language,text,text_language,inp_refs,ref_text_free,
+        yield  {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("模型加载中，请等待"),"interactive":False}

+    tts_pipeline.init_vits_weights(sovits_path)
+    yield {'__type__':'update', 'choices':list(dict_language.keys())}, {'__type__':'update', 'choices':list(dict_language.keys())}, prompt_text_update, prompt_language_update, text_update, text_language_update,{"__type__": "update", "interactive": visible_sample_steps,"value":32},{"__type__": "update", "visible": visible_inp_refs},{"__type__": "update", "interactive": True if model_version!="v3"else False},{"__type__": "update", "value":i18n("合成语音"),"interactive":True}
    with open("./weight.json")as f:
        data=f.read()
        data=json.loads(data)
@ -279,14 +280,14 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
            gr.Markdown(value=i18n("*请上传并填写参考信息"))
            with gr.Row():
                inp_ref = gr.Audio(label=i18n("主参考音频(请上传3~10秒内参考音频，超过会报错！)"), type="filepath")
-                inp_refs = gr.File(label=i18n("辅参考音频(可选多个，或不选)"),file_count="multiple")
+                inp_refs = gr.File(label=i18n("辅参考音频(可选多个，或不选)"),file_count="multiple", visible=True if model_version!="v3"else False)
            prompt_text = gr.Textbox(label=i18n("主参考音频的文本"), value="", lines=2)
            with gr.Row():
                prompt_language = gr.Dropdown(
                    label=i18n("主参考音频的语种"), choices=list(dict_language.keys()), value=i18n("中文")
                )
                with gr.Column():
-                    ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True, show_label=True)
+                    ref_text_free = gr.Checkbox(label=i18n("开启无参考文本模式。不填参考文本亦相当于开启。"), value=False, interactive=True if model_version!="v3"else False, show_label=True)
                    gr.Markdown(i18n("使用无参考文本模式时建议使用微调的GPT")+"<br>"+i18n("听不清参考音频说的啥(不晓得写啥)可以开。开启后无视填写的参考文本。"))

        with gr.Column():
@ -355,7 +356,7 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
            [output, seed],
        )
        stop_infer.click(tts_pipeline.stop, [], [])
-        SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language])
+        SoVITS_dropdown.change(change_sovits_weights, [SoVITS_dropdown,prompt_language,text_language], [prompt_language,text_language,prompt_text,prompt_language,text,text_language,sample_steps,inp_refs,ref_text_free,inference_button])#
        GPT_dropdown.change(tts_pipeline.init_t2s_weights, [GPT_dropdown], [])

    with gr.Group():
--- a/GPT_SoVITS/s2_train.py
+++ b/GPT_SoVITS/s2_train.py
@ -429,6 +429,8 @@ def train_and_evaluate(
                # scalar_dict.update({"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)})
                # scalar_dict.update({"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)})
                # scalar_dict.update({"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)})
+                image_dict=None
+                try:###Some people installed the wrong version of matplotlib.
                    image_dict = {
                        "slice/mel_org": utils.plot_spectrogram_to_numpy(
                            y_mel[0].data.cpu().numpy()
@ -443,12 +445,9 @@ def train_and_evaluate(
                            stats_ssl[0].data.cpu().numpy()
                        ),
                    }
-                utils.summarize(
-                    writer=writer,
-                    global_step=global_step,
-                    images=image_dict,
-                    scalars=scalar_dict,
-                )
+                except:pass
+                if image_dict:utils.summarize(writer=writer,global_step=global_step,images=image_dict,scalars=scalar_dict,)
+                else:utils.summarize(writer=writer,global_step=global_step,scalars=scalar_dict,)
        global_step += 1
    if epoch % hps.train.save_every_epoch == 0 and rank == 0:
        if hps.train.if_save_latest == 0:
--- a/GPT_SoVITS/text/g2pw/onnx_api.py
+++ b/GPT_SoVITS/text/g2pw/onnx_api.py
@ -58,7 +58,7 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
        extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
        extract_dir_new = os.path.join(parent_directory,"G2PWModel")
        print("Downloading g2pw model...")
-        modelscope_url = "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
+        modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
        with requests.get(modelscope_url, stream=True) as r:
            r.raise_for_status()
            with open(zip_dir, 'wb') as f:
--- a/GPT_SoVITS_Inference.ipynb
+++ b/GPT_SoVITS_Inference.ipynb
@ -1,42 +1,37 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": []
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "accelerator": "GPU"
-  },
  "cells": [
    {
      "cell_type": "markdown",
-      "source": [
-        "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
-      ],
      "metadata": {
        "id": "himHYZmra7ix"
-      }
+      },
+      "source": [
+        "# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "e9b7iFV3dm1f"
      },
+      "outputs": [],
      "source": [
        "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
        "%cd GPT-SoVITS\n",
        "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
+        "!pip install -r extra-req.txt --no-deps\n",
        "!pip install -r requirements.txt"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "0NgxXg5sjv7z"
+      },
+      "outputs": [],
      "source": [
        "# @title Download pretrained models 下载预训练模型\n",
        "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -53,16 +48,16 @@
        "!git clone https://huggingface.co/Delik/uvr5_weights\n",
        "!git config core.sparseCheckout true\n",
        "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-      ],
-      "metadata": {
-        "id": "0NgxXg5sjv7z",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "cPDEH-9czOJF"
+      },
+      "outputs": [],
      "source": [
        "#@title Create folder models 创建文件夹模型\n",
        "import os\n",
@ -77,16 +72,16 @@
        "    print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建！)\")\n",
        "\n",
        "print(\"All folders have been created. (所有文件夹均已创建。)\")"
-      ],
-      "metadata": {
-        "cellView": "form",
-        "id": "cPDEH-9czOJF"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "vbZY-LnM0tzq"
+      },
+      "outputs": [],
      "source": [
        "import requests\n",
        "import zipfile\n",
@ -124,29 +119,35 @@
        "        shutil.move(source_path, destination_path)\n",
        "\n",
        "print(f'Model downloaded. (模型已下载。)')"
-      ],
-      "metadata": {
-        "cellView": "form",
-        "id": "vbZY-LnM0tzq"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "cellView": "form",
+        "id": "4oRGUzkrk8C7"
+      },
+      "outputs": [],
      "source": [
        "# @title launch WebUI 启动WebUI\n",
        "!/usr/local/bin/pip install ipykernel\n",
        "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
        "%cd /content/GPT-SoVITS/\n",
        "!/usr/local/bin/python  webui.py"
-      ],
-      "metadata": {
-        "id": "4oRGUzkrk8C7",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    }
      ]
    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/README.md
+++ b/README.md
@ -1,6 +1,5 @@
 <div align="center">

-
 <h1>GPT-SoVITS-WebUI</h1>
 A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>

@ -77,6 +76,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
 Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)

 ##### MacOS Users
+
 ```bash
 brew install ffmpeg
 ```
@ -112,6 +113,7 @@ brew install ffmpeg
 #### Install Dependencies

 ```bash
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -200,6 +202,7 @@ if you want to switch to V1,then
 ```bash
 python webui.py v1 <language(optional)>
 ```
+
 Or manually switch version in WebUI

 ### Finetune
@ -224,11 +227,13 @@ Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference
 ```bash
 python GPT_SoVITS/inference_webui.py <language(optional)>
 ```
+
 OR

 ```bash
 python webui.py
 ```
+
 then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`

 ## V2 Release Notes
@ -243,7 +248,7 @@ New Features:

 4. Improved synthesis quality for low-quality reference audio

-    [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 Use v2 from v1 environment:

@ -263,7 +268,7 @@ New Features:

 2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.

-    [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 Use v3 from v2 environment:

@ -275,7 +280,6 @@ Use v3 from v2 environment:

   additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)

-
 ## Todo List

 - [x] **High Priority:**
@ -297,15 +301,20 @@ Use v3 from v2 environment:
  - [ ] model mix

 ## (Additional) Method for running from the command line
+
 Use the command line to open the WebUI for UVR5
+
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
+
 <!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
+
 This is how the audio segmentation of the dataset is done using the command line
+
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
@ -315,16 +324,21 @@ python audio_slicer.py \
    --min_interval <shortest_time_gap_between_adjacent_subclips>
    --hop_size <step_size_for_computing_volume_curve>
 ```
+
 This is how dataset ASR processing is done using the command line(Only Chinese)
+
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
+
 ASR processing is performed through Faster_Whisper(ASR marking except Chinese)

 (No progress bars, GPU performance may cause time delays)
+
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
+
 A custom list save path is enabled

 ## Credits
@ -332,6 +346,7 @@ A custom list save path is enabled
 Special thanks to the following projects and contributors:

 ### Theoretical Research
+
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
+
 ### Pretrained Models
+
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+
 ### Text Frontend for Inference
+
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
+
 ### WebUI Tools
+
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/api_v2.py
+++ b/api_v2.py
@ -112,9 +112,12 @@ import wave
 import signal
 import numpy as np
 import soundfile as sf
+import shutil
 from fastapi import FastAPI, Request, HTTPException, Response
 from fastapi.responses import StreamingResponse, JSONResponse
 from fastapi import FastAPI, UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
 import uvicorn
 from io import BytesIO
 from tools.i18n.i18n import I18nAuto
@ -141,6 +144,7 @@ if config_path in [None, ""]:
    config_path = "GPT-SoVITS/configs/tts_infer.yaml"

 tts_config = TTS_Config(config_path)
+print("以下为TTS_CONFIG配置, 如需修改请查看/GPT_SoVITS/configs/tts_infer.yaml")
 print(tts_config)
 tts_pipeline = TTS(tts_config)

@ -459,7 +463,84 @@ async def set_sovits_weights(weights_path: str = None):
        return JSONResponse(status_code=400, content={"message": f"change sovits weight failed", "Exception": str(e)})
    return JSONResponse(status_code=200, content={"message": "success"})

+APP.add_middleware(  # register CORS handling; NOTE(review): wildcard origins together with credentials is very permissive — confirm this is intended
+    CORSMiddleware,
+    allow_origins=["*"],  # allow requests from any origin
+    allow_credentials=True,  # credentials are allowed on cross-origin requests
+    allow_methods=["*"],  # allow every HTTP method
+    allow_headers=["*"],  # allow every request header
+)

+@APP.get("/info")
+async def get_info():
+    """List the available GPT and SoVITS weight files together with the server port.
+
+    Scans the v2 and v1 weight folders (paths relative to the working directory)
+    and reports every regular file found as a "<folder>/<file>" string.
+
+    Returns:
+        JSONResponse: 200 with ``gpt_weights_files``, ``sovits_weights_files`` and
+        ``server_port``; 404 when either list is empty; 500 with the error text
+        if scanning raises.
+    """
+
+    def list_weight_files(folders):
+        # Collect regular files from each folder. Folders that are missing, or
+        # paths that exist but are not directories, are skipped instead of
+        # letting os.listdir raise.
+        found = []
+        for folder in folders:
+            if os.path.isdir(folder):
+                found.extend(
+                    f"{folder}/{name}"
+                    for name in os.listdir(folder)
+                    if os.path.isfile(os.path.join(folder, name))
+                )
+        return found
+
+    try:
+        # v2 folders are listed first, matching the original ordering.
+        gpt_filenames = list_weight_files(["GPT_weights_v2", "GPT_weights"])
+        sovits_filenames = list_weight_files(["SoVITS_weights_v2", "SoVITS_weights"])
+
+        if not gpt_filenames:
+            return JSONResponse(status_code=404, content={"message": "No GPT weights files found"})
+        if not sovits_filenames:
+            return JSONResponse(status_code=404, content={"message": "No SoVITS weights files found"})
+
+        return JSONResponse(status_code=200, content={
+            "gpt_weights_files": gpt_filenames,
+            "sovits_weights_files": sovits_filenames,
+            "server_port": port,
+        })
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"message": "Error retrieving weights info", "error": str(e)})
+
+@APP.post("/tts")
+async def tts_post_endpoint(request: TTS_Request):
+    """Handle POST /tts: dump the request model to a dict, log it, and pass it to tts_handle."""
+    payload = request.model_dump()
+
+    # Echo the parsed request to stdout, one field per line.
+    print("\nProcessed request (req):")
+    print(f"Type: {type(payload)}")
+    print("Content:")
+    for field_name, field_value in payload.items():
+        print(f"  {field_name}: {field_value}")
+
+    response = await tts_handle(payload)
+    return response
+
+@APP.post("/upload_file")
+async def upload_file(file: UploadFile = File(...)):
+    """Save an uploaded file under ./temp_files and report where it was stored.
+
+    Args:
+        file: The multipart upload; only the final component of its
+            client-supplied filename is used when building the save path.
+
+    Returns:
+        JSONResponse: 200 with ``file_path`` on success; 400 when the filename
+        is empty or not a usable name; 500 with the error text if saving fails.
+    """
+    try:
+        # Create the upload directory if it doesn't exist
+        temp_dir = "temp_files"
+        os.makedirs(temp_dir, exist_ok=True)
+
+        # The filename is client-controlled: normalise Windows separators and keep
+        # only the last path component so values such as "../../x" or "/etc/x"
+        # cannot point outside temp_dir.
+        safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
+        if safe_name in ("", ".", ".."):
+            return JSONResponse(status_code=400, content={"message": "File upload failed", "error": "Invalid file name"})
+
+        # Define the path to save the uploaded file
+        file_path = os.path.join(temp_dir, safe_name)
+
+        # Stream the upload to disk
+        with open(file_path, "wb") as buffer:
+            shutil.copyfileobj(file.file, buffer)
+
+        return JSONResponse(status_code=200, content={"message": "File uploaded successfully", "file_path": file_path})
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"message": "File upload failed", "error": str(e)})
+
+APP.mount("/", StaticFiles(directory="dist", html=True), name="static")  # serve ./dist at the root path; NOTE(review): placed after the API routes, presumably so they are matched first — confirm
+print("--------------------------------")
+print(f"前端界面已在 http://{host}:{port} 开启。")  # "The frontend UI is now available at http://{host}:{port}."
+print("目前的前端版本只适配默认端口9880, 更改api端口会导致前端页面无法工作, 但不影响后端api运行。")  # "The current frontend only supports the default port 9880; changing the API port breaks the frontend page but not the backend API."
+print("在前端界面中上传的音频文件将会保存在 ./temp_files 目录下，如有需要请手动删除。")  # "Audio uploaded through the frontend is saved under ./temp_files; delete it manually if needed."
+print("请至少运行一遍webui.py, 放好模型, 再运行本API, 以确保存放模型的文件夹SoVITS_weights和GPT_weights存在。")  # "Run webui.py at least once and place the models before running this API, so the SoVITS_weights and GPT_weights folders exist."
+print("如遇配置错误，请检查命令行上方输出的配置详情，并修改文件/GPT_SoVITS/configs/tts_infer.yaml")  # "On configuration errors, check the config details printed above and edit /GPT_SoVITS/configs/tts_infer.yaml."
+print("如果运行环境是mac, 请将tts_infer.yaml内custom条目下的device改为cpu, is_half改为false")  # "On macOS, set device to cpu and is_half to false under the custom entry in tts_infer.yaml."
+print("--------------------------------")

 if __name__ == "__main__":
    try:
--- a/colab_webui.ipynb
+++ b/colab_webui.ipynb
@ -1,23 +1,10 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "include_colab_link": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "accelerator": "GPU"
-  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "view-in-github"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@ -25,18 +12,20 @@
    },
    {
      "cell_type": "markdown",
-      "source": [
-        "环境配置 environment"
-      ],
      "metadata": {
        "id": "_o6a8GS2lWQM"
-      }
+      },
+      "source": [
+        "环境配置 environment"
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
      "metadata": {
        "id": "e9b7iFV3dm1f"
      },
+      "outputs": [],
      "source": [
        "!pip install -q condacolab\n",
        "# Setting up condacolab and installing packages\n",
@ -47,13 +36,17 @@
        "!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
        "%cd -q /content/GPT-SoVITS\n",
        "!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
+        "!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
        "!/usr/local/bin/pip install -r requirements.txt"
-      ],
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "0NgxXg5sjv7z"
+      },
+      "outputs": [],
      "source": [
        "# @title Download pretrained models 下载预训练模型\n",
        "!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -71,27 +64,35 @@
        "!git clone https://huggingface.co/Delik/uvr5_weights\n",
        "!git config core.sparseCheckout true\n",
        "!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
-      ],
-      "metadata": {
-        "id": "0NgxXg5sjv7z"
-      },
-      "execution_count": null,
-      "outputs": []
+      ]
    },
    {
      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "4oRGUzkrk8C7"
+      },
+      "outputs": [],
      "source": [
        "# @title launch WebUI 启动WebUI\n",
        "!/usr/local/bin/pip install ipykernel\n",
        "!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
        "%cd /content/GPT-SoVITS/\n",
        "!/usr/local/bin/python  webui.py"
-      ],
-      "metadata": {
-        "id": "4oRGUzkrk8C7"
-      },
-      "execution_count": null,
-      "outputs": []
-    }
      ]
    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "include_colab_link": true,
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/dist/assets/index-BXQvAA72.js
+++ b/dist/assets/index-BXQvAA72.js
--- a/dist/assets/index-Dl43Gj3X.css
+++ b/dist/assets/index-Dl43Gj3X.css
--- a/dist/index.html
+++ b/dist/index.html
@ -0,0 +1,13 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Vite Project</title>
+    <script type="module" crossorigin src="/assets/index-BXQvAA72.js"></script>
+    <link rel="stylesheet" crossorigin href="/assets/index-Dl43Gj3X.css">
+  </head>
+  <body>
+    <div id="app"></div>
+  </body>
+</html>
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@ -286,3 +286,17 @@ https://github.com/RVC-Boss/GPT-SoVITS/pull/2112 https://github.com/RVC-Boss/GPT
 修复短文本语种选择出错 https://github.com/RVC-Boss/GPT-SoVITS/pull/2122

 修复v3sovits未传参以支持调节语速
+
+### 202503
+
+修复一批由依赖的库版本不对导致的问题 https://github.com/RVC-Boss/GPT-SoVITS/commit/6c468583c5566e5fbb4fb805e4cc89c403e997b8
+
+修复模型加载异步逻辑 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
+
+修复其他若干bug
+
+重点更新：
+
+1-v3支持并行推理 https://github.com/RVC-Boss/GPT-SoVITS/commit/03b662a769946b7a6a8569a354860e8eeeb743aa
+
+2-整合包修复onnxruntime GPU推理的支持，影响：（1）g2pw有个onnx模型原先是CPU推理现在用GPU，显著降低推理的CPU瓶颈 （2）foxjoy去混响模型现在可使用GPU推理
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@ -76,6 +76,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -104,6 +105,7 @@ conda install -c conda-forge 'ffmpeg<7'
 安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)

 ##### MacOS 用户
+
 ```bash
 brew install ffmpeg
 ```
@ -111,6 +113,7 @@ brew install ffmpeg
 #### 安装依赖

 ```bash
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -155,7 +158,6 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker

   - 建议在模型名称和配置文件名中**直接指定模型类型**，例如`mel_mand_roformer`、`bs_roformer`。如果未指定，将从配置文中比对特征，以确定它是哪种类型的模型。例如，模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对。

-
 4. 对于中文 ASR（额外功能），从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型，并将它们放置在 `tools/asr/models` 目录中。

 5. 对于英语或日语 ASR（额外功能），从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型，并将其放置在 `tools/asr/models` 目录中。此外，[其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间。
@ -202,6 +204,7 @@ python webui.py <language(optional)>
 ```bash
 python webui.py v1 <language(optional)>
 ```
+
 或者在 webUI 内动态切换

 ### 微调
@ -226,11 +229,13 @@ python webui.py v1 <language(optional)>
 ```bash
 python GPT_SoVITS/inference_webui.py <language(optional)>
 ```
+
 或者

 ```bash
 python webui.py
 ```
+
 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI

 ## V2 发布说明
@ -245,7 +250,7 @@ python webui.py

 4. 对低音质参考音频（尤其是来源于网络的高频严重缺失、听着很闷的音频）合成出来音质更好

-    详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 从 v1 环境迁移至 v2

@ -265,7 +270,7 @@ python webui.py

 2. GPT 合成更稳定，重复漏字更少，也更容易跑出丰富情感

-    详见[wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 从 v2 环境迁移至 v3

@ -277,7 +282,6 @@ python webui.py

   如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题，需要下载额外的模型参数，参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)

-
 ## 待办事项清单

 - [x] **高优先级：**
@ -299,16 +303,21 @@ python webui.py
  - [ ] 模型混合。

 ## （附加）命令行运行方式
+
 使用命令行打开 UVR5 的 WebUI
-````
+
+```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-````
+```
+
 <!-- 如果打不开浏览器，请按照下面的格式进行UVR处理，这是使用mdxnet进行音频处理的方式
 ````
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ```` -->
+
 这是使用命令行完成数据集的音频切分的方式
-````
+
+```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -316,17 +325,22 @@ python audio_slicer.py \
    --min_length <minimum_duration_of_each_subclip> \
    --min_interval <shortest_time_gap_between_adjacent_subclips>
    --hop_size <step_size_for_computing_volume_curve>
-````
+```
+
 这是使用命令行完成数据集 ASR 处理的方式（仅限中文）
-````
+
+```
 python tools/asr/funasr_asr.py -i <input> -o <output>
-````
+```
+
 通过 Faster_Whisper 进行 ASR 处理（除中文之外的 ASR 标记）

 （没有进度条，GPU 性能可能会导致时间延迟）
+
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
+
 启用自定义列表保存路径

 ## 致谢
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 特别感谢以下项目和贡献者：

 ### 理论研究
+
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
+
 ### 预训练模型
+
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+
 ### 推理用文本前端
+
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
+
 ### WebUI 工具
+
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@ -70,7 +70,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。

 ##### MacOS ユーザー
+
 ```bash
 brew install ffmpeg
 ```
@ -104,6 +105,7 @@ brew install ffmpeg
 #### 依存関係をインストールします

 ```bash
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -169,6 +171,7 @@ vocal_path|speaker_name|language|text
 ```
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
+
 ## 微調整と推論

 ### WebUI を開く
@ -189,6 +192,7 @@ V1に切り替えたい場合は
 ```bash
 python webui.py v1 <言語(オプション)>
 ```
+
 または WebUI で手動でバージョンを切り替えてください。

 ### 微調整
@ -213,11 +217,13 @@ python webui.py v1 <言語(オプション)>
 ```bash
 python GPT_SoVITS/inference_webui.py <言語(オプション)>
 ```
+
 または

 ```bash
 python webui.py
 ```
+
 その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。

 ## V2 リリースノート
@ -232,7 +238,7 @@ python webui.py

 4. 低品質の参照音声に対する合成品質の向上

-    [詳細はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 V1 環境から V2 を使用するには:

@ -252,7 +258,7 @@ V1環境からV2を使用するには:

 2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。

-    [詳細情報はこちら](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 v2 環境から v3 を使用する方法:

@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
  - [ ] モデルミックス

 ## (追加の) コマンドラインから実行する方法
+
 コマンド ラインを使用して UVR5 の WebUI を開きます
+
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
+
 <!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
+
 コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
+
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
@ -303,16 +314,21 @@ python audio_slicer.py \
    --min_interval <shortest_time_gap_between_adjacent_subclips>
    --hop_size <step_size_for_computing_volume_curve>
 ```
+
 コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
+
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
+
 ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)

 (進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
+
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
+
 カスタムリストの保存パスが有効になっています

 ## クレジット
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 特に以下のプロジェクトと貢献者に感謝します：

 ### 理論研究
+
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
+
 ### 事前学習モデル
+
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+
 ### 推論用テキストフロントエンド
+
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
+
 ### WebUI ツール
+
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@ -70,7 +70,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)

 ##### MacOS 사용자
+
 ```bash
 brew install ffmpeg
 ```
@ -106,6 +107,7 @@ brew install ffmpeg
 #### 의존성 설치

 ```bash
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -195,6 +197,7 @@ V1으로 전환하려면,
 ```bash
 python webui.py v1 <언어(옵션)>
 ```
+
 또는 WebUI에서 수동으로 버전을 전환하십시오.

 ### 미세 조정
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
 ```bash
 python GPT_SoVITS/inference_webui.py <언어(옵션)>
 ```
+
 또는

 ```bash
 python webui.py
 ```
+
 그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.

 ## V2 릴리스 노트
@ -238,7 +243,7 @@ python webui.py

 4. 저품질 참조 오디오에 대한 합성 품질 향상

-    [자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 V1 환경에서 V2를 사용하려면:

@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:

 2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.

-    [자세한 내용](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 v2 환경에서 v3 사용하기:

@ -270,7 +275,6 @@ v2 환경에서 v3 사용하기:

   추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.

-
 ## 할 일 목록

 - [x] **최우선순위:**
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
  - [ ] 모델 블렌딩.

 ## (추가적인) 명령줄에서 실행하는 방법
+
 명령줄을 사용하여 UVR5용 WebUI 열기
+
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
+
 <!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
+
 명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
+
 ```
 python audio_slicer.py \
    --input_path "<path_to_original_audio_file_or_directory>" \
@ -311,16 +320,21 @@ python audio_slicer.py \
    --min_interval <shortest_time_gap_between_adjacent_subclips>
    --hop_size <step_size_for_computing_volume_curve>
 ```
+
 명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
+
 ```
 python tools/asr/funasr_asr.py -i <input> -o <output>
 ```
+
 ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.

 (진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
+
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
 ```
+
 사용자 정의 목록 저장 경로가 활성화되었습니다.

 ## 감사의 말
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 다음 프로젝트와 기여자들에게 특별히 감사드립니다:

 ### 이론 연구
+
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
+
 ### 사전 학습 모델
+
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+
 ### 추론용 텍스트 프론트엔드
+
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
+
 ### WebUI 도구
+
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@ -72,7 +72,7 @@ bash install.sh
 ```bash
 conda create -n GPTSoVits python=3.9
 conda activate GPTSoVits
-
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.

 ##### MacOS Kullanıcıları
+
 ```bash
 brew install ffmpeg
 ```
@ -106,6 +107,7 @@ brew install ffmpeg
 #### Bağımlılıkları Yükleme

 ```bash
+pip install -r extra-req.txt --no-deps
 pip install -r requirements.txt
 ```

@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
 ```bash
 python webui.py v1 <dil(isteğe bağlı)>
 ```
+
 veya WebUI'de manuel olarak sürüm değiştirin.

 ### İnce Ayar
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
 ```bash
 python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
 ```
+
 VEYA

 ```bash
 python webui.py
 ```
+
 ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.

 ## V2 Sürüm Notları
@ -235,7 +240,7 @@ Yeni Özellikler:

 4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi

-    [detaylar burada](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 V1 ortamından V2'yi kullanmak için:

@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:

 2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.

-    [daha fazla detay](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
+   [daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

 ### v2 ortamında v3 kullanımı:

@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
  - [ ] model karışımı

 ## (Ekstra) Komut satırından çalıştırma yöntemi
+
 UVR5 için Web Arayüzünü açmak için komut satırını kullanın
+
 ```
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
+
 <!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin. Bu yöntem, ses işleme için mdxnet kullanır
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
+
 Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
+
 ```
 python audio_slicer.py \
    --input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
@ -306,16 +316,21 @@ python audio_slicer.py \
    --min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
    --hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
 ```
+
 Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
+
 ```
 python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
 ```
+
 ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)

 (İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
+
 ```
 python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 ```
+
 Özel bir liste kaydetme yolu etkinleştirildi

 ## Katkı Verenler
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:

 ### Teorik Araştırma
+
 - [ar-vits](https://github.com/innnky/ar-vits)
 - [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
 - [vits](https://github.com/jaywalnut310/vits)
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 - [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
+
 ### Önceden Eğitilmiş Modeller
+
 - [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
 - [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
 - [BigVGAN](https://github.com/NVIDIA/BigVGAN)
+
 ### Tahmin İçin Metin Ön Ucu
+
 - [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
 - [split-lang](https://github.com/DoodleBears/split-lang)
 - [g2pW](https://github.com/GitYCC/g2pW)
 - [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
 - [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
+
 ### WebUI Araçları
+
 - [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
 - [audio-slicer](https://github.com/openvpi/audio-slicer)
 - [SubFix](https://github.com/cronrpc/SubFix)
--- a/extra-req.txt
+++ b/extra-req.txt
@ -0,0 +1 @@
+faster-whisper
--- a/gpt-sovits_kaggle.ipynb
+++ b/gpt-sovits_kaggle.ipynb
@ -27,7 +27,8 @@
    "!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
    "%cd GPT-SoVITS\n",
    "!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
-    "!pip install -r requirements.txt"
+    "!pip install -r requirements.txt\n",
+    "!pip install -r extra-req.txt --no-deps"
   ]
  },
  {
--- a/install.sh
+++ b/install.sh
@ -1,15 +1,17 @@
 #!/bin/bash

+set -e
+
 # 安装构建工具
 # Install build tools
 echo "Installing GCC..."
-conda install -c conda-forge gcc=14
+conda install -c conda-forge gcc=14 -y

 echo "Installing G++..."
-conda install -c conda-forge gxx
+conda install -c conda-forge gxx -y

 echo "Installing ffmpeg and cmake..."
-conda install ffmpeg cmake
+conda install ffmpeg cmake -y

 # 设置编译环境
 # Set up build environment
@ -26,7 +28,6 @@ else
    USE_CUDA=false
 fi

-
 if [ "$USE_CUDA" = false ]; then
    echo "Checking for ROCm installation..."
    if [ -d "/opt/rocm" ]; then
@ -56,21 +57,53 @@ else
    conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
 fi

-
 echo "Installing Python dependencies from requirements.txt..."

 # 刷新环境
 # Refresh environment
 hash -r
+
+# pyopenjtalk Installation
+conda install jq -y
+
+OS_TYPE=$(uname)
+
+PACKAGE_NAME="pyopenjtalk"
+
+VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
+
+wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
+
+TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
+DIR_NAME="${TAR_FILE%.tar.gz}"
+
+tar -xzf "$TAR_FILE"
+rm "$TAR_FILE"
+
+CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
+
+if [[ "$OS_TYPE" == "darwin"* ]]; then
+    sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
+else
+    sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
+fi
+
+tar -czf "$TAR_FILE" "$DIR_NAME"
+
+pip install "$TAR_FILE"
+
+rm -rf "$TAR_FILE" "$DIR_NAME"
+
+pip install -r extra-req.txt --no-deps
+
 pip install -r requirements.txt

 if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
    echo "Update to WSL compatible runtime lib..."
-    location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
-    cd ${location}/torch/lib/
+    location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
+    cd "${location}"/torch/lib/ || exit
    rm libhsa-runtime64.so*
    cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
 fi

 echo "Installation completed successfully!"
-
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,7 @@ scipy
 tensorboard
 librosa==0.9.2
 numba==0.56.4
-pytorch-lightning
+pytorch-lightning>2.0
 gradio>=4.0,<=4.24.0
 ffmpeg-python
 onnxruntime; sys_platform == 'darwin'
@ -26,7 +26,6 @@ jieba_fast
 jieba
 split-lang
 fast_langdetect>=0.3.0
-Faster_Whisper
 wordsegment
 rotary_embedding_torch
 ToJyutping 
@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
 fastapi<0.112.2
 x_transformers
 torchmetrics<=1.5
-attrdict
+pydantic<=2.10.6
+ctranslate2>=4.0,<5
+huggingface_hub>=0.13
+tokenizers>=0.13,<1
+av>=11
+tqdm
--- a/webui.py
+++ b/webui.py
@ -298,9 +298,9 @@ def change_tts_inference(bert_path,cnhubert_base_path,gpu_number,gpt_path,sovits
        cmd = '"%s" GPT_SoVITS/inference_webui_fast.py "%s"'%(python_exec, language)
    else:
        cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
-    #####v3暂不支持加速推理
-    if version=="v3":
-        cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
+    # #####v3暂不支持加速推理
+    # if version=="v3":
+    #     cmd = '"%s" GPT_SoVITS/inference_webui.py "%s"'%(python_exec, language)
    if p_tts_inference is None:
        os.environ["gpt_path"]=gpt_path if "/" in gpt_path else "%s/%s"%(GPT_weight_root,gpt_path)
        os.environ["sovits_path"]=sovits_path if "/"in sovits_path else "%s/%s"%(SoVITS_weight_root,sovits_path)
@ -849,8 +849,8 @@ def switch_version(version_):
        {'__type__': 'update', "value": default_sovits_save_every_epoch,"maximum": max_sovits_save_every_epoch}, \
        {'__type__': 'update', "visible": True if version!="v3"else False}, \
        {'__type__': 'update', "value": False if not if_force_ckpt else True, "interactive": True if not if_force_ckpt else False}, \
-        {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \
-        {'__type__': 'update', "visible": True if version== "v3" else False}
+        {'__type__': 'update', "interactive": True, "value": False}, \
+        {'__type__': 'update', "visible": True if version== "v3" else False}        # {'__type__': 'update', "interactive": False if version == "v3" else True, "value": False}, \ ####batch infer

 if os.path.exists('GPT_SoVITS/text/G2PWModel'):...
 else:
Author	SHA1	Message	Date
Monophotic	f93eb3b866	Merge eed2095a42927ea0a9c6bd548eaf4926b6c99f8f into 9da7e17efe05041e31d3c3f42c8730ae890397f2	2025-04-02 04:19:18 +09:00
RVC-Boss	9da7e17efe	Add files via upload	2025-04-01 18:44:35 +08:00
RVC-Boss	b0de354c63	Update Changelog_CN.md	2025-04-01 17:21:48 +08:00
RVC-Boss	41090e5a7c	Update g2pw url	2025-04-01 17:15:52 +08:00
RVC-Boss	605b380114	修复模型加载异步逻辑	2025-04-01 16:50:54 +08:00
RVC-Boss	9f8d455130	支持v3并行推理 support v3 models batch inference	2025-04-01 16:31:48 +08:00
RVC-Boss	7abae557fb	删除加载v3sovits模型缺少enc_q告警	2025-04-01 16:31:15 +08:00
RVC-Boss	6a60e5edb1	v3解锁并行推理;修复模型加载异步逻辑	2025-04-01 16:29:52 +08:00
RVC-Boss	28bdff356f	fix https://github.com/RVC-Boss/GPT-SoVITS/issues/2250	2025-04-01 10:34:02 +08:00
ChasonJiang	03b662a769	为sovits_v3 适配并行推理 (#2241 ) * 为sovits_v3 适配并行推理 * 清理无用代码	2025-03-31 11:56:05 +08:00
XXXXRT666	6c468583c5	Fix dependency-related issues via requirements update (#2236 ) * Update requirements.txt * Create constraints.txt * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * pyopenjtalk and onnx fix * Update requirements.txt * Update requirements.txt * Update install.sh * update shell install.sh * update docs * Update Install.sh * fix bugs * Update .gitignore * Update .gitignore * Update install.sh * Update install.sh * Update extra-req.txt * Update requirements.txt	2025-03-31 11:27:12 +08:00
Monophotic	eed2095a42	Delete dist/vite.svg，移除无用的图片	2024-10-10 11:02:57 +08:00
Monophotic	aad5afd5b0	Update requirements.txt，移除shutil	2024-10-10 11:01:27 +08:00
Svring	87fe6f2fd5	增设前端页面，并改进api_v2以进行适配。	2024-10-09 20:15:40 +08:00