Compare commits

80 Commits

Author SHA1 Message Date
FengQingYunDan
cc88d33348
Merge 50a88a596dea718c83e535136e9cb46b513cef6f into 03b662a769946b7a6a8569a354860e8eeeb743aa 2025-03-31 15:38:15 +08:00
ChasonJiang
03b662a769
Adapt parallel inference for sovits_v3 (#2241)
* Adapt parallel inference for sovits_v3

* Clean up unused code
2025-03-31 11:56:05 +08:00
XXXXRT666
6c468583c5
Fix dependency-related issues via requirements update (#2236)
* Update requirements.txt

* Create constraints.txt

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* pyopenjtalk and onnx fix

* Update requirements.txt

* Update requirements.txt

* Update install.sh

* update shell install.sh

* update docs

* Update Install.sh

* fix bugs

* Update .gitignore

* Update .gitignore

* Update install.sh

* Update install.sh

* Update extra-req.txt

* Update requirements.txt
2025-03-31 11:27:12 +08:00
Downupanddownup
50a88a596d Update to the latest official gsv version 2024-09-15 19:25:58 +08:00
Downupanddownup
86e5b67448 Reduce inference text 2024-08-05 20:11:11 +08:00
Downupanddownup
2faf74beaa Merge branch 'main' into ref_audio_selector_tool 2024-06-22 10:00:36 +08:00
Downupanddownup
16b3c2a131 -- 2024-06-15 02:06:10 +08:00
Downupanddownup
9f418af1dd -- 2024-06-06 22:02:40 +08:00
Downupanddownup
5ffb193bcd Initialize to numbers 2024-06-06 18:14:38 +08:00
Downupanddownup
56d6ae6b3b Merge branch 'main' into ref_audio_selector_tool 2024-06-06 18:04:49 +08:00
Downupanddownup
7c3c778b17 Add text that may contain high-frequency sibilance 2024-05-07 19:08:00 +08:00
Downupanddownup
61b21e1fca Merge branch 'main' into ref_audio_selector_tool 2024-05-03 07:10:29 +08:00
Downupanddownup
18002ad809 Fix bugs 2024-05-02 09:58:28 +08:00
Downupanddownup
12fa7d875f Fix bugs 2024-05-02 09:56:00 +08:00
Downupanddownup
48cc70a7de Fix bugs 2024-05-02 08:02:56 +08:00
Downupanddownup
3ac7aad4d0 Fix bugs 2024-05-02 07:39:43 +08:00
Downupanddownup
036d828a7e Fix bugs 2024-05-02 07:26:26 +08:00
Downupanddownup
7e1c40ef9f 00 2024-05-01 21:52:09 +08:00
Downupanddownup
fdffd50066 00 2024-05-01 21:23:28 +08:00
Downupanddownup
8a10c528e3 Fix bugs 2024-05-01 01:49:37 +08:00
Downupanddownup
02fabe807f Fix bugs 2024-05-01 00:46:42 +08:00
Downupanddownup
4ebcb3bf1b Fix bugs 2024-05-01 00:24:40 +08:00
Downupanddownup
5843d56c4e Fix bugs 2024-05-01 00:11:27 +08:00
Downupanddownup
7660f1c8fb Improve monitoring info 2024-04-30 10:59:31 +08:00
Downupanddownup
fa45c5ac4f Fix bugs 2024-04-30 10:04:41 +08:00
Downupanddownup
2dc36d3d60 Add workflow documentation 2024-04-29 15:42:46 +08:00
Downupanddownup
f70fd8ff87 Improve documentation 2024-04-29 15:37:47 +08:00
Downupanddownup
ed8d276ac9 Improve documentation 2024-04-29 15:02:22 +08:00
Downupanddownup
1de89feb7b Improve code 2024-04-29 14:49:27 +08:00
Downupanddownup
b8356880dc Improve code 2024-04-29 14:19:24 +08:00
Downupanddownup
8182908f7d Add speaker-verification model switching 2024-04-29 14:14:16 +08:00
Downupanddownup
5081168918 Add switch/reset button events 2024-04-29 13:04:33 +08:00
Downupanddownup
c26fa983a4 Add selection for reference type 2024-04-29 11:23:41 +08:00
Downupanddownup
5280d17d2f Adjust UI layout 2024-04-29 10:49:07 +08:00
Downupanddownup
371a2d7138 Bug adjustments 2024-04-29 10:13:22 +08:00
Downupanddownup
fe969ab9a2 Test 2024-04-29 00:58:15 +08:00
Downupanddownup
61db7f05dc Test 2024-04-29 00:41:27 +08:00
Downupanddownup
536c226b1a Add URL message prompt 2024-04-29 00:32:01 +08:00
Downupanddownup
01468158d3 Add URL message prompt 2024-04-29 00:29:52 +08:00
Downupanddownup
c9547ab669 Add URL message prompt 2024-04-29 00:27:38 +08:00
Downupanddownup
b1ad8b5dcd Fix bugs 2024-04-29 00:20:13 +08:00
Downupanddownup
d6e255a071 Add Windows launch file 2024-04-28 20:21:15 +08:00
Downupanddownup
e89f986e3f Add UI parameter writing 2024-04-28 19:07:09 +08:00
Downupanddownup
af0bd9f414 Add UI initialization values 2024-04-28 18:47:44 +08:00
Downupanddownup
13567362d9 Extract some shared components 2024-04-28 16:44:45 +08:00
Downupanddownup
27325f4cf9 Adjust project structure; fix random-sampling bug 2024-04-28 15:49:05 +08:00
Downupanddownup
6cb3c15448 Add ASR for non-Chinese languages 2024-04-28 15:20:10 +08:00
Downupanddownup
9264f7e38e Add event binding and implementation 2024-04-28 14:10:02 +08:00
Downupanddownup
25b65cdfd0 Adjust UI layout 2024-04-27 22:09:03 +08:00
Downupanddownup
1a7cf580e0 Create log directory 2024-04-27 11:24:57 +08:00
Downupanddownup
c36d0a93fe API inference: add multi-process requests 2024-04-27 01:27:57 +08:00
Downupanddownup
2a23f95f61 Fix bugs 2024-04-26 22:55:09 +08:00
Downupanddownup
d1e92edc7c Add reading and saving of some parameters 2024-04-26 17:46:40 +08:00
Downupanddownup
d8d551d4d2 Fix bugs 2024-04-26 17:10:23 +08:00
Downupanddownup
1d434e1a0a Add default values for first launch 2024-04-26 17:01:03 +08:00
Downupanddownup
9fe20c14d6 Add audio pre-sampling toggle 2024-04-26 16:27:21 +08:00
Downupanddownup
64cc2fd9d1 Switch print statements to log output 2024-04-26 16:18:40 +08:00
Downupanddownup
a291629438 Audio similarity comparison: add pre-sampling step for reference audio 2024-04-26 15:37:58 +08:00
Downupanddownup
e3e47d2c06 Audio similarity comparison: add pre-sampling step for reference audio 2024-04-26 15:08:33 +08:00
Downupanddownup
ca9ffbf98e Audio similarity comparison: add pre-sampling step for reference audio 2024-04-26 15:00:34 +08:00
Downupanddownup
684e1cfd2f Text similarity: add GPU acceleration 2024-04-26 14:31:54 +08:00
Downupanddownup
878fef248a Fix bugs 2024-04-26 14:16:16 +08:00
Downupanddownup
2880e3a6f8 Add performance monitoring 2024-04-26 13:25:02 +08:00
Downupanddownup
1da23aa259 Fix bugs 2024-04-25 22:54:40 +08:00
Downupanddownup
c8be484c0e Add path cleanup 2024-04-25 19:09:27 +08:00
Downupanddownup
d855eecc7b Add directory saving 2024-04-25 18:50:52 +08:00
Downupanddownup
d20bd37965 Centralize configuration parameters 2024-04-25 17:36:13 +08:00
Downupanddownup
926dd6b34a Adjust config management; remove writes 2024-04-25 17:13:30 +08:00
Downupanddownup
f61a723bab Add filtering for 3s-10s audio 2024-04-25 16:45:42 +08:00
Downupanddownup
441ab54889 Adjust URL encoding 2024-04-25 16:39:56 +08:00
Downupanddownup
ecbc7d0b1e Add config file management 2024-04-25 16:20:11 +08:00
Downupanddownup
b6f0bb36ef Add reference-audio sync code 2024-04-25 13:26:32 +08:00
Downupanddownup
4daa9ad53c Add text-similarity comparison feature 2024-04-25 11:54:13 +08:00
Downupanddownup
2c8f6bd4c9 Config file generation, audio sampling, audio inference testing 2024-04-25 00:22:58 +08:00
Downupanddownup
4cbbe2a258 Adjust directory structure 2024-04-24 18:57:36 +08:00
Downupanddownup
a1fc00a9d8 Adjust directory structure 2024-04-24 18:53:00 +08:00
Downupanddownup
8c9627bb30 Complete features 2024-04-24 16:58:02 +08:00
Downupanddownup
e69e449599 Complete features 2024-04-24 16:54:51 +08:00
Downupanddownup
29b8370c45 Add method to convert reference audio from a list 2024-04-23 23:56:49 +08:00
Downupanddownup
7efdf31113 Add reference-audio filtering UI 2024-04-23 23:31:02 +08:00
47 changed files with 3757 additions and 224 deletions

.gitignore vendored

@@ -18,5 +18,183 @@ TEMP
weight.json
ffmpeg*
ffprobe*
cfg.json
speakers.json
ref_audios
tools/AP_BWE_main/24kto48k/*
!tools/AP_BWE_main/24kto48k/readme.txt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
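The ignore rules above are glob patterns. As a rough illustration only (not Git's actual matcher — Git additionally supports `**`, `!` negation, and directory anchoring), Python's `fnmatch` shows how such patterns match file basenames:

```python
# Illustrative sketch: matching a few of the patterns from the
# .gitignore above against basenames with stdlib fnmatch.
from fnmatch import fnmatch

ignore_patterns = ["*.py[cod]", "*.egg-info", "ffmpeg*", ".coverage.*"]

def is_ignored(name: str) -> bool:
    # A name is ignored if any pattern matches it.
    return any(fnmatch(name, pat) for pat in ignore_patterns)
```

Note this is a simplification: real gitignore semantics also consider the path relative to the repository root, trailing-slash directory rules, and later-pattern overrides.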

@@ -3,7 +3,7 @@ import math
import os, sys, gc
import random
import traceback
import time
import torchaudio
from tqdm import tqdm
now_dir = os.getcwd()
@@ -908,11 +908,14 @@ class TTS:
split_bucket = False
print(i18n("分段返回模式不支持分桶处理,已自动关闭分桶处理"))
if split_bucket and speed_factor==1.0:
if split_bucket and speed_factor==1.0 and not (self.configs.is_v3_synthesizer and parallel_infer):
print(i18n("分桶处理模式已开启"))
elif speed_factor!=1.0:
print(i18n("语速调节不支持分桶处理,已自动关闭分桶处理"))
split_bucket = False
elif self.configs.is_v3_synthesizer and parallel_infer:
print(i18n("当开启并行推理模式时，SoVits V3模型不支持分桶处理，已自动关闭分桶处理"))
split_bucket = False
else:
print(i18n("分桶处理模式已关闭"))
@@ -936,7 +939,7 @@ class TTS:
raise ValueError("ref_audio_path cannot be empty, when the reference audio is not set using set_ref_audio()")
###### setting reference audio and prompt text preprocessing ########
t0 = ttime()
t0 = time.perf_counter()
if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"]):
if not os.path.exists(ref_audio_path):
raise ValueError(f"{ref_audio_path} not exists")
@@ -975,7 +978,7 @@ class TTS:
###### text preprocessing ########
t1 = ttime()
t1 = time.perf_counter()
data:list = None
if not return_fragment:
data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
@@ -1027,7 +1030,7 @@ class TTS:
return batch[0]
t2 = ttime()
t2 = time.perf_counter()
try:
print("############ 推理 ############")
###### inference ######
@@ -1036,7 +1039,7 @@ class TTS:
audio = []
output_sr = self.configs.sampling_rate if not self.configs.is_v3_synthesizer else 24000
for item in data:
t3 = ttime()
t3 = time.perf_counter()
if return_fragment:
item = make_batch(item)
if item is None:
@@ -1071,7 +1074,7 @@ class TTS:
max_len=max_len,
repetition_penalty=repetition_penalty,
)
t4 = ttime()
t4 = time.perf_counter()
t_34 += t4 - t3
refer_audio_spec:torch.Tensor = [item.to(dtype=self.precision, device=self.configs.device) for item in self.prompt_cache["refer_spec"]]
@@ -1094,6 +1097,7 @@ class TTS:
print(f"############ {i18n('合成音频')} ############")
if not self.configs.is_v3_synthesizer:
if speed_factor == 1.0:
print(f"{i18n('并行合成中')}...")
# ## vits parallel inference, method 2
pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
upsample_rate = math.prod(self.vits_model.upsample_rates)
@@ -1118,17 +1122,28 @@ class TTS:
audio_fragment
) ### try reconstructing without the prompt part
else:
for i, idx in enumerate(tqdm(idx_list)):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0) # mq needs one extra unsqueeze
audio_fragment = self.v3_synthesis(
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
)
batch_audio_fragment.append(
audio_fragment
)
if parallel_infer:
print(f"{i18n('并行合成中')}...")
audio_fragments = self.v3_synthesis_batched_infer(
idx_list,
pred_semantic_list,
batch_phones,
speed=speed_factor,
sample_steps=sample_steps
)
batch_audio_fragment.extend(audio_fragments)
else:
for i, idx in enumerate(tqdm(idx_list)):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
_pred_semantic = (pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)) # .unsqueeze(0) # mq needs one extra unsqueeze
audio_fragment = self.v3_synthesis(
_pred_semantic, phones, speed=speed_factor, sample_steps=sample_steps
)
batch_audio_fragment.append(
audio_fragment
)
t5 = ttime()
t5 = time.perf_counter()
t_45 += t5 - t4
if return_fragment:
print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
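The hunks above replace the repo's `ttime()` helper with `time.perf_counter()`. `perf_counter` is a monotonic, high-resolution clock, so stage durations can never go negative when the system clock is adjusted, unlike `time.time()`. A minimal sketch of the same stage-timing pattern (the stage bodies here are placeholders, not the real preprocessing/inference steps):

```python
import time

def timed_stages():
    # Placeholder work stands in for the preprocessing / inference stages.
    t0 = time.perf_counter()
    sum(range(100_000))  # stage 1
    t1 = time.perf_counter()
    sum(range(200_000))  # stage 2
    t2 = time.perf_counter()
    # perf_counter() is monotonic: these deltas are always >= 0.
    return t1 - t0, t2 - t1

stage1_s, stage2_s = timed_stages()
print(f"{stage1_s:.6f}\t{stage2_s:.6f}")
```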
@@ -1219,13 +1234,13 @@ class TTS:
if super_sampling:
print(f"############ {i18n('音频超采样')} ############")
t1 = ttime()
t1 = time.perf_counter()
self.init_sr_model()
if not self.sr_model_not_exist:
audio,sr=self.sr_model(audio.unsqueeze(0),sr)
max_audio=np.abs(audio).max()
if max_audio > 1: audio /= max_audio
t2 = ttime()
t2 = time.perf_counter()
print(f"超采样用时:{t2-t1:.3f}s")
else:
audio = audio.cpu().numpy()
@@ -1260,7 +1275,7 @@ class TTS:
ref_audio = ref_audio.mean(0).unsqueeze(0)
if ref_sr!=24000:
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
# print("ref_audio",ref_audio.abs().mean())
mel2 = mel_fn(ref_audio)
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2])
@@ -1285,15 +1300,156 @@ class TTS:
cfm_res = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
cfm_res = cfm_res[:, :, mel2.shape[2]:]
mel2 = cfm_res[:, :, -T_min:]
fea_ref = fea_todo_chunk[:, :, -T_min:]
cfm_resss.append(cfm_res)
cmf_res = torch.cat(cfm_resss, 2)
cmf_res = denorm_spec(cmf_res)
cfm_res = torch.cat(cfm_resss, 2)
cfm_res = denorm_spec(cfm_res)
with torch.inference_mode():
wav_gen = self.bigvgan_model(cmf_res)
wav_gen = self.bigvgan_model(cfm_res)
audio=wav_gen[0][0]#.cpu().detach().numpy()
return audio
def v3_synthesis_batched_infer(self,
idx_list:List[int],
semantic_tokens_list:List[torch.Tensor],
batch_phones:List[torch.Tensor],
speed:float=1.0,
sample_steps:int=32
)->List[torch.Tensor]:
prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
refer_audio_spec = self.prompt_cache["refer_spec"][0].to(dtype=self.precision, device=self.configs.device)
fea_ref,ge = self.vits_model.decode_encp(prompt_semantic_tokens, prompt_phones, refer_audio_spec)
ref_audio:torch.Tensor = self.prompt_cache["raw_audio"]
ref_sr = self.prompt_cache["raw_sr"]
ref_audio=ref_audio.to(self.configs.device).float()
if (ref_audio.shape[0] == 2):
ref_audio = ref_audio.mean(0).unsqueeze(0)
if ref_sr!=24000:
ref_audio=resample(ref_audio, ref_sr, self.configs.device)
mel2 = mel_fn(ref_audio)
mel2 = norm_spec(mel2)
T_min = min(mel2.shape[2], fea_ref.shape[2])
mel2 = mel2[:, :, :T_min]
fea_ref = fea_ref[:, :, :T_min]
if (T_min > 468):
mel2 = mel2[:, :, -468:]
fea_ref = fea_ref[:, :, -468:]
T_min = 468
chunk_len = 934 - T_min
mel2=mel2.to(self.precision)
# #### batched inference
overlapped_len = 12
feat_chunks = []
feat_lens = []
feat_list = []
for i, idx in enumerate(idx_list):
phones = batch_phones[i].unsqueeze(0).to(self.configs.device)
semantic_tokens = semantic_tokens_list[i][-idx:].unsqueeze(0).unsqueeze(0) # .unsqueeze(0) # mq needs one extra unsqueeze
feat, _ = self.vits_model.decode_encp(semantic_tokens, phones, refer_audio_spec, ge, speed)
feat_list.append(feat)
feat_lens.append(feat.shape[2])
feats = torch.cat(feat_list, 2)
feats_padded = F.pad(feats, (overlapped_len,0), "constant", 0)
pos = 0
padding_len = 0
while True:
if pos ==0:
chunk = feats_padded[:, :, pos:pos + chunk_len]
else:
pos = pos - overlapped_len
chunk = feats_padded[:, :, pos:pos + chunk_len]
pos += chunk_len
if (chunk.shape[-1] == 0): break
# padding for the last chunk
padding_len = chunk_len - chunk.shape[2]
if padding_len != 0:
chunk = F.pad(chunk, (0,padding_len), "constant", 0)
feat_chunks.append(chunk)
feat_chunks = torch.cat(feat_chunks, 0)
bs = feat_chunks.shape[0]
fea_ref = fea_ref.repeat(bs,1,1)
fea = torch.cat([fea_ref, feat_chunks], 2).transpose(2, 1)
pred_spec = self.vits_model.cfm.inference(fea, torch.LongTensor([fea.size(1)]).to(fea.device), mel2, sample_steps, inference_cfg_rate=0)
pred_spec = pred_spec[:, :, -chunk_len:]
dd = pred_spec.shape[1]
pred_spec = pred_spec.permute(1, 0, 2).contiguous().view(dd, -1).unsqueeze(0)
# pred_spec = pred_spec[..., :-padding_len]
pred_spec = denorm_spec(pred_spec)
with torch.no_grad():
wav_gen = self.bigvgan_model(pred_spec)
audio = wav_gen[0][0]#.cpu().detach().numpy()
audio_fragments = []
upsample_rate = 256
pos = 0
while pos < audio.shape[-1]:
audio_fragment = audio[pos:pos+chunk_len*upsample_rate]
audio_fragments.append(audio_fragment)
pos += chunk_len*upsample_rate
audio = self.sola_algorithm(audio_fragments, overlapped_len*upsample_rate)
audio = audio[overlapped_len*upsample_rate:-padding_len*upsample_rate]
audio_fragments = []
for feat_len in feat_lens:
audio_fragment = audio[:feat_len*upsample_rate]
audio_fragments.append(audio_fragment)
audio = audio[feat_len*upsample_rate:]
return audio_fragments
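`v3_synthesis_batched_infer` above concatenates the per-sentence features, then slices them into fixed-length chunks that overlap by `overlapped_len` frames, zero-padding the final chunk so every chunk has the same length. A small sketch of that chunking step, with NumPy standing in for torch (assumes `overlap < chunk_len`; not the exact loop above, which tracks `pos` slightly differently):

```python
import numpy as np

def chunk_with_overlap(feats: np.ndarray, chunk_len: int, overlap: int) -> np.ndarray:
    """Split the last axis into fixed-size chunks that overlap by
    `overlap` frames; the final chunk is zero-padded to chunk_len."""
    pad_spec = [(0, 0)] * (feats.ndim - 1)
    # Prepend `overlap` zeros so the first chunk also carries (silent)
    # left context, mirroring the F.pad(feats, (overlapped_len, 0)) above.
    padded = np.pad(feats, pad_spec + [(overlap, 0)])
    chunks, pos = [], 0
    while pos < padded.shape[-1]:
        chunk = padded[..., pos:pos + chunk_len]
        short = chunk_len - chunk.shape[-1]
        if short:  # zero-pad the last, shorter chunk
            chunk = np.pad(chunk, pad_spec + [(0, short)])
        chunks.append(chunk)
        pos += chunk_len - overlap  # advance, stepping back by the overlap
    return np.stack(chunks, 0)  # (n_chunks, ..., chunk_len)
```

Stacking equal-length chunks along a new leading axis is what lets the decoder run them as one batch.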
def sola_algorithm(self,
audio_fragments:List[torch.Tensor],
overlap_len:int,
):
for i in range(len(audio_fragments)-1):
f1 = audio_fragments[i]
f2 = audio_fragments[i+1]
w1 = f1[-overlap_len:]
w2 = f2[:overlap_len]
assert w1.shape == w2.shape
corr = F.conv1d(w1.view(1,1,-1), w2.view(1,1,-1),padding=w2.shape[-1]//2).view(-1)[:-1]
idx = corr.argmax()
f1_ = f1[:-(overlap_len-idx)]
audio_fragments[i] = f1_
f2_ = f2[idx:]
window = torch.hann_window((overlap_len-idx)*2, device=f1.device, dtype=f1.dtype)
f2_[:(overlap_len-idx)] = window[:(overlap_len-idx)]*f2_[:(overlap_len-idx)] + window[(overlap_len-idx):]*f1[-(overlap_len-idx):]
audio_fragments[i+1] = f2_
return torch.cat(audio_fragments, 0)
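`sola_algorithm` above stitches the chunked outputs back together: it picks the best alignment offset inside the overlap via `F.conv1d` cross-correlation, then crossfades the seam with a Hann window. A NumPy sketch of just the crossfade step (the correlation-based offset search is omitted for brevity):

```python
import numpy as np

def crossfade_concat(f1: np.ndarray, f2: np.ndarray, overlap: int) -> np.ndarray:
    """Join two fragments whose ends overlap by `overlap` samples,
    blending the seam with a Hann-window crossfade."""
    fade = np.hanning(2 * overlap)
    fade_in = fade[:overlap]    # rises 0 -> ~1, applied to the incoming fragment
    fade_out = fade[overlap:]   # falls ~1 -> 0, applied to the outgoing fragment
    blended = f1[-overlap:] * fade_out + f2[:overlap] * fade_in
    return np.concatenate([f1[:-overlap], blended, f2[overlap:]])
```

This matches the windowing in the method above (rising half on `f2`, falling half on `f1`); the full SOLA variant first trims both fragments by the correlation-selected offset so the blended regions are phase-aligned.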

@@ -398,4 +398,5 @@ arpa = {
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
symbols = sorted(set(symbols))
if __name__ == "__main__":
print(symbols)
print(len(symbols))

@@ -1,42 +1,37 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
],
"metadata": {
"id": "himHYZmra7ix"
}
},
"source": [
"# Credits for bubarino giving me the huggingface import code (感谢 bubarino 给了我 huggingface 导入代码)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r extra-req.txt --no-deps\n",
"!pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@@ -53,16 +48,16 @@
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"outputs": [],
"source": [
"#@title Create folder models 创建文件夹模型\n",
"import os\n",
@@ -77,16 +72,16 @@
" print(f\"The folder '{folder_name}' was created successfully! (文件夹'{folder_name}'已成功创建!)\")\n",
"\n",
"print(\"All folders have been created. (所有文件夹均已创建。)\")"
],
"metadata": {
"cellView": "form",
"id": "cPDEH-9czOJF"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"outputs": [],
"source": [
"import requests\n",
"import zipfile\n",
@@ -124,29 +119,35 @@
" shutil.move(source_path, destination_path)\n",
"\n",
"print(f'Model downloaded. (模型已下载。)')"
],
"metadata": {
"cellView": "form",
"id": "vbZY-LnM0tzq"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7",
"cellView": "form"
},
"execution_count": null,
"outputs": []
]
}
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

@@ -1,6 +1,5 @@
<div align="center">
<h1>GPT-SoVITS-WebUI</h1>
A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
@@ -77,6 +76,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@@ -105,6 +105,7 @@ Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWeb
Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) (Korean TTS Only)
##### MacOS Users
```bash
brew install ffmpeg
```
@@ -112,6 +113,7 @@ brew install ffmpeg
#### Install Dependencies
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@@ -150,9 +152,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
- If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
- The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
@@ -200,6 +202,7 @@ if you want to switch to V1, then
```bash
python webui.py v1 <language(optional)>
```
Or manually switch version in WebUI
### Finetune
@@ -217,18 +220,20 @@ Or manually switch version in WebUI
#### Integrated Package Users
Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1`, then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
#### Others
```bash
python GPT_SoVITS/inference_webui.py <language(optional)>
```
OR
```bash
python webui.py
```
then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
## V2 Release Notes
@@ -243,7 +248,7 @@ New Features:
4. Improved synthesis quality for low-quality reference audio
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v2 from v1 environment:
@@ -253,7 +258,7 @@ Use v2 from v1 environment:
3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`.
Chinese v2 additional: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.
## V3 Release Notes
@@ -263,7 +268,7 @@ New Features:
2. GPT model is more stable, with fewer repetitions and omissions, and it is easier to generate speech with richer emotional expression.
[more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))
[more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
Use v3 from v2 environment:
@@ -273,8 +278,7 @@ Use v3 from v2 environment:
3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS\pretrained_models`.
additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
## Todo List
@@ -297,15 +301,20 @@ Use v3 from v2 environment:
- [ ] model mix
## (Additional) Method for running from the command line
Use the command line to open the WebUI for UVR5
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- If you can't open a browser, follow the format below for UVR processing. This uses mdxnet for audio processing
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
This is how the audio segmentation of the dataset is done using the command line
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
@@ -315,16 +324,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips>
--hop_size <step_size_for_computing_volume_curve>
```
This is how dataset ASR processing is done using the command line (Chinese only)
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR processing is performed through Faster_Whisper (ASR labeling for languages other than Chinese)
(No progress bars, GPU performance may cause time delays)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
A custom list save path is enabled
## Credits
@@ -332,6 +346,7 @@ A custom list save path is enabled
Special thanks to the following projects and contributors:
### Theoretical Research
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@@ -341,17 +356,23 @@ Special thanks to the following projects and contributors:
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Pretrained Models
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Tools
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

@ -0,0 +1,156 @@
from tools import my_utils
from config import python_exec, is_half
import subprocess
import sys
import os
class RefAudioListManager:
def __init__(self, root_dir):
self.audio_dict = {'default': []}
absolute_root = os.path.abspath(root_dir)
for subdir, dirs, files in os.walk(absolute_root):
relative_path = os.path.relpath(subdir, absolute_root)
if relative_path == '.':
category = 'default'
else:
category = relative_path.replace(os.sep, '')
for file in files:
if file.endswith('.wav'):
# 将相对路径转换为绝对路径
audio_abs_path = os.path.join(subdir, file)
if category not in self.audio_dict:
self.audio_dict[category] = []
self.audio_dict[category].append(audio_abs_path)
def get_audio_list(self):
return self.audio_dict
def get_flattened_audio_list(self):
all_audio_files = []
for category_audios in self.audio_dict.values():
all_audio_files.extend(category_audios)
return all_audio_files
def get_ref_audio_list(self):
audio_info_list = []
for category, audio_paths in self.audio_dict.items():
for audio_path in audio_paths:
filename_without_extension = os.path.splitext(os.path.basename(audio_path))[0]
audio_info = {
'emotion': f"{category}-{filename_without_extension}",
'ref_path': audio_path,
'ref_text': filename_without_extension,
}
audio_info_list.append(audio_info)
return audio_info_list
def batch_clean_paths(paths):
"""
Apply clean_path() to every path in the list.
Args:
paths (list[str]): paths to clean
Returns:
list[str]: the cleaned paths
"""
cleaned_paths = []
for path in paths:
cleaned_paths.append(my_utils.clean_path(path))
return cleaned_paths
def read_text_file_to_list(file_path):
# open with UTF-8 encoding so Chinese text is read correctly
with open(file_path, mode='r', encoding='utf-8') as file:
# read all lines into a list
lines = file.read().splitlines()
return lines
def get_filename_without_extension(file_path):
"""
Given a file path string, returns the file name without its extension.
Parameters:
file_path (str): The full path to the file.
Returns:
str: The file name without its extension.
"""
base_name = os.path.basename(file_path) # Get the base name (file name with extension)
file_name, file_extension = os.path.splitext(base_name) # Split the base name into file name and extension
return file_name # Return the file name without extension
def read_file(file_path):
# open and read the file inside a with block
with open(file_path, 'r', encoding='utf-8') as file:  # 'r' = read mode
# read the entire file at once
file_content = file.read()
# the file is closed automatically when the with block exits
return file_content
def write_text_to_file(text, output_file_path):
try:
with open(output_file_path, 'w', encoding='utf-8') as file:
file.write(text)
except IOError as e:
print(f"Error occurred while writing to the file: {e}")
else:
print(f"Text successfully written to file: {output_file_path}")
def check_path_existence_and_return(path):
"""
Check whether the given file or directory exists.
:param path: the file or directory path to check
:return: the original path if it exists, otherwise an empty string
"""
"""
if os.path.exists(path):
return path
else:
return ""
def open_file(filepath):
if sys.platform.startswith('darwin'):
subprocess.run(['open', filepath]) # macOS
elif os.name == 'nt': # For Windows
os.startfile(filepath)
elif os.name == 'posix': # For Linux, Unix, etc.
subprocess.run(['xdg-open', filepath])
def start_new_service(script_path):
# Windows: open a new console window
if sys.platform.startswith('win'):
cmd = f'start cmd /k {python_exec} {script_path}'
# macOS / Linux: fall back to xterm
else:
cmd = f'xterm -e {python_exec} {script_path}'
proc = subprocess.Popen(cmd, shell=True)
# to stop the launched child process later:
# proc.terminate()
# or, to force-kill it:
# proc.kill()
return proc
if __name__ == '__main__':
dir = r'C:\Users\Administrator\Desktop/test'
dir2 = r'"C:\Users\Administrator\Desktop\test2"'
dir, dir2 = batch_clean_paths([dir, dir2])
print(dir, dir2)
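The directory walk in `RefAudioListManager.__init__` is the core of this file; here is a minimal, self-contained sketch of the same categorization behavior (the helper name `build_audio_dict` and the temporary directory layout are illustrative, not part of the repo):

```python
import os
import tempfile

def build_audio_dict(root_dir):
    # mirrors RefAudioListManager.__init__: group .wav files by their
    # subdirectory relative to root_dir ('default' for the root itself)
    audio_dict = {'default': []}
    absolute_root = os.path.abspath(root_dir)
    for subdir, dirs, files in os.walk(absolute_root):
        relative_path = os.path.relpath(subdir, absolute_root)
        category = 'default' if relative_path == '.' else relative_path.replace(os.sep, '')
        for file in files:
            if file.endswith('.wav'):
                audio_dict.setdefault(category, []).append(os.path.join(subdir, file))
    return audio_dict

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, 'happy'))
open(os.path.join(root, 'a.wav'), 'w').close()
open(os.path.join(root, 'happy', 'b.wav'), 'w').close()
d = build_audio_dict(root)
print(sorted(d.keys()))  # ['default', 'happy']
```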


@ -0,0 +1,46 @@
import os
import re
pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
SoVITS_weight_root = "SoVITS_weights"
GPT_weight_root = "GPT_weights"
os.makedirs(SoVITS_weight_root, exist_ok=True)
os.makedirs(GPT_weight_root, exist_ok=True)
speaker_verification_models = {
'speech_campplus_sv_zh-cn_16k-common': {
'task': 'speaker-verification',
'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_campplus_sv_zh-cn_16k-common',
'model_revision': 'v1.0.0'
},
'speech_eres2net_sv_zh-cn_16k-common': {
'task': 'speaker-verification',
'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_eres2net_sv_zh-cn_16k-common',
'model_revision': 'v1.0.5'
}
}
def custom_sort_key(s):
# split the string into digit and non-digit runs (raw string avoids an invalid-escape warning)
parts = re.split(r'(\d+)', s)
# convert digit runs to int so they compare numerically
parts = [int(part) if part.isdigit() else part for part in parts]
return parts
def get_gpt_model_names():
gpt_names = [pretrained_gpt_name]
for name in os.listdir(GPT_weight_root):
if name.endswith(".ckpt"): gpt_names.append("%s/%s" % (GPT_weight_root, name))
gpt_names.sort(key=custom_sort_key)  # sort in place; a bare sorted() call discarded its result
return gpt_names
def get_sovits_model_names():
sovits_names = [pretrained_sovits_name]
for name in os.listdir(SoVITS_weight_root):
if name.endswith(".pth"): sovits_names.append("%s/%s" % (SoVITS_weight_root, name))
sovits_names.sort(key=custom_sort_key)  # sort in place; a bare sorted() call discarded its result
return sovits_names
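`custom_sort_key` implements a natural sort, so checkpoint names containing numbers order numerically rather than lexicographically. A quick self-contained illustration (the sample file names are invented):

```python
import re

def custom_sort_key(s):
    # split into digit and non-digit runs; digit runs compare as integers
    parts = re.split(r'(\d+)', s)
    return [int(part) if part.isdigit() else part for part in parts]

names = ['model_10.ckpt', 'model_2.ckpt', 'model_1.ckpt']
print(sorted(names, key=custom_sort_key))
# ['model_1.ckpt', 'model_2.ckpt', 'model_10.ckpt']
# a plain lexicographic sort would put 'model_10.ckpt' before 'model_2.ckpt'
```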


@ -0,0 +1,72 @@
import time
import os
from Ref_Audio_Selector.config_param.log_config import p_logger
import Ref_Audio_Selector.config_param.config_params as params
def timeit_decorator(func):
"""
Decorator that measures the wrapped function's execution time.
Args:
func (function): the function to time
Returns:
function: a wrapper that adds timing
"""
def wrapper(*args, **kwargs):
if params.time_log_print_type != 'file':
return func(*args, **kwargs)
start_time = time.perf_counter()  # high-resolution timer start
func_result = func(*args, **kwargs)  # run the original function
end_time = time.perf_counter()  # timer end
elapsed_time = end_time - start_time
# log entry
log_message = f"进程ID: {os.getpid()}, {func.__name__} 执行耗时: {elapsed_time:.6f}"
p_logger.info(log_message)
return func_result
return wrapper
def time_monitor(func):
"""
Returns (elapsed_time, result) for the wrapped call.
"""
def wrapper(*args, **kwargs):
start_time = time.perf_counter()  # high-resolution timer start
func_result = func(*args, **kwargs)  # run the original function
end_time = time.perf_counter()  # timer end
elapsed_time = end_time - start_time
return elapsed_time, func_result
return wrapper
# decorator usage example
@timeit_decorator
def example_function(n):
time.sleep(n)  # simulate a slow operation to be timed
return n * 2
def example_function2(n):
time.sleep(n)  # simulate a slow operation to be timed
return n * 2
if __name__ == "__main__":
# call the decorated function
# result = example_function(2)
print(time_monitor(example_function2)(2))
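Because `time_monitor` returns a wrapper yielding an `(elapsed, result)` tuple, it can be applied inline without the `@` decorator syntax, as the `__main__` block does. A minimal self-contained sketch of that pattern:

```python
import time

def time_monitor(func):
    # wrap func so the call returns (elapsed_seconds, result)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        return time.perf_counter() - start, result
    return wrapper

# applied inline, no @ syntax needed
elapsed, doubled = time_monitor(lambda n: n * 2)(21)
print(doubled)  # 42
```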


@ -0,0 +1,57 @@
# config.ini
[Base]
# service port
server_port = 9423
# reference audio directory
reference_audio_dir = refer_audio
# temporary file directory
temp_dir = Ref_Audio_Selector/temp
[Log]
# directory for log files
log_dir = Ref_Audio_Selector/log/general
# log level: CRITICAL, FATAL, ERROR, WARNING, WARN, INFO, DEBUG, NOTSET
log_level = INFO
# where to emit function-timing logs: file = write to file; close = disabled
time_log_print_type = file
# directory for function-timing logs
time_log_print_dir = Ref_Audio_Selector/log/performance
[AudioSample]
# directory of candidate reference audio converted from a .list file
list_to_convert_reference_audio_dir = refer_audio_all
# audio similarity directory
audio_similarity_dir = similarity
# whether to pre-sample a baseline audio: true / false
enable_pre_sample = true
[Inference]
# default test text file
default_test_text_path = Ref_Audio_Selector/file/test_content/test_content.txt
# inference audio directory
inference_audio_dir = inference_audio
# subdirectory aggregating inference audio by text
inference_audio_text_aggregation_dir = text
# subdirectory aggregating inference audio by emotion
inference_audio_emotion_aggregation_dir = emotion
[ResultCheck]
# ASR output file name
asr_filename = asr
# text similarity output directory
text_similarity_output_dir = text_similarity
# report file: average text similarity per emotion
text_emotion_average_similarity_report_filename = average_similarity
# detail file: text similarity grouped by emotion
text_similarity_by_emotion_detail_filename = emotion_group_detail
# detail file: text similarity grouped by text
text_similarity_by_text_detail_filename = text_group_detail
[AudioConfig]
# default template file
default_template_path = Ref_Audio_Selector/file/config_template/ref_audio_template.txt
# reference audio config file name
reference_audio_config_filename = refer_audio
[Other]
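These values are consumed through Python's `configparser`; note that it returns strings, so numeric or boolean options must be converted by the caller. A small sketch with a fragment of the file above inlined:

```python
import configparser

# fragment of the config.ini above, inlined for the example
sample = """
[Base]
server_port = 9423
temp_dir = Ref_Audio_Selector/temp
"""
cfg = configparser.ConfigParser()
cfg.read_string(sample)
port = cfg.getint('Base', 'server_port')  # explicit int conversion
print(port, cfg.get('Base', 'temp_dir'))
```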


@ -0,0 +1,111 @@
import configparser
import os
import Ref_Audio_Selector.common.common as common
class ParamReadWriteManager:
def __init__(self):
self.base_dir = 'Ref_Audio_Selector/file/base_info'
os.makedirs(self.base_dir, exist_ok=True)
# basic info
self.work_dir = 'work_dir'
self.role = 'role'
# step 1
self.subsection_num = 'subsection_num'
self.sample_num = 'sample_num'
# step 2
self.api_set_model_base_url = 'api_set_model_base_url'
self.api_gpt_param = 'api_gpt_param'
self.api_sovits_param = 'api_sovits_param'
self.api_v2_set_gpt_model_base_url = 'api_v2_set_gpt_model_base_url'
self.api_v2_gpt_model_param = 'api_v2_gpt_model_param'
self.api_v2_set_sovits_model_base_url = 'api_v2_set_sovits_model_base_url'
self.api_v2_sovits_model_param = 'api_v2_sovits_model_param'
self.text_url = 'text_url'
self.text_param = 'text_param'
self.refer_type_param = 'refer_type_param'
self.ref_path_param = 'ref_path_param'
self.ref_text_param = 'ref_text_param'
self.emotion_param = 'emotion_param'
self.test_content_path = 'test_content_path'
self.request_concurrency_num = 'request_concurrency_num'
# step 3
self.text_similarity_amplification_boundary = 'text_similarity_amplification_boundary'
# step 4
# step 5
self.text_template = 'text_template'
def read(self, key):
file_path = os.path.join(self.base_dir, key + '.txt')
if os.path.exists(file_path):
content = common.read_file(file_path)
return content.strip()
else:
return ''
def write(self, key, content):
file_path = os.path.join(self.base_dir, key + '.txt')
# make sure the content is a string; convert it if necessary
if not isinstance(content, str):
clean_content = str(content).strip()  # convert to string and strip surrounding whitespace
else:
clean_content = content.strip()
common.write_text_to_file(clean_content, file_path)
class ConfigManager:
def __init__(self):
self.config_path = 'Ref_Audio_Selector/config.ini'
self.config = configparser.ConfigParser()
self.config.read(self.config_path, encoding='utf-8')
def get_base(self, key):
return self.config.get('Base', key)
def get_log(self, key):
return self.config.get('Log', key)
def get_audio_sample(self, key):
return self.config.get('AudioSample', key)
def get_inference(self, key):
return self.config.get('Inference', key)
def get_result_check(self, key):
return self.config.get('ResultCheck', key)
def get_audio_config(self, key):
return self.config.get('AudioConfig', key)
def get_other(self, key):
return self.config.get('Other', key)
def print(self):
# print every section and key of the loaded config
for section in self.config.sections():
print('[{}]'.format(section))
for key in self.config[section]:
print('{} = {}'.format(key, self.config[section][key]))
print()
_config = ConfigManager()
_param_read_write_manager = ParamReadWriteManager()
def get_config():
return _config
def get_rw_param():
return _param_read_write_manager
if __name__ == '__main__':
_config.print()
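`ParamReadWriteManager` persists each UI parameter as its own `<key>.txt` file under `base_dir`. A self-contained sketch of that key-per-file pattern (using a temporary directory in place of the real base_dir):

```python
import os
import tempfile

base_dir = tempfile.mkdtemp()  # stands in for 'Ref_Audio_Selector/file/base_info'

def write(key, content):
    # one file per key, mirroring ParamReadWriteManager.write
    with open(os.path.join(base_dir, key + '.txt'), 'w', encoding='utf-8') as f:
        f.write(str(content).strip())

def read(key):
    # mirroring ParamReadWriteManager.read: '' when the key was never written
    path = os.path.join(base_dir, key + '.txt')
    if not os.path.exists(path):
        return ''
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()

write('sample_num', 5)
print(read('sample_num'))         # 5
print(repr(read('missing_key')))  # ''
```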


@ -0,0 +1,58 @@
import Ref_Audio_Selector.config_param.config_manager as config_manager
config = config_manager.get_config()
# [Base]
# service port
server_port = int(config.get_base('server_port'))
# reference audio directory
reference_audio_dir = config.get_base('reference_audio_dir')
# temporary file directory
temp_dir = config.get_base('temp_dir')
# [Log]
# directory for log files
log_dir = config.get_log('log_dir')
# log level: CRITICAL, FATAL, ERROR, WARNING, WARN, INFO, DEBUG, NOTSET
log_level = config.get_log('log_level')
# where to emit function-timing logs: file = write to file; close = disabled
time_log_print_type = config.get_log('time_log_print_type')
# directory for function-timing logs
time_log_print_dir = config.get_log('time_log_print_dir')
# [AudioSample]
# directory of candidate reference audio converted from a .list file
list_to_convert_reference_audio_dir = config.get_audio_sample('list_to_convert_reference_audio_dir')
# audio similarity directory
audio_similarity_dir = config.get_audio_sample('audio_similarity_dir')
# whether to pre-sample a baseline audio: true / false (read as a string)
enable_pre_sample = config.get_audio_sample('enable_pre_sample')
# [Inference]
# default test text file
default_test_text_path = config.get_inference('default_test_text_path')
# inference audio directory
inference_audio_dir = config.get_inference('inference_audio_dir')
# subdirectory aggregating inference audio by text
inference_audio_text_aggregation_dir = config.get_inference('inference_audio_text_aggregation_dir')
# subdirectory aggregating inference audio by emotion
inference_audio_emotion_aggregation_dir = config.get_inference('inference_audio_emotion_aggregation_dir')
# [ResultCheck]
# ASR output file name
asr_filename = config.get_result_check('asr_filename')
# text similarity output directory
text_similarity_output_dir = config.get_result_check('text_similarity_output_dir')
# report file: average text similarity per emotion
text_emotion_average_similarity_report_filename = config.get_result_check('text_emotion_average_similarity_report_filename')
# detail file: text similarity grouped by emotion
text_similarity_by_emotion_detail_filename = config.get_result_check('text_similarity_by_emotion_detail_filename')
# detail file: text similarity grouped by text
text_similarity_by_text_detail_filename = config.get_result_check('text_similarity_by_text_detail_filename')
# [AudioConfig]
# default template file
default_template_path = config.get_audio_config('default_template_path')
# reference audio config file name
reference_audio_config_filename = config.get_audio_config('reference_audio_config_filename')


@ -0,0 +1,65 @@
import logging
import os
import datetime
import Ref_Audio_Selector.config_param.config_params as params
def create_general_logger():
# current date, used for the log file name
current_date = datetime.datetime.now().strftime('%Y-%m-%d')
# console handler
console_handler = logging.StreamHandler()
# console_handler.setLevel(logging.INFO)
# console output format
console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
console_handler.setFormatter(console_formatter)
console_handler.encoding = 'utf-8'  # log in UTF-8
os.makedirs(params.log_dir, exist_ok=True)
# file handler for the general log
general_handler = logging.FileHandler(f"{params.log_dir}/{current_date}.log", mode='a', encoding='utf-8')
# general_handler.setLevel(logging.INFO)
general_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
general_handler.setFormatter(general_formatter)
# configure the general logger
general_logger = logging.getLogger('general')
level = logging.getLevelName(params.log_level)
general_logger.setLevel(level)
general_logger.addHandler(console_handler)
general_logger.addHandler(general_handler)
# configure the root logger as a fallback
logging.basicConfig(level=logging.WARNING, handlers=[general_handler])
return general_logger
def create_performance_logger():
# current date, used for the log file name
current_date = datetime.datetime.now().strftime('%Y-%m-%d')
os.makedirs(params.time_log_print_dir, exist_ok=True)
# file handler dedicated to performance logs
performance_handler = logging.FileHandler(
f"{params.time_log_print_dir}/{current_date}.log", mode='a', encoding='utf-8')
# performance_handler.setLevel(logging.INFO)
performance_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
performance_handler.setFormatter(performance_formatter)
# configure a logger dedicated to performance monitoring
performance_logger = logging.getLogger('performance')
performance_logger.setLevel(logging.INFO)
performance_logger.addHandler(performance_handler)
return performance_logger
def setup_logging():
return create_general_logger(), create_performance_logger()
logger, p_logger = setup_logging()


@ -0,0 +1,5 @@
"${emotion}": {
"ref_wav_path": "${ref_path}",
"prompt_text": "${ref_text}",
"prompt_language": "中文"
}


@ -0,0 +1,4 @@
也是只有一次。”白蓉简单地回答,然后迅速转移话锋,搂住罗辑的脖子说,“算了,我不要那生日礼物了,你也回到正常的生活中来,好吗?”
云天明看到那是一条丑陋的虫子,软乎乎湿漉漉的,在她白皙的手指间蠕动着,旁边一个女生尖叫道:恶心死了,你碰它干吗?!程心把虫子轻轻放到旁边的草丛中,说,它在这里会给踩死的。
“那么多的星星,像雾似的。”云天明感叹道。程心把目光从银河收回,转头看着他,指着下面的校园和城市说:“你看下面也很漂亮啊,我们的生活是在这儿,可不是在那么远的银河里。”
“可我们的专业,不就是为了到地球之外去吗?”“那是为了这里的生活更好,可不是为了逃离地球啊。”云天明当然知道程心的话是委婉地指向他的孤僻和自闭,他也只有默然以对。

File diff suppressed because it is too large


@ -0,0 +1,5 @@
CHCP 65001
@echo off
cd ../
runtime\python.exe ./Ref_Audio_Selector/ref_audio_selector_webui.py
pause


@ -0,0 +1,120 @@
import argparse
import os
import traceback
import Ref_Audio_Selector.config_param.config_params as params
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import torch
from faster_whisper import WhisperModel
from tqdm import tqdm
from tools.asr.config import check_fw_local_models
from Ref_Audio_Selector.config_param.log_config import logger
language_code_list = [
"af", "am", "ar", "as", "az",
"ba", "be", "bg", "bn", "bo",
"br", "bs", "ca", "cs", "cy",
"da", "de", "el", "en", "es",
"et", "eu", "fa", "fi", "fo",
"fr", "gl", "gu", "ha", "haw",
"he", "hi", "hr", "ht", "hu",
"hy", "id", "is", "it", "ja",
"jw", "ka", "kk", "km", "kn",
"ko", "la", "lb", "ln", "lo",
"lt", "lv", "mg", "mi", "mk",
"ml", "mn", "mr", "ms", "mt",
"my", "ne", "nl", "nn", "no",
"oc", "pa", "pl", "ps", "pt",
"ro", "ru", "sa", "sd", "si",
"sk", "sl", "sn", "so", "sq",
"sr", "su", "sv", "sw", "ta",
"te", "tg", "th", "tk", "tl",
"tr", "tt", "uk", "ur", "uz",
"vi", "yi", "yo", "zh", "yue",
"auto"]
def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language, precision):
if '-local' in model_size:
model_size = model_size[:-6]
model_path = f'tools/asr/models/faster-whisper-{model_size}'
else:
model_path = model_size
if language == 'auto':
language = None  # let the model auto-detect the most likely language
logger.info(f"loading faster whisper model: {model_size} {model_path}")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
try:
model = WhisperModel(model_path, device=device, compute_type=precision)
except Exception:
return logger.error(traceback.format_exc())
output = []
# 递归遍历输入目录及所有子目录
for root, dirs, files in os.walk(input_folder):
for file_name in sorted(files):
# 只处理wav文件假设是wav文件
if file_name.endswith(".wav"):
try:
file_path = os.path.join(root, file_name)
original_text = os.path.basename(root)
segments, info = model.transcribe(
audio=file_path,
beam_size=5,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=700),
language=language)
text = ''
if info.language == "zh":
logger.info("检测为中文文本, 转 FunASR 处理")
if ("only_asr" not in globals()):
from Ref_Audio_Selector.tool.asr.funasr_asr_multi_level_dir import \
only_asr  # imported lazily; not needed (and no model download) for non-Chinese audio
text = only_asr(file_path)
if text == '':
for segment in segments:
text += segment.text
output.append(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
print(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
except Exception:
logger.error(traceback.format_exc())  # log the failure and continue with the next file
os.makedirs(output_folder, exist_ok=True)
output_file_path = os.path.abspath(f'{output_folder}/{params.asr_filename}.list')
with open(output_file_path, "w", encoding="utf-8") as f:
f.write("\n".join(output))
logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
return output_file_path
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_folder", type=str, required=True,
help="Path to the folder containing WAV files.")
parser.add_argument("-o", "--output_folder", type=str, required=True,
help="Output folder to store transcriptions.")
parser.add_argument("-s", "--model_size", type=str, default='large-v3',
choices=check_fw_local_models(),
help="Model Size of Faster Whisper")
parser.add_argument("-l", "--language", type=str, default='ja',
choices=language_code_list,
help="Language of the audio files.")
parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
help="fp16 or fp32")
cmd = parser.parse_args()
output_file_path = execute_asr_multi_level_dir(
input_folder=cmd.input_folder,
output_folder=cmd.output_folder,
model_size=cmd.model_size,
language=cmd.language,
precision=cmd.precision,
)
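Each line of the generated `.list` annotation file has four pipe-separated fields: audio path, folder-derived original text, detected language, and the ASR transcription. A quick parsing sketch (the sample line is invented):

```python
# hypothetical line in the style written by execute_asr_multi_level_dir
line = "refer_audio_all/happy/001.wav|happy|ZH|你好,世界"
audio_path, original_text, language, asr_text = line.split('|')
print(language)  # ZH
```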


@ -0,0 +1,94 @@
# -*- coding:utf-8 -*-
import argparse
import os
import traceback
import Ref_Audio_Selector.config_param.config_params as params
from Ref_Audio_Selector.config_param.log_config import logger
from Ref_Audio_Selector.common.time_util import timeit_decorator
from tqdm import tqdm
from funasr import AutoModel
path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
path_asr = path_asr if os.path.exists(
path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
model = AutoModel(
model=path_asr,
model_revision="v2.0.4",
vad_model=path_vad,
vad_model_revision="v2.0.4",
punc_model=path_punc,
punc_model_revision="v2.0.4",
)
def only_asr(input_file):
try:
text = model.generate(input=input_file)[0]["text"]
except Exception:
text = ''
logger.error(traceback.format_exc())
return text
@timeit_decorator
def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language):
output = []
# walk the input directory and all of its subdirectories
for root, dirs, files in os.walk(input_folder):
for name in sorted(files):
# only process .wav files
if name.endswith(".wav"):
try:
original_text = os.path.basename(root)
# build the full path to the input audio file
input_file_path = os.path.join(root, name)
input_file_path = os.path.normpath(input_file_path)  # normalize mixed path separators
asr_text = model.generate(input=input_file_path)[0]["text"]
output.append(f"{input_file_path}|{original_text}|{language.upper()}|{asr_text}")
except:
logger.error(traceback.format_exc())
# create the output directory if needed
output_dir_abs = os.path.abspath(output_folder)
os.makedirs(output_dir_abs, exist_ok=True)
# build the output file path
output_file_path = os.path.join(output_dir_abs, f'{params.asr_filename}.list')
# write the results to the file
with open(output_file_path, "w", encoding="utf-8") as f:
f.write("\n".join(output))
logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
return output_file_path
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_folder", type=str, required=True,
help="Path to the folder containing WAV files.")
parser.add_argument("-o", "--output_folder", type=str, required=True,
help="Output folder to store transcriptions.")
parser.add_argument("-s", "--model_size", type=str, default='large',
help="Model Size of FunASR is Large")
parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'],
help="Language of the audio files.")
parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
help="fp16 or fp32")  # not wired up yet
cmd = parser.parse_args()
execute_asr_multi_level_dir(
input_folder=cmd.input_folder,
output_folder=cmd.output_folder,
model_size=cmd.model_size,
language=cmd.language,
)


@ -0,0 +1,54 @@
import os
import shutil
import Ref_Audio_Selector.common.common as common
import Ref_Audio_Selector.config_param.config_params as params
from Ref_Audio_Selector.config_param.log_config import logger
def remove_matching_audio_files_in_text_dir(text_dir, emotions_list):
count = 0
emotions = [item['emotion'] for item in emotions_list]
for root, dirs, files in os.walk(text_dir):
for file in files:
if file.endswith(".wav"):
emotion_tag = os.path.basename(file)[:-4]
if emotion_tag not in emotions:
file_path = os.path.join(root, file)
logger.info(f"Deleting file: {file_path}")
try:
os.remove(file_path)
count += 1
except Exception as e:
logger.error(f"Error deleting file {file_path}: {e}")
return count
def delete_emotion_subdirectories(emotion_dir, emotions_list):
count = 0
emotions = [item['emotion'] for item in emotions_list]
for entry in os.listdir(emotion_dir):
entry_path = os.path.join(emotion_dir, entry)
if os.path.isdir(entry_path):
if entry not in emotions:
logger.info(f"Deleting directory: {entry_path}")
try:
# shutil.rmtree removes the whole subdirectory and its contents
shutil.rmtree(entry_path)
count += 1
except Exception as e:
logger.error(f"Error deleting directory {entry_path}: {e}")
return count
def sync_ref_audio(ref_audio_dir, inference_audio_dir):
ref_audio_manager = common.RefAudioListManager(ref_audio_dir)
ref_list = ref_audio_manager.get_ref_audio_list()
text_dir = os.path.join(inference_audio_dir, params.inference_audio_text_aggregation_dir)
emotion_dir = os.path.join(inference_audio_dir, params.inference_audio_emotion_aggregation_dir)
delete_text_wav_num = remove_matching_audio_files_in_text_dir(text_dir, ref_list)
delete_emotion_dir_num = delete_emotion_subdirectories(emotion_dir, ref_list)
return delete_text_wav_num, delete_emotion_dir_num


@ -0,0 +1,31 @@
import os
import platform
def generate_audio_config(work_space_dir, template_str, audio_list, output_file_path):
# accumulate the final file content in a string
file_content = ""
# iterate over the reference audio list
for audio_info in audio_list:
emotion = audio_info['emotion']
ref_path = audio_info['ref_path']
ref_text = audio_info['ref_text']
relative_path = os.path.relpath(ref_path, work_space_dir)
if platform.system() == 'Windows':
relative_path = relative_path.replace('\\', '/')
# substitute the template placeholders
formatted_line = template_str.replace('${emotion}', emotion).replace('${ref_path}', relative_path).replace(
'${ref_text}', ref_text)
# append the formatted entry, separated by a comma and newline
file_content += formatted_line + ",\n"
# drop the trailing comma and newline
file_content = file_content[:-2]
# write the content to the output file
with open(output_file_path, 'w', encoding='utf-8') as output_file:
output_file.write(file_content)
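`generate_audio_config` fills the `${emotion}`, `${ref_path}` and `${ref_text}` placeholders of `ref_audio_template.txt` with plain `str.replace`. A self-contained sketch with the template inlined (the audio_info values are invented):

```python
# the template from ref_audio_template.txt, inlined for the example
template_str = '''"${emotion}": {
"ref_wav_path": "${ref_path}",
"prompt_text": "${ref_text}",
"prompt_language": "中文"
}'''

audio_info = {'emotion': 'default-hello', 'ref_path': 'refer_audio/hello.wav', 'ref_text': 'hello'}
formatted = (template_str
             .replace('${emotion}', audio_info['emotion'])
             .replace('${ref_path}', audio_info['ref_path'])
             .replace('${ref_text}', audio_info['ref_text']))
print(formatted.splitlines()[0])  # "default-hello": {
```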


@ -0,0 +1,238 @@
import time
import os
import requests
import itertools
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import Ref_Audio_Selector.config_param.config_params as params
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, quote
from Ref_Audio_Selector.config_param.log_config import logger, p_logger
class SetModelURLComposer:
def __init__(self, type, base_url, gpt_param_name, sovits_param_name):
self.type = type
self.base_url = base_url
self.gpt_param_name = gpt_param_name
self.sovits_param_name = sovits_param_name
def is_valid(self):
if self.base_url is None or self.base_url == '':
raise Exception("请求地址不能为空")
if self.type in ['gpt', 'all']:
if self.gpt_param_name is None or self.gpt_param_name == '':
raise Exception("GPT参数名不能为空")
if self.type in ['sovits', 'all']:
if self.sovits_param_name is None or self.sovits_param_name == '':
raise Exception("Sovits参数名不能为空")
def build_get_url(self, value_array, need_url_encode=True):
params = {}
if self.type == 'gpt':
params[self.gpt_param_name] = value_array[0]
if self.type == 'sovits':
params[self.sovits_param_name] = value_array[0]
if self.type == 'all':
params[self.gpt_param_name] = value_array[0]
params[self.sovits_param_name] = value_array[1]
return append_params_to_url(self.base_url, params, need_url_encode)
def build_post_url(self, value_array, need_url_encode=True):
url = append_params_to_url(self.base_url, {}, need_url_encode)
params = {}
if self.type == 'gpt':
params[self.gpt_param_name] = value_array[0]
if self.type == 'sovits':
params[self.sovits_param_name] = value_array[0]
if self.type == 'all':
params[self.gpt_param_name] = value_array[0]
params[self.sovits_param_name] = value_array[1]
return url, params
class TTSURLComposer:
def __init__(self, base_url, refer_type_param, emotion_param_name, text_param_name, ref_path_param_name, ref_text_param_name):
self.base_url = base_url
# reference mode: 'character emotion' vs. 'reference audio'
self.refer_type_param = refer_type_param
self.emotion_param_name = emotion_param_name
self.text_param_name = text_param_name
self.ref_path_param_name = ref_path_param_name
self.ref_text_param_name = ref_text_param_name
def is_valid(self):
if self.base_url is None or self.base_url == '':
raise ValueError("请输入url")
if self.text_param_name is None or self.text_param_name == '':
raise ValueError("请输入text参数名")
if self.emotion_param_name is None and self.ref_path_param_name is None and self.ref_text_param_name is None:
raise ValueError("请输入至少一个参考or情绪的参数")
def is_emotion(self):
return self.refer_type_param == '角色情绪'
def build_url_with_emotion(self, text_value, emotion_value, need_url_encode=True):
params = {
self.text_param_name: text_value,
self.emotion_param_name: emotion_value,
}
return append_params_to_url(self.base_url, params, need_url_encode)
def build_url_with_ref(self, text_value, ref_path_value, ref_text_value, need_url_encode=True):
params = {
self.text_param_name: text_value,
self.ref_path_param_name: ref_path_value,
self.ref_text_param_name: ref_text_value,
}
return append_params_to_url(self.base_url, params, need_url_encode)
def append_params_to_url(url_with_params, params, need_url_encode):
if params:
query_params = '&'.join([f"{k}={v}" for k, v in params.items()])
url_with_params += '?' + query_params if '?' not in url_with_params else '&' + query_params
return url_with_params if not need_url_encode else safe_encode_query_params(url_with_params)
def safe_encode_query_params(original_url):
# parse the URL to extract the query string
parsed_url = urlparse(original_url)
query_params = parse_qs(parsed_url.query)
# parse_qs already decoded the values; take the first value for each key
encoded_params = {k: v[0] for k, v in query_params.items()}
# urlencode percent-encodes keys and values exactly once
# (quoting the values first, as the original code did, would double-encode them)
new_query_string = urlencode(encoded_params, doseq=False)
# rebuild the full URL
new_parsed_url = parsed_url._replace(query=new_query_string)
encoded_url = urlunparse(new_parsed_url)
logger.info(encoded_url)
return encoded_url
def generate_audio_files_parallel(url_composer, text_list, emotion_list, output_dir_path, num_processes=1):
# split emotion_list evenly into num_processes groups
emotion_groups = np.array_split(emotion_list, num_processes)
with ProcessPoolExecutor(max_workers=num_processes) as executor:
futures = [
executor.submit(generate_audio_files_for_emotion_group, url_composer, text_list, group, output_dir_path)
for group in emotion_groups]
for future in futures:
future.result()  # wait for every worker process to finish
def generate_audio_files_for_emotion_group(url_composer, text_list, emotion_list, output_dir_path):
start_time = time.perf_counter()  # high-resolution timer start
# Ensure the output directory exists
output_dir = os.path.abspath(output_dir_path)
os.makedirs(output_dir, exist_ok=True)
# Create subdirectories for text and emotion categories
text_subdir = os.path.join(output_dir, params.inference_audio_text_aggregation_dir)
os.makedirs(text_subdir, exist_ok=True)
emotion_subdir = os.path.join(output_dir, params.inference_audio_emotion_aggregation_dir)
os.makedirs(emotion_subdir, exist_ok=True)
all_count = len(text_list) * len(emotion_list)
has_generated_count = 0
all_text_count = sum(len(item) for item in text_list)
# Cartesian product of texts and emotions
cartesian_product = list(itertools.product(text_list, emotion_list))
for text, emotion in cartesian_product:
# Generate audio byte stream using the create_audio function
emotion_name = emotion['emotion']
text_subdir_text = os.path.join(text_subdir, text)
os.makedirs(text_subdir_text, exist_ok=True)
text_subdir_text_file_path = os.path.join(text_subdir_text, emotion_name + '.wav')
emotion_subdir_emotion = os.path.join(emotion_subdir, emotion_name)
os.makedirs(emotion_subdir_emotion, exist_ok=True)
emotion_subdir_emotion_file_path = os.path.join(emotion_subdir_emotion, text + '.wav')
# skip if both output files already exist
if os.path.exists(text_subdir_text_file_path) and os.path.exists(emotion_subdir_emotion_file_path):
has_generated_count += 1
logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}")
continue
if url_composer.is_emotion():
real_url = url_composer.build_url_with_emotion(text, emotion['emotion'], False)
else:
real_url = url_composer.build_url_with_ref(text, emotion['ref_path'], emotion['ref_text'], False)
audio_bytes = inference_audio_from_api(real_url)
# Write audio bytes to the respective files
with open(text_subdir_text_file_path, 'wb') as f:
f.write(audio_bytes)
with open(emotion_subdir_emotion_file_path, 'wb') as f:
f.write(audio_bytes)
has_generated_count += 1
logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}")
end_time = time.perf_counter()  # timer end
elapsed_time = end_time - start_time
# performance log entry
log_message = f"进程ID: {os.getpid()}, generate_audio_files_for_emotion_group 执行耗时: {elapsed_time:.6f} 秒;推理数量: {has_generated_count} 字符总数:{all_text_count};每秒推理字符数:{all_text_count*len(emotion_list) / elapsed_time:.3f}"
p_logger.info(log_message)
logger.info(log_message)
def inference_audio_from_api(url):
logger.info(f'inference_audio_from_api url: {url}')
# issue the GET request
response = requests.get(url, stream=True)
# 200 means success
if response.status_code == 200:
# return the raw audio byte stream
return response.content
else:
raise Exception(f"Failed to fetch audio from API. Server responded with status code {response.status_code}. Message: {response.json()}")
def start_api_set_model(set_model_url_composer, gpt_models, sovits_models):
url, post_body = set_model_url_composer.build_post_url([gpt_models, sovits_models], True)
logger.info(f'set_model_url_composer url: {set_model_url_composer}')
logger.info(f'start_api_set_model url: {url}')
logger.info(f'start_api_set_model post_body: {post_body}')
response = requests.post(url, json=post_body)
if response.status_code == 200:
result = response.text
return result
else:
return f'请求失败,状态码:{response.status_code}'
def start_api_v2_set_gpt_model(set_model_url_composer, gpt_models):
url = set_model_url_composer.build_get_url([gpt_models], False)
logger.info(f'start_api_v2_set_gpt_model url: {url}')
response = requests.get(url)
if response.status_code == 200:
result = response.text
return result
else:
return f'请求失败,状态码:{response.status_code}'
def start_api_v2_set_sovits_model(set_model_url_composer, sovits_models):
url = set_model_url_composer.build_get_url([sovits_models], False)
logger.info(f'start_api_v2_set_sovits_model url: {url}')
response = requests.get(url)
if response.status_code == 200:
result = response.text
return result
else:
return f'请求失败,状态码:{response.status_code}'
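The model-switch helpers above all follow the same pattern: compose a URL from a base address plus parameter names, issue the request, and surface a non-200 status as an error string. As a minimal sketch of the GET-style URL composition (the parameter names `text`, `refer_wav_path`, and `prompt_text` are the defaults used elsewhere in this tool and are assumptions here; adjust them to match your API service):

```python
from urllib.parse import urlencode

def build_inference_url(base_url: str, text: str, ref_path: str, ref_text: str) -> str:
    """Compose a GET inference URL; percent-encodes non-ASCII values."""
    query = urlencode({
        "text": text,
        "refer_wav_path": ref_path,
        "prompt_text": ref_text,
    })
    # Append with '&' if the base URL already carries a query string
    separator = "&" if "?" in base_url else "?"
    return f"{base_url}{separator}{query}"
```

`urlencode` handles the percent-encoding that a hand-built f-string URL would miss for Chinese text.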

View File

@ -0,0 +1,162 @@
import os
import shutil
import random
import librosa
from Ref_Audio_Selector.config_param.log_config import logger
def check_audio_duration(path, min_duration=3, max_duration=10):
try:
# 计算音频文件的时长(单位:秒);librosa 0.10 起关键字参数由 filename 改为 path
duration = librosa.get_duration(path=path)
# 判断时长是否在 min_duration 至 max_duration 之间
return min_duration <= duration <= max_duration
except Exception as e:
logger.error(f"无法打开或处理音频文件:{e}")
return None
def convert_from_list(list_file, output_dir):
# 创建输出目录,如果它不存在的话
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 解析.list文件并操作文件
with open(list_file, 'r', encoding='utf-8') as file:
lines = file.readlines()
for line in lines:
parts = line.strip().split('|')
if len(parts) != 4:
logger.error(f"Line format incorrect: {line}")
continue
audio_path, _, _, transcription = parts
# 构建新的文件名和路径
new_filename = transcription.strip() + '.wav'
# new_filename = new_filename.replace(' ', '_') # 移除空格
# new_filename = ''.join(e for e in new_filename if e.isalnum() or e in ['_', '.']) # 移除非法字符
new_path = os.path.join(output_dir, new_filename)
# 如果目标文件已存在,不要覆盖
if os.path.exists(new_path):
logger.info(f"File already exists: {new_path}")
continue
try:
# 检查音频文件是否存在
if not os.path.exists(audio_path):
logger.info(f"Audio file does not exist: {audio_path}")
continue
if check_audio_duration(audio_path):
# 复制音频文件到output目录并重命名
shutil.copy2(audio_path, new_path)
logger.info(f"File copied and renamed to: {new_path}")
else:
logger.info(f"File skipped due to duration: {audio_path}")
except Exception as e:
logger.error(f"An error occurred while processing: {audio_path}")
logger.error(e)
logger.info("Processing complete.")
def sample(output_audio_dir, similarity_list, subsection_num, sample_num):
# 按照相似度分值降序排序相似度列表
similarity_list.sort(key=lambda x: x['score'], reverse=True)
# 计算每段的长度(向上取整)
step = len(similarity_list) // subsection_num
if len(similarity_list) % subsection_num != 0:
step += 1
# 分段并随机采样
for i in range(subsection_num):
start = i * step
end = (i + 1) * step
end = min(end, len(similarity_list)) # 防止最后一段越界
# 创建子列表
subsection = similarity_list[start:end]
# 在子列表上随机打乱
random.shuffle(subsection)
# 从打乱后的子列表中抽取相应数量的个体
num = min(sample_num, len(subsection))
sampled_subsection = subsection[:num]
# 创建并进入子目录
subdir_name = f'emotion_{i + 1}'
subdir_path = os.path.join(output_audio_dir, subdir_name)
os.makedirs(subdir_path, exist_ok=True)
# 复制采样结果的音频到子目录
for item in sampled_subsection:
src_path = item['wav_path']
dst_path = os.path.join(subdir_path, os.path.basename(src_path))
shutil.copyfile(src_path, dst_path)
logger.info("Sampling completed.")
def parse_similarity_file(file_path):
"""
解析指定文本文件,将每行的得分与路径存入字典列表
参数:
file_path (str): 待解析的文本文件路径
返回:
list[dict]: 形如 {'score': float, 'wav_path': str} 的字典列表
"""
result_list = []
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
# 去除行尾换行符并按'|'分割
score, filepath = line.strip().split('|')
# 将浮点数字符串转换为浮点数类型
score = float(score)
# 将得分和路径作为元组添加到结果列表
result_list.append({
'score': score,
'wav_path': filepath
})
return result_list
def copy_and_move(output_audio_directory, similarity_scores):
# 确保新目录存在
if not os.path.exists(output_audio_directory):
os.makedirs(output_audio_directory)
# 遍历并复制文件
for item in similarity_scores:
# 构造新的文件名
base_name = os.path.splitext(os.path.basename(item['wav_path']))[0]  # 去掉扩展名
new_name = f"{item['score'] * 10000:04.0f}-{base_name}.wav"
# 新文件的完整路径
new_path = os.path.join(output_audio_directory, new_name)
# 复制文件到新目录
shutil.copyfile(item['wav_path'], new_path)
logger.info("已完成复制和重命名操作。")
if __name__ == '__main__':
similarity_list = parse_similarity_file("D:/tt/similarity/啊,除了伊甸和樱,竟然还有其他人会提起我?.txt")
sample('D:/tt/similarity/output', similarity_list, 10, 4)
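The `sample()` routine above implements a stratified pick: sort by score, split into `subsection_num` ceiling-sized chunks, shuffle each chunk, and keep up to `sample_num` items from it. A minimal self-contained sketch of that selection rule (returning the picks instead of copying files; the fixed seed is only for reproducibility and is not part of the original logic):

```python
import random

def stratified_sample(items, subsection_num, sample_num, seed=0):
    """Split a descending-sorted list into subsection_num chunks and
    draw up to sample_num random items from each chunk."""
    rng = random.Random(seed)
    step = -(-len(items) // subsection_num)  # ceiling division
    picks = []
    for i in range(subsection_num):
        chunk = items[i * step:(i + 1) * step]
        rng.shuffle(chunk)  # slicing already copied, safe to shuffle
        picks.append(chunk[:sample_num])
    return picks

# e.g. 10 scores split into 2 sections, 2 picks from each
scores = list(range(10, 0, -1))
groups = stratified_sample(scores, 2, 2)
```

Because the input is sorted descending, the first group always draws from the top half of scores and the last group from the bottom half.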

View File

@ -0,0 +1,142 @@
import argparse
import os
import torchaudio
import torchaudio.transforms as T
import platform
import Ref_Audio_Selector.config_param.config_params as params
import Ref_Audio_Selector.config_param.log_config as log_config
from Ref_Audio_Selector.common.time_util import timeit_decorator
from Ref_Audio_Selector.common.model_manager import speaker_verification_models as models
from modelscope.pipelines import pipeline
def init_model(model_type='speech_campplus_sv_zh-cn_16k-common'):
log_config.logger.info(f'人声识别模型类型:{model_type}')
return pipeline(
task=models[model_type]['task'],
model=models[model_type]['model'],
model_revision=models[model_type]['model_revision']
)
@timeit_decorator
def compare_audio_and_generate_report(reference_audio_path, comparison_dir_path, output_file_path, model_type):
sv_pipeline = init_model(model_type)
# Step 1: 获取比较音频目录下所有音频文件的路径
comparison_audio_paths = [os.path.join(comparison_dir_path, f) for f in os.listdir(comparison_dir_path) if
f.endswith('.wav')]
if platform.system() == 'Windows':
# 因为这个模型是基于16k音频数据训练的为了避免后续比较时每次都对参考音频进行重采样所以提前进行了采样
# windows不支持torchaudio.sox_effects.apply_effects_tensor所以改写了依赖文件中的重采样方法
# 改用torchaudio.transforms.Resample进行重采样如果在非windows环境下没有更改依赖包的采样方法的话
# 使用这段代码进行预采样会出现因为采样方法不同,而导致的模型相似度计算不准确的问题
# 当然如果在windows下使用了其他的采样方法也会出现不准确的问题
if params.enable_pre_sample == 'true':
reference_audio_16k = ensure_16k_wav(reference_audio_path)
else:
reference_audio_16k = reference_audio_path
else:
reference_audio_16k = reference_audio_path
# Step 2: 用参考音频依次比较音频目录下的每个音频,获取相似度分数及对应路径
all_count = len(comparison_audio_paths)
has_processed_count = 0
similarity_scores = []
for audio_path in comparison_audio_paths:
score = sv_pipeline([reference_audio_16k, audio_path])['score']
similarity_scores.append({
'score': score,
'path': audio_path
})
has_processed_count += 1
log_config.logger.info(f'进度:{has_processed_count}/{all_count}')
# Step 3: 根据相似度分数降序排列
similarity_scores.sort(key=lambda x: x['score'], reverse=True)
# Step 4: 将排序后的结果写入输出结果文件(支持中文);open(..., 'w') 在文件不存在时会自动创建
formatted_scores = [f'{item["score"]}|{item["path"]}' for item in similarity_scores]
with open(output_file_path, 'w', encoding='utf-8') as f:
# 使用'\n'将每个字符串分开,使其写入不同行
content = '\n'.join(formatted_scores)
f.write(content)
def ensure_16k_wav(audio_file_path, target_sample_rate=16000):
"""
输入一个音频文件地址,判断其采样率,必要时重采样到目标采样率,并返回可用的音频文件路径
参数:
audio_file_path (str): 音频文件路径
target_sample_rate (int, optional): 目标采样率,默认为 16000Hz
"""
# 读取音频文件并获取其采样率
waveform, sample_rate = torchaudio.load(audio_file_path)
# 判断是否需要重采样
if sample_rate == target_sample_rate:
return audio_file_path
else:
# 创建Resample实例
resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
# 应用重采样
resampled_waveform = resampler(waveform)
# 创建临时文件夹
os.makedirs(params.temp_dir, exist_ok=True)
# 设置临时文件名
temp_file_path = os.path.join(params.temp_dir, os.path.basename(audio_file_path))
# 保存重采样后的音频到指定文件
torchaudio.save(temp_file_path, resampled_waveform, target_sample_rate)
return temp_file_path
def parse_arguments():
parser = argparse.ArgumentParser(description="Audio processing script arguments")
# Reference audio path
parser.add_argument("-r", "--reference_audio", type=str, required=True,
help="Path to the reference WAV file.")
# Comparison directory path
parser.add_argument("-c", "--comparison_dir", type=str, required=True,
help="Path to the directory containing comparison WAV files.")
# Output file path
parser.add_argument("-o", "--output_file", type=str, required=True,
help="Path to the output file where results will be written.")
# Model Type
parser.add_argument("-m", "--model_type", type=str, required=True,
help="Speaker verification model type.")
return parser.parse_args()
if __name__ == '__main__':
cmd = parse_arguments()
compare_audio_and_generate_report(
reference_audio_path=cmd.reference_audio,
comparison_dir_path=cmd.comparison_dir,
output_file_path=cmd.output_file,
model_type=cmd.model_type,
)
# compare_audio_and_generate_report(
# reference_audio_path="D:/tt/渡鸦/refer_audio_all/也对,你的身份和我们不同吗?.wav",
# comparison_dir_path='D:/tt/渡鸦/refer_audio_all',
# output_file_path='D:/tt/渡鸦/test.txt',
# )
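Steps 3 and 5 of `compare_audio_and_generate_report` reduce to a descending sort plus `score|path` line rendering. A small sketch of just that formatting step, with no model involved (the record shape mirrors the `similarity_scores` dicts built above):

```python
def format_similarity_report(results):
    """Sort score/path records descending and render the
    'score|path' report lines, one record per line."""
    ordered = sorted(results, key=lambda x: x['score'], reverse=True)
    return '\n'.join(f"{item['score']}|{item['path']}" for item in ordered)

report = format_similarity_report([
    {'score': 0.71, 'path': 'a.wav'},
    {'score': 0.93, 'path': 'b.wav'},
])
```

The `|`-separated line format is what `parse_similarity_file` in the sampling module reads back in.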

View File

@ -0,0 +1,77 @@
import os
import Ref_Audio_Selector.common.common as common
import Ref_Audio_Selector.tool.audio_check as audio_check
from Ref_Audio_Selector.config_param.log_config import logger
def parse_text_similarity_result_txt(file_path):
"""
解析指定格式的txt文件每行格式f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}"
:param file_path: txt文件的路径
:return: 包含解析后数据的字典列表
"""
data_list = []
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
# 使用'|'作为分隔符分割每行数据
parts = line.strip().split('|')
if len(parts) == 3:
# 将分割后的字符串转换为浮点数、整数和字符串
try:
item = {
'average_similarity_score': float(parts[0]),
'count': int(parts[1]),
'emotion': parts[2]
}
data_list.append(item)
except ValueError as e:
# 如果转换失败,打印错误信息并跳过该行
logger.error(f"Error parsing line: {line.strip()} - {e}")
return data_list
def remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary):
"""
根据条件删除低相似度音频文件并返回删除数量
:param ref_audio_list: 包含音频路径和情感属性的列表
:param report_list: 包含相似度评分和情感属性的列表
:param audio_text_similarity_boundary: 相似度阈值
:return: 删除的文件数量
"""
deleted_count = 0
# 筛选出平均相似度低于阈值的报告
low_similarity_reports = [report for report in report_list if
report['average_similarity_score'] < audio_text_similarity_boundary]
# 遍历低相似度报告,查找并删除对应音频文件
for report in low_similarity_reports:
emotion = report['emotion']
# 查找ref_audio_list中相同情感的音频文件路径
matching_refs = [ref for ref in ref_audio_list if ref['emotion'] == emotion]
for match in matching_refs:
ref_path = match['ref_path']
# 检查文件是否存在,然后尝试删除
if os.path.exists(ref_path):
try:
os.remove(ref_path)
deleted_count += 1
logger.info(f"Deleted file: {ref_path}")
except Exception as e:
logger.error(f"Error deleting file {ref_path}: {e}")
else:
logger.error(f"File not found: {ref_path}")
return deleted_count
def delete_ref_audio_below_boundary(ref_audio_path, text_similarity_result_path, sync_inference_audio_dir,
audio_text_similarity_boundary):
ref_audio_list = common.RefAudioListManager(ref_audio_path).get_ref_audio_list()
report_list = parse_text_similarity_result_txt(text_similarity_result_path)
count = remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary)
audio_check.sync_ref_audio(ref_audio_path, sync_inference_audio_dir)
return count
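The deletion logic above hinges on one selection rule: an emotion group is dropped when its average similarity falls below the boundary. A sketch of just that filter, separated from the file-system side effects of `remove_low_similarity_files`:

```python
def low_similarity_emotions(report_list, boundary):
    """Return the emotions whose average similarity score falls
    below the boundary (the deletion candidates)."""
    return [r['emotion'] for r in report_list
            if r['average_similarity_score'] < boundary]

reports = [
    {'average_similarity_score': 0.95, 'count': 3, 'emotion': 'emotion_1'},
    {'average_similarity_score': 0.40, 'count': 2, 'emotion': 'emotion_2'},
]
```

Keeping the filter pure like this makes the boundary behavior easy to unit-test before any audio files are touched.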

View File

@ -0,0 +1,161 @@
import os
import argparse
from collections import defaultdict
from operator import itemgetter
from Ref_Audio_Selector.common.time_util import timeit_decorator
import Ref_Audio_Selector.tool.text_comparison.text_comparison as text_comparison
import Ref_Audio_Selector.config_param.config_params as params
import Ref_Audio_Selector.common.common as common
from Ref_Audio_Selector.config_param.log_config import logger
def parse_asr_file(file_path):
output = []
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
# 假设每行都是正确的格式,且 "|" 是固定分隔符
input_file_path, original_text, language, asr_text = line.strip().split('|')
emotion = common.get_filename_without_extension(input_file_path)
# 将解析出的数据构造成新的字典或元组等结构
parsed_data = {
'emotion': emotion,
'input_file_path': input_file_path,
'original_text': original_text,
'language': language,
'asr_text': asr_text,
'similarity_score': 0
}
output.append(parsed_data)
return output
@timeit_decorator
def calculate_similarity_and_append_to_list(input_list, boundary):
all_count = len(input_list)
has_been_processed_count = 0
for item in input_list:
original_score, similarity_score = text_comparison.calculate_result(item['original_text'], item['asr_text'], boundary)
item['similarity_score'] = similarity_score
item['original_score'] = original_score
has_been_processed_count += 1
logger.info(f'进度:{has_been_processed_count}/{all_count}')
return input_list
def calculate_average_similarity_by_emotion(data_list):
result_dict = defaultdict(list)
for item in data_list:
emotion = item['emotion']
similarity_score = item['similarity_score']
result_dict[emotion].append(similarity_score)
average_scores = [{'emotion': emotion, 'average_similarity_score': sum(scores) / len(scores), 'count': len(scores)}
for emotion, scores in result_dict.items()]
average_scores.sort(key=lambda x: x['average_similarity_score'], reverse=True)
return average_scores
def group_and_sort_by_field(data, group_by_field):
# 创建一个空的结果字典键是group_by_field指定的字段值是一个列表
result_dict = defaultdict(list)
# 遍历输入列表
for item in data:
# 根据指定的group_by_field将当前元素添加到对应键的列表中
key_to_group = item[group_by_field]
result_dict[key_to_group].append(item)
# 对每个键对应的列表中的元素按similarity_score降序排序
for key in result_dict:
result_dict[key].sort(key=itemgetter('similarity_score'), reverse=True)
# 将结果字典转换为列表每个元素是一个包含键emotion或original_text和排序后数组的元组
result_list = [(k, v) for k, v in result_dict.items()]
return result_list
def format_list_to_text(data_list, output_filename):
with open(output_filename, 'w', encoding='utf-8') as output_file:
output_file.write('放大后的相似度分值|原始分值|ASR文本|原文文本\n')
for key, items in data_list:
# 写入情绪标题
output_file.write(key + '\n')
# 写入每条记录
for item in items:
formatted_line = f"{item['similarity_score']}|{item['original_score']}|{item['asr_text']}|{item['original_text']}\n"
output_file.write(formatted_line)
def format_list_to_emotion(data_list, output_filename):
with open(output_filename, 'w', encoding='utf-8') as output_file:
output_file.write('放大后的相似度分值|原始分值|ASR文本|情绪类型\n')
for key, items in data_list:
# 写入情绪标题
output_file.write(key + '\n')
# 写入每条记录
for item in items:
formatted_line = f"{item['similarity_score']}|{item['original_score']}|{item['asr_text']}|{item['emotion']}\n"
output_file.write(formatted_line)
@timeit_decorator
def process(asr_file_path, output_dir, similarity_enlarge_boundary):
# 检查输出目录是否存在,如果不存在则创建
if not os.path.exists(output_dir):
os.makedirs(output_dir)
records = parse_asr_file(asr_file_path)
calculate_similarity_and_append_to_list(records, similarity_enlarge_boundary)
average_similarity_list = calculate_average_similarity_by_emotion(records)
average_similarity_file = os.path.join(output_dir,
f'{params.text_emotion_average_similarity_report_filename}.txt')
average_similarity_content = \
'\n'.join([f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}" for item in average_similarity_list])
common.write_text_to_file(average_similarity_content, average_similarity_file)
emotion_detail_list = group_and_sort_by_field(records, 'emotion')
emotion_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_emotion_detail_filename}.txt')
format_list_to_text(emotion_detail_list, emotion_detail_file)
original_text_detail_list = group_and_sort_by_field(records, 'original_text')
original_text_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_text_detail_filename}.txt')
format_list_to_emotion(original_text_detail_list, original_text_detail_file)
logger.info('文本相似度分析完成。')
def parse_arguments():
parser = argparse.ArgumentParser(description="Process ASR files and analyze similarity.")
parser.add_argument("-a", "--asr_file_path", type=str, required=True,
help="Path to the directory containing ASR files or path to a single ASR file.")
parser.add_argument("-o", "--output_dir", type=str, required=True,
help="Path to the directory where the analysis results should be saved.")
parser.add_argument("-b", "--similarity_enlarge_boundary", type=float, required=True,
help="Similarity score boundary value to be used in your calculations.")
args = parser.parse_args()
return args
if __name__ == '__main__':
cmd = parse_arguments()
# print(cmd)
process(cmd.asr_file_path, cmd.output_dir, cmd.similarity_enlarge_boundary)
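The per-emotion aggregation in `calculate_average_similarity_by_emotion` is a group-by followed by a mean and a descending sort. A compact, self-contained sketch of that aggregation (record shape mirrors the parsed ASR entries above):

```python
from collections import defaultdict

def average_by_emotion(records):
    """Group similarity_score by emotion, average each group,
    and sort groups by average score, highest first."""
    groups = defaultdict(list)
    for rec in records:
        groups[rec['emotion']].append(rec['similarity_score'])
    averages = [
        {'emotion': emo,
         'average_similarity_score': sum(s) / len(s),
         'count': len(s)}
        for emo, s in groups.items()
    ]
    averages.sort(key=lambda x: x['average_similarity_score'], reverse=True)
    return averages

rows = [
    {'emotion': 'happy', 'similarity_score': 0.8},
    {'emotion': 'happy', 'similarity_score': 0.6},
    {'emotion': 'sad', 'similarity_score': 0.9},
]
```

The `count` field matters downstream: a high average over a single sample is weaker evidence than the same average over many.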

View File

@ -0,0 +1,128 @@
import os
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from Ref_Audio_Selector.config_param.log_config import logger
bert_path = os.environ.get(
"bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
)
# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f'使用计算设备: {device}')
tokenizer = AutoTokenizer.from_pretrained(bert_path)
model = AutoModel.from_pretrained(bert_path).to(device)
def calculate_similarity(text1, text2, max_length=512):
# 预处理文本,设置最大长度
inputs1 = tokenizer(text1, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)
inputs2 = tokenizer(text2, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)
# 获取句子向量这里是取CLS token的向量并展平为一维
with torch.no_grad():
encoded_text1 = model(**inputs1)[0][:, 0, :].flatten()
encoded_text2 = model(**inputs2)[0][:, 0, :].flatten()
# 确保转换为numpy数组并且是一维的
similarity = 1 - cosine(encoded_text1.cpu().numpy().flatten(), encoded_text2.cpu().numpy().flatten())
return similarity
# 对boundary到1区间的值进行放大
def adjusted_similarity(similarity_score2, boundary=0.8):
if similarity_score2 < boundary:
return 0
# 倍数
multiple = 1 / (1 - boundary)
adjusted_score = (similarity_score2 - boundary) * multiple
return adjusted_score
def calculate_result(t1, t2, boundary):
# 计算并打印相似度
similarity_score2 = calculate_similarity(t1, t2)
# 调整相似度
adjusted_similarity_score2 = adjusted_similarity(similarity_score2, boundary)
return similarity_score2, adjusted_similarity_score2
def print_result(t1, t2, boundary):
print(f't2: {t2}')
# 计算并打印相似度
similarity_score2 = calculate_similarity(t1, t2)
print(f"两句话的相似度为: {similarity_score2:.4f}")
# 调整相似度
adjusted_similarity_score2 = adjusted_similarity(similarity_score2, boundary)
print(f"调整后的相似度为: {adjusted_similarity_score2:.4f}")
def test(boundary):
# 原始文本
text1 = "这是第一个句子"
list = """
这是第一个句子
这是第二个句子
那么这是第三个表达
当前呈现的是第四个句子
接下来我们有第五句话
在此展示第六条陈述
继续下去这是第七个短句
不容忽视的是第八个表述
顺延着序列这是第九句
此处列举的是第十个说法
进入新的篇章这是第十一个句子
下一段内容即为第十二个句子
显而易见这是第十三个叙述
渐进地我们来到第十四句话
向下滚动您会看到第十五个表达
此刻呈现在眼前的是第十六个句子
它们中的一个第十七个句子在此
如同链条般连接这是第十八个断言
按照顺序排列接下来是第十九个话语
逐一列举这是第二十个陈述句
结构相似本例给出第二十一个实例句
这是最初的陈述句
首先表达的是这一个句子
第一句内容即为此处所示
这是起始的叙述段落
开篇所展示的第一句话就是这个
明媚的阳光洒满大地
窗外飘落粉色樱花瓣
笔尖轻触纸面思绪万千
深夜的月光如水般静谧
穿越丛林的小径蜿蜒曲折
浅酌清茶品味人生百态
破晓时分雄鸡一唱天下白
草原上奔驰的骏马无拘无束
秋叶纷飞描绘季节更替画卷
寒冬雪夜炉火旁围坐共话家常
kszdRjYXw
pfsMgTlVHnB
uQaGxIbWz
ZtqNhPmKcOe
jfyrXsStVUo
wDiEgLkZbn
yhNvAfUmqC
TpKjxMrWgs
eBzHUaFJtYd
oQnXcVSiPkL
00000
"""
list2 = list.strip().split('\n')
for item in list2:
print_result(text1, item, boundary)
if __name__ == '__main__':
test(0.9)
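The amplification in `adjusted_similarity` above is a linear re-map: scores in `[boundary, 1]` stretch onto `[0, 1]`, and anything below the boundary collapses to 0. The same formula in isolation, for quick experimentation with different boundaries:

```python
def amplify(score, boundary=0.8):
    """Linearly re-map [boundary, 1] onto [0, 1];
    scores below the boundary collapse to 0."""
    if score < boundary:
        return 0.0
    # multiple = 1 / (1 - boundary), as in adjusted_similarity
    return (score - boundary) / (1.0 - boundary)
```

With the default boundary 0.8, a raw BERT similarity of 0.9 lands at 0.5 after amplification, which is why near-duplicate texts spread out across the adjusted scale instead of all clustering near 1.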

View File

View File

@ -0,0 +1,197 @@
import os
import multiprocessing
import Ref_Audio_Selector.config_param.config_params as params
import Ref_Audio_Selector.tool.audio_inference as audio_inference
import Ref_Audio_Selector.common.common as common
rw_param = params.config_manager.get_rw_param()
# -------------------基本信息---------------------------
# 角色所在工作目录
base_dir_default = None
# 工作目录
text_work_space_dir_default = None
# 角色名称
text_role_default = None
# 参考音频所在目录
text_refer_audio_file_dir_default = None
# 推理音频所在目录
text_inference_audio_file_dir_default = None
# -------------------第一步------------------------------
# 参考音频抽样目录
text_sample_dir_default = None
# 分段数
slider_subsection_num_default = None
# 每段随机抽样个数
slider_sample_num_default = None
# -------------------第二步------------------------------
# api服务模型切换接口地址
text_api_set_model_base_url_default = None
# GPT模型参数名
text_api_gpt_param_default = None
# SoVITS模型参数名
text_api_sovits_param_default = None
# api服务GPT模型切换接口地址
text_api_v2_set_gpt_model_base_url_default = None
# GPT模型参数名
text_api_v2_gpt_model_param_default = None
# api服务SoVITS模型切换接口地址
text_api_v2_set_sovits_model_base_url_default = None
# SoVITS模型参数名
text_api_v2_sovits_model_param_default = None
# 推理服务请求地址与参数
text_url_default = None
# 推理服务请求完整地址
text_whole_url_default = None
# 文本参数名
text_text_default = None
# 参考参数类型
dropdown_refer_type_param_default = None
# 参考音频路径参数名
text_ref_path_default = None
# 参考音频文本参数名
text_ref_text_default = None
# 角色情绪参数名
text_emotion_default = None
# 待推理文本路径
text_test_content_default = None
# 请求并发数
slider_request_concurrency_num_default = 3
# 最大并发数
slider_request_concurrency_max_num = None
# -------------------第三步------------------------------
# 待asr的音频所在目录
text_asr_audio_dir_default = None
# 待分析的文件路径
text_text_similarity_analysis_path_default = None
# 文本相似度放大边界
slider_text_similarity_amplification_boundary_default = 0.90
# 文本相似度分析结果文件所在路径
text_text_similarity_result_path_default = None
# -------------------第四步------------------------------
# -------------------第五步------------------------------
# 模板内容
text_template_default = None
def empty_default(value, default_value):
if value is None or value == "":
return default_value
else:
return value
def init_base():
global text_work_space_dir_default, text_role_default, base_dir_default, text_refer_audio_file_dir_default, text_inference_audio_file_dir_default
text_work_space_dir_default = rw_param.read(rw_param.work_dir)
text_role_default = rw_param.read(rw_param.role)
base_dir_default = os.path.join(text_work_space_dir_default, text_role_default)
text_refer_audio_file_dir_default = common.check_path_existence_and_return(
os.path.join(base_dir_default, params.reference_audio_dir))
text_inference_audio_file_dir_default = common.check_path_existence_and_return(
os.path.join(base_dir_default, params.inference_audio_dir))
def init_first():
global text_sample_dir_default, slider_subsection_num_default, slider_sample_num_default
text_sample_dir_default = common.check_path_existence_and_return(
os.path.join(base_dir_default, params.list_to_convert_reference_audio_dir))
slider_subsection_num_default = int(empty_default(rw_param.read(rw_param.subsection_num), 10))
slider_sample_num_default = int(empty_default(rw_param.read(rw_param.sample_num), 4))
def init_second():
global text_api_set_model_base_url_default, text_api_gpt_param_default, text_api_sovits_param_default, text_api_v2_set_gpt_model_base_url_default, text_api_v2_gpt_model_param_default
global text_api_v2_set_sovits_model_base_url_default, text_api_v2_sovits_model_param_default, text_url_default, text_whole_url_default, text_text_default, dropdown_refer_type_param_default, text_ref_path_default
global text_ref_text_default, text_emotion_default, text_test_content_default, slider_request_concurrency_num_default, slider_request_concurrency_max_num
text_api_set_model_base_url_default = empty_default(rw_param.read(rw_param.api_set_model_base_url),
'http://localhost:9880/set_model')
text_api_gpt_param_default = empty_default(rw_param.read(rw_param.api_gpt_param), 'gpt_model_path')
text_api_sovits_param_default = empty_default(rw_param.read(rw_param.api_sovits_param), 'sovits_model_path')
text_api_v2_set_gpt_model_base_url_default = empty_default(rw_param.read(rw_param.api_v2_set_gpt_model_base_url),
'http://localhost:9880/set_gpt_weights')
text_api_v2_gpt_model_param_default = empty_default(rw_param.read(rw_param.api_v2_gpt_model_param), 'weights_path')
text_api_v2_set_sovits_model_base_url_default = empty_default(
rw_param.read(rw_param.api_v2_set_sovits_model_base_url), 'http://localhost:9880/set_sovits_weights')
text_api_v2_sovits_model_param_default = empty_default(rw_param.read(rw_param.api_v2_sovits_model_param), 'weights_path')
text_url_default = empty_default(rw_param.read(rw_param.text_url),
'http://localhost:9880?prompt_language=中文&text_language=中文&cut_punc=,.;?!、,。?!;:…')
text_text_default = empty_default(rw_param.read(rw_param.text_param), 'text')
dropdown_refer_type_param_default = empty_default(rw_param.read(rw_param.refer_type_param), '参考音频')
text_ref_path_default = empty_default(rw_param.read(rw_param.ref_path_param), 'refer_wav_path')
text_ref_text_default = empty_default(rw_param.read(rw_param.ref_text_param), 'prompt_text')
text_emotion_default = empty_default(rw_param.read(rw_param.emotion_param), 'emotion')
text_whole_url_default = whole_url(text_url_default, dropdown_refer_type_param_default, text_text_default,
text_ref_path_default, text_ref_text_default, text_emotion_default)
text_test_content_default = empty_default(rw_param.read(rw_param.test_content_path), params.default_test_text_path)
slider_request_concurrency_max_num = multiprocessing.cpu_count()
slider_request_concurrency_num_default = empty_default(rw_param.read(rw_param.request_concurrency_num), 3)
slider_request_concurrency_num_default = min(int(slider_request_concurrency_num_default), slider_request_concurrency_max_num)
# 基于请求路径和参数,合成完整的请求路径
def whole_url(text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion):
url_composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text,
text_ref_path, text_ref_text)
if url_composer.is_emotion():
text_whole_url = url_composer.build_url_with_emotion('测试内容', '情绪类型', False)
else:
text_whole_url = url_composer.build_url_with_ref('测试内容', '参考路径', '参考文本', False)
return text_whole_url
def init_third():
global text_asr_audio_dir_default, text_text_similarity_analysis_path_default, slider_text_similarity_amplification_boundary_default, text_text_similarity_result_path_default
text_asr_audio_dir_default = common.check_path_existence_and_return(
os.path.join(base_dir_default, params.inference_audio_dir, params.inference_audio_text_aggregation_dir))
text_text_similarity_analysis_path_default = common.check_path_existence_and_return(
os.path.join(base_dir_default, params.asr_filename + '.list'))
slider_text_similarity_amplification_boundary_default = empty_default(
rw_param.read(rw_param.text_similarity_amplification_boundary), 0.90)
text_text_similarity_result_path_default = common.check_path_existence_and_return(
os.path.join(base_dir_default, params.text_emotion_average_similarity_report_filename + '.txt'))
def init_fourth():
pass
def init_fifth():
global text_template_default
default_template_path = params.default_template_path
text_template_default = empty_default(rw_param.read(rw_param.text_template),
common.read_file(default_template_path))
def init_all():
init_base()
init_first()
init_second()
init_third()
init_fourth()
init_fifth()

Binary file not shown.


View File

@ -1,23 +1,10 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
"colab_type": "text",
"id": "view-in-github"
},
"source": [
"<a href=\"https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@ -25,18 +12,20 @@
},
{
"cell_type": "markdown",
"source": [
"环境配置 environment"
],
"metadata": {
"id": "_o6a8GS2lWQM"
}
},
"source": [
"环境配置 environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "e9b7iFV3dm1f"
},
"outputs": [],
"source": [
"!pip install -q condacolab\n",
"# Setting up condacolab and installing packages\n",
@ -47,13 +36,17 @@
"!conda install -y -q -c pytorch -c nvidia cudatoolkit\n",
"%cd -q /content/GPT-SoVITS\n",
"!conda install -y -q -c conda-forge gcc gxx ffmpeg cmake -c pytorch -c nvidia\n",
"!/usr/local/bin/pip install -r extra-req.txt --no-deps\n",
"!/usr/local/bin/pip install -r requirements.txt"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "0NgxXg5sjv7z"
},
"outputs": [],
"source": [
"# @title Download pretrained models 下载预训练模型\n",
"!mkdir -p /content/GPT-SoVITS/GPT_SoVITS/pretrained_models\n",
@ -71,27 +64,35 @@
"!git clone https://huggingface.co/Delik/uvr5_weights\n",
"!git config core.sparseCheckout true\n",
"!mv /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/GPT-SoVITS/* /content/GPT-SoVITS/GPT_SoVITS/pretrained_models/"
],
"metadata": {
"id": "0NgxXg5sjv7z"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4oRGUzkrk8C7"
},
"outputs": [],
"source": [
"# @title launch WebUI 启动WebUI\n",
"!/usr/local/bin/pip install ipykernel\n",
"!sed -i '10s/False/True/' /content/GPT-SoVITS/config.py\n",
"%cd /content/GPT-SoVITS/\n",
"!/usr/local/bin/python webui.py"
],
"metadata": {
"id": "4oRGUzkrk8C7"
},
"execution_count": null,
"outputs": []
]
}
]
],
"metadata": {
"accelerator": "GPU",
"colab": {
"include_colab_link": true,
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@ -76,6 +76,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -101,9 +102,10 @@ conda install -c conda-forge 'ffmpeg<7'
下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下。
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语TTS)
安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境(仅限韩语 TTS)
##### MacOS 用户
```bash
brew install ffmpeg
```
@ -111,6 +113,7 @@ brew install ffmpeg
#### 安装依赖
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -147,14 +150,13 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型,并将其放置在 `GPT_SoVITS/pretrained_models` 目录中。
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。仅限中文TTS
2. 从 [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) 下载模型,解压并重命名为 `G2PWModel`,然后将其放置在 `GPT_SoVITS/text` 目录中。(仅限中文 TTS
3. 对于 UVR5人声/伴奏分离和混响移除,额外功能),从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型,并将其放置在 `tools/uvr5/uvr5_weights` 目录中。
- 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer``bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml` 也是一对。
- 如果你在 UVR5 中使用 `bs_roformer``mel_band_roformer`模型,你可以手动下载模型和相应的配置文件,并将它们放在 `tools/UVR5/UVR5_weights` 中。**重命名模型文件和配置文件,确保除后缀外**,模型和配置文件具有相同且对应的名称。此外,模型和配置文件名**必须包含“roformer”**,才能被识别为 roformer 类的模型。
- 建议在模型名称和配置文件名中**直接指定模型类型**,例如`mel_mand_roformer``bs_roformer`。如果未指定,将从配置文中比对特征,以确定它是哪种类型的模型。例如,模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对。`kim_mel_band_roformer.ckpt``kim_mel_band_roformer.yaml` 也是一对。
4. 对于中文 ASR额外功能从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型,并将它们放置在 `tools/asr/models` 目录中。
@ -184,12 +186,12 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
## 微调与推理
### 打开WebUI
### 打开 WebUI
#### 整合包用户
双击`go-webui.bat`或者使用`go-webui.ps1`
若想使用V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
#### 其他
@ -197,12 +199,13 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神。
python webui.py <language(optional)>
```
若想使用V1,则
若想使用 V1,则
```bash
python webui.py v1 <language(optional)>
```
或者在webUI内动态切换
或者在 webUI 内动态切换
### 微调
@ -215,25 +218,27 @@ python webui.py v1 <language(optional)>
5. 校对标注
6. 前往下一个窗口,点击训练
### 打开推理WebUI
### 打开推理 WebUI
#### 整合包用户
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理webUI
双击 `go-webui.bat` 或者使用 `go-webui.ps1` ,然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI
#### 其他
```bash
python GPT_SoVITS/inference_webui.py <language(optional)>
```
或者
```bash
python webui.py
```
然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI

## V2 发布说明
新特性:
@ -241,42 +246,41 @@ python webui.py
2. 更好的文本前端
3. 底模由 2k 小时扩展至 5k 小时
4. 对低音质参考音频(尤其是来源于网络的高频严重缺失、听着很闷的音频)合成出来音质更好
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从 v1 环境迁移至 v2
1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆 GitHub 上的最新代码
3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS\pretrained_models\gsv-v2final-pretrained 下
中文额外需要下载[G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
## V3 更新说明
新模型特点:
1. 音色相似度更像,需要更少训练集来逼近本人(不训练直接使用底模模式下音色相似性提升更大)
2. GPT 合成更稳定,重复漏字更少,也更容易跑出丰富情感
详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
从 v2 环境迁移至 v3
1. 需要 pip 安装 requirements.txt 更新环境
2. 需要克隆 GitHub 上的最新代码
3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder),将它们放到`GPT_SoVITS\pretrained_models`目录下

如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题,需要下载额外的模型参数,参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
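放置完成后,可以用类似下面的脚本粗略自检文件是否就位(此处用 `touch` 模拟已下载的文件,实际应为从 huggingface 下载的真实权重):

```shell
# 演示:自检 v3 新增预训练模型是否已放入 GPT_SoVITS/pretrained_models
mkdir -p GPT_SoVITS/pretrained_models
touch GPT_SoVITS/pretrained_models/s1v3.ckpt GPT_SoVITS/pretrained_models/s2Gv3.pth  # 模拟下载
for f in s1v3.ckpt s2Gv3.pth; do
  if [ -e "GPT_SoVITS/pretrained_models/$f" ]; then
    echo "found: $f"
  else
    echo "missing: $f"
  fi
done
```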
## 待办事项清单
@ -299,16 +303,21 @@ python webui.py
- [ ] 模型混合。
## (附加)命令行运行方式
使用命令行打开 UVR5 的 WebUI

```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- 如果打不开浏览器,请按照下面的格式进行 UVR 处理,这是使用 mdxnet 进行音频处理的方式
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
这是使用命令行完成数据集的音频切分的方式

```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
--output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
@ -316,17 +325,22 @@ python audio_slicer.py \
--min_length <minimum_duration_of_each_subclip> \
--min_interval <shortest_time_gap_between_adjacent_subclips> \
--hop_size <step_size_for_computing_volume_curve>
```

这是使用命令行完成数据集 ASR 处理的方式(仅限中文)

```
python tools/asr/funasr_asr.py -i <input> -o <output>
```

通过 Faster_Whisper 进行 ASR 处理(除中文之外的 ASR 标记)
没有进度条GPU 性能可能会导致时间延迟)
没有进度条GPU性能可能会导致时间延迟
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
启用自定义列表保存路径
## 致谢
@ -334,6 +348,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
特别感谢以下项目和贡献者:
### 理论研究
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -343,17 +358,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 预训练模型
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 推理用文本前端
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI 工具
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)
@ -20,17 +20,17 @@
## 機能:
1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます。
2. **Few-Shot TTS:** わずか 1 分間のトレーニングデータでモデルを微調整し、音声のクオリティを向上。
3. **多言語サポート:** 現在、英語、日本語、韓国語、広東語、中国語をサポートしています。
4. **WebUI ツール:** 統合されたツールは、音声と伴奏(BGM 等)の分離、トレーニングセットの自動セグメンテーション、ASR(中国語のみ)、テキストラベリング等を含むため、初心者の方でもトレーニングデータセットの作成や GPT/SoVITS モデルのトレーニング等を非常に簡単に行えます。
**[デモ動画](https://www.bilibili.com/video/BV12g4y1m7Uw)をチェック!**
声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
@ -43,7 +43,7 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
- Python 3.9, PyTorch 2.0.1, CUDA 11
- Python 3.10.13, PyTorch 2.1.2, CUDA 12.3
- Python 3.9, PyTorch 2.2.2, macOS 14.4.1 (Apple silicon)
- Python 3.9, PyTorch 2.2.2, CPU デバイス
_注記: numba==0.56.4 は py<3.11 が必要です_
@ -61,22 +61,22 @@ bash install.sh
### macOS
**注:Mac で GPU を使用して訓練されたモデルは、他のデバイスで訓練されたモデルと比較して著しく品質が低下するため、当面は CPU を使用して訓練することを強く推奨します。**
1. `xcode-select --install` を実行して、Xcode コマンドラインツールをインストールします。
2. `brew install ffmpeg` を実行して FFmpeg をインストールします。
3. 上記の手順を完了した後、以下のコマンドを実行してこのプロジェクトをインストールします。
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
### 手動インストール
#### FFmpeg をインストールします。
##### Conda ユーザー
@ -97,6 +97,7 @@ conda install -c conda-forge 'ffmpeg<7'
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます。
##### MacOS ユーザー
```bash
brew install ffmpeg
```
@ -104,6 +105,7 @@ brew install ffmpeg
#### 依存関係をインストールします
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -138,17 +140,17 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください。
2. [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください。(中国語 TTS のみ)
3. UVR5(ボーカル/伴奏(BGM 等)分離 & リバーブ除去の追加機能)の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください。
- UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます。**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**。さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**。これにより、roformer クラスのモデルとして認識されます。
- モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**。例:mel_band_roformer、bs_roformer。指定しない場合、設定ファイルから特徴を照合して、モデルの種類を特定します。例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです。同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです。
4. 中国語 ASR(追加機能)の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。
5. 英語または日本語の ASR(追加機能)を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください。また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります。
## データセット形式
@ -169,14 +171,15 @@ vocal_path|speaker_name|language|text
```
D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
```
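上記のアノテーション形式は、各行を `|` で 4 つのフィールドに分割して検証できます(ファイル名・パスは例示です):

```shell
# 例:.list の各行が「パス|話者名|言語|テキスト」の 4 フィールドであることを確認する
printf '%s\n' 'D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.' > demo.list
awk -F'|' 'NF != 4 { bad = 1 } END { exit bad }' demo.list && echo "format OK"
```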
## 微調整と推論
### WebUI を開く
#### 統合パッケージ利用者
`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します。
V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください。
#### その他
@ -184,12 +187,13 @@ V1に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
python webui.py <言語(オプション)>
```
V1 に切り替えたい場合は
```bash
python webui.py v1 <言語(オプション)>
```
または WebUI で手動でバージョンを切り替えてください。
### 微調整
@ -202,25 +206,27 @@ python webui.py v1 <言語(オプション)>
5. ASR転写を校正する
6. 次のタブに移動し、モデルを微調整する
### 推論 WebUI を開く
#### 統合パッケージ利用者
`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。
#### その他
```bash
python GPT_SoVITS/inference_webui.py <言語(オプション)>
```
または
```bash
python webui.py
```
その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます。

## V2 リリースノート
新機能:
@ -228,21 +234,21 @@ python webui.py
2. 最適化されたテキストフロントエンド
3. 事前学習済みモデルが 2 千時間から 5 千時間に拡張
4. 低品質の参照音声に対する合成品質の向上
[詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 環境から V2 を使用するには:
1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
2. 最新のコードを GitHub からクローン
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`に配置
中国語 V2 追加: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip)(G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
## V3 リリースノート
@ -250,19 +256,19 @@ V1環境からV2を使用するには:
1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました(音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます)。
2. GPT モデルがより安定し、繰り返しや省略が減少し、より豊かな感情表現を持つ音声の生成が容易になりました。
[詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 環境から v3 を使用する方法:
1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します。
2. GitHub から最新のコードをクローンします。
3. v3 の事前学習済みモデル(s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ)を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、`GPT_SoVITS\pretrained_models` フォルダに配置します。
追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください。
## Todo リスト
@ -285,15 +291,20 @@ v2 環境から v3 を使用する方法:
- [ ] モデルミックス
## (追加の) コマンドラインから実行する方法
コマンド ラインを使用して UVR5 の WebUI を開きます
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください。これはオーディオ処理に mdxnet を使用しています。
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです。
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
@ -303,16 +314,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> \
--hop_size <step_size_for_computing_volume_curve>
```
コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
(進行状況バーは表示されません。GPU のパフォーマンスにより時間遅延が発生する可能性があります)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
カスタムリストの保存パスが有効になっています
## クレジット
@ -320,6 +336,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
特に以下のプロジェクトと貢献者に感謝します:
### 理論研究
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -329,17 +346,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 事前学習モデル
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 推論用テキストフロントエンド
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI ツール
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)
@ -70,7 +70,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치 (Korean TTS 전용)
##### MacOS 사용자
```bash
brew install ffmpeg
```
@ -106,6 +107,7 @@ brew install ffmpeg
#### 의존성 설치
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -147,9 +149,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
- UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **“roformer”**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
- 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_band_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
@ -195,6 +197,7 @@ V1으로 전환하려면,
```bash
python webui.py v1 <언어(옵션)>
```
또는 WebUI에서 수동으로 버전을 전환하십시오.
### 미세 조정
@ -219,11 +222,13 @@ python webui.py v1 <언어(옵션)>
```bash
python GPT_SoVITS/inference_webui.py <언어(옵션)>
```
또는
```bash
python webui.py
```
그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
## V2 릴리스 노트
@ -238,7 +243,7 @@ python webui.py
4. 저품질 참조 오디오에 대한 합성 품질 향상
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 환경에서 V2를 사용하려면:
@ -248,7 +253,7 @@ V1 환경에서 V2를 사용하려면:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained`에 넣으십시오.
중국어 V2 추가: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
## V3 릴리스 노트
@ -258,7 +263,7 @@ V1 환경에서 V2를 사용하려면:
2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
[자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
v2 환경에서 v3 사용하기:
@ -268,8 +273,7 @@ v2 환경에서 v3 사용하기:
3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS\pretrained_models` 폴더에 넣습니다.
추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
## 할 일 목록
@ -293,15 +297,20 @@ v2 환경에서 v3 사용하기:
- [ ] 모델 블렌딩.
## (추가적인) 명령줄에서 실행하는 방법
명령줄을 사용하여 UVR5용 WebUI 열기
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
```
python audio_slicer.py \
--input_path "<path_to_original_audio_file_or_directory>" \
@ -311,16 +320,21 @@ python audio_slicer.py \
--min_interval <shortest_time_gap_between_adjacent_subclips> \
--hop_size <step_size_for_computing_volume_curve>
```
명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
```
python tools/asr/funasr_asr.py -i <input> -o <output>
```
ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
```
python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
```
사용자 정의 목록 저장 경로가 활성화되었습니다.
## 감사의 말
@ -328,6 +342,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
다음 프로젝트와 기여자들에게 특별히 감사드립니다:
### 이론 연구
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -337,17 +352,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### 사전 학습 모델
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### 추론용 텍스트 프론트엔드
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI 도구
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)
@ -72,7 +72,7 @@ bash install.sh
```bash
conda create -n GPTSoVits python=3.9
conda activate GPTSoVits
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -99,6 +99,7 @@ conda install -c conda-forge 'ffmpeg<7'
[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin.
##### MacOS Kullanıcıları
```bash
brew install ffmpeg
```
@ -106,6 +107,7 @@ brew install ffmpeg
#### Bağımlılıkları Yükleme
```bash
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
```
@ -142,9 +144,9 @@ docker run --rm -it --gpus=all --env=is_half=False --volume=G:\GPT-SoVITS-Docker
3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
- UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **“roformer”** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
- Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_band_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
@ -192,6 +194,7 @@ V1'e geçmek istiyorsanız,
```bash
python webui.py v1 <dil(isteğe bağlı)>
```
veya WebUI'de manuel olarak sürüm değiştirin.
### İnce Ayar
@ -216,11 +219,13 @@ veya WebUI'de manuel olarak sürüm değiştirin.
```bash
python GPT_SoVITS/inference_webui.py <dil(isteğe bağlı)>
```
VEYA
```bash
python webui.py
```
ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
## V2 Sürüm Notları
@ -235,7 +240,7 @@ Yeni Özellikler:
4. Düşük kaliteli referans sesler için geliştirilmiş sentez kalitesi
[detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
V1 ortamından V2'yi kullanmak için:
@ -245,7 +250,7 @@ V1 ortamından V2'yi kullanmak için:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS\pretrained_models\gsv-v2final-pretrained` dizinine yerleştirin.
Ek olarak Çince V2: [G2PWModel_1.1.zip](https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
## V3 Sürüm Notları
@ -255,7 +260,7 @@ V1 ortamından V2'yi kullanmak için:
2. GPT modeli daha **kararlı** hale geldi, tekrarlar ve atlamalar azaldı ve **daha zengin duygusal ifadeler** ile konuşma üretmek daha kolay hale geldi.
[daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
### v2 ortamında v3 kullanımı:
@ -265,7 +270,7 @@ V1 ortamından V2'yi kullanmak için:
3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS\pretrained_models` dizinine yerleştirin.
ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
## Yapılacaklar Listesi
@ -288,15 +293,20 @@ V1 ortamından V2'yi kullanmak için:
- [ ] model karışımı
## (Ekstra) Komut satırından çalıştırma yöntemi
UVR5 için Web Arayüzünü açmak için komut satırını kullanın
```
python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
```
<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
```
python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
``` -->
Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
```
python audio_slicer.py \
--input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
@ -306,16 +316,21 @@ python audio_slicer.py \
--min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı> \
--hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
```
Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
```
python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
```
ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
```
python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
```
Özel bir liste kaydetme yolu etkinleştirildi
## Katkı Verenler
@ -323,6 +338,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:
### Teorik Araştırma
- [ar-vits](https://github.com/innnky/ar-vits)
- [SoundStorm](https://github.com/yangdongchao/SoundStorm/tree/master/soundstorm/s1/AR)
- [vits](https://github.com/jaywalnut310/vits)
@ -332,17 +348,23 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
- [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
- [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)
### Pretrained Models
- [Chinese Speech Pretrain](https://github.com/TencentGameMate/chinese_speech_pretrain)
- [Chinese-Roberta-WWM-Ext-Large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large)
- [BigVGAN](https://github.com/NVIDIA/BigVGAN)
### Text Frontend for Inference
- [paddlespeech zh_normalization](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization)
- [split-lang](https://github.com/DoodleBears/split-lang)
- [g2pW](https://github.com/GitYCC/g2pW)
- [pypinyin-g2pW](https://github.com/mozillazg/pypinyin-g2pW)
- [paddlespeech g2pw](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/g2pw)
### WebUI Tools
- [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui)
- [audio-slicer](https://github.com/openvpi/audio-slicer)
- [SubFix](https://github.com/cronrpc/SubFix)

extra-req.txt Normal file

@@ -0,0 +1 @@
faster-whisper


@@ -27,7 +27,8 @@
"!git clone https://github.com/RVC-Boss/GPT-SoVITS.git\n",
"%cd GPT-SoVITS\n",
"!apt-get update && apt-get install -y --no-install-recommends tzdata ffmpeg libsox-dev parallel aria2 git git-lfs && git lfs install\n",
"!pip install -r requirements.txt"
"!pip install -r requirements.txt\n",
"!pip install -r extra-req.txt --no-deps"
]
},
{


@@ -1,15 +1,17 @@
#!/bin/bash
set -e
# 安装构建工具
# Install build tools
echo "Installing GCC..."
conda install -c conda-forge gcc=14
conda install -c conda-forge gcc=14 -y
echo "Installing G++..."
conda install -c conda-forge gxx
conda install -c conda-forge gxx -y
echo "Installing ffmpeg and cmake..."
conda install ffmpeg cmake
conda install ffmpeg cmake -y
# 设置编译环境
# Set up build environment
@@ -18,7 +20,7 @@ export CC="$CONDA_PREFIX/bin/gcc"
export CXX="$CONDA_PREFIX/bin/g++"
echo "Checking for CUDA installation..."
if command -v nvidia-smi &> /dev/null; then
if command -v nvidia-smi &>/dev/null; then
USE_CUDA=true
echo "CUDA found."
else
@@ -26,7 +28,6 @@ else
USE_CUDA=false
fi
if [ "$USE_CUDA" = false ]; then
echo "Checking for ROCm installation..."
if [ -d "/opt/rocm" ]; then
@@ -48,7 +49,7 @@ fi
if [ "$USE_CUDA" = true ]; then
echo "Installing PyTorch with CUDA support..."
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 pytorch-cuda=11.8 -c pytorch -c nvidia
elif [ "$USE_ROCM" = true ] ; then
elif [ "$USE_ROCM" = true ]; then
echo "Installing PyTorch with ROCm support..."
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/rocm6.2
else
@@ -56,21 +57,53 @@ else
conda install pytorch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 cpuonly -c pytorch
fi
echo "Installing Python dependencies from requirements.txt..."
# 刷新环境
# Refresh environment
hash -r
# pyopenjtalk Installation
conda install jq -y
OS_TYPE=$(uname)
PACKAGE_NAME="pyopenjtalk"
VERSION=$(curl -s https://pypi.org/pypi/$PACKAGE_NAME/json | jq -r .info.version)
wget "https://files.pythonhosted.org/packages/source/${PACKAGE_NAME:0:1}/$PACKAGE_NAME/$PACKAGE_NAME-$VERSION.tar.gz"
TAR_FILE=$(ls ${PACKAGE_NAME}-*.tar.gz)
DIR_NAME="${TAR_FILE%.tar.gz}"
tar -xzf "$TAR_FILE"
rm "$TAR_FILE"
CMAKE_FILE="$DIR_NAME/lib/open_jtalk/src/CMakeLists.txt"
if [[ "$OS_TYPE" == "darwin"* ]]; then
sed -i '' -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
else
sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' "$CMAKE_FILE"
fi
tar -czf "$TAR_FILE" "$DIR_NAME"
pip install "$TAR_FILE"
rm -rf "$TAR_FILE" "$DIR_NAME"
pip install -r extra-req.txt --no-deps
pip install -r requirements.txt
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ] ; then
if [ "$USE_ROCM" = true ] && [ "$IS_WSL" = true ]; then
echo "Update to WSL compatible runtime lib..."
location=`pip show torch | grep Location | awk -F ": " '{print $2}'`
cd ${location}/torch/lib/
location=$(pip show torch | grep Location | awk -F ": " '{print $2}')
cd "${location}"/torch/lib/ || exit
rm libhsa-runtime64.so*
cp /opt/rocm/lib/libhsa-runtime64.so.1.2 libhsa-runtime64.so
fi
echo "Installation completed successfully!"
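The pyopenjtalk steps in install.sh above rewrite the bundled open_jtalk `cmake_minimum_required` declaration to a version range so that newer CMake releases accept the build. The core of that patch can be sketched in isolation (the sample file below is hypothetical; note that BSD sed on macOS needs `sed -i ''` where GNU sed takes plain `sed -i`):
```
# Create a sample CMakeLists.txt with an outdated minimum-version declaration
printf 'cmake_minimum_required(VERSION 2.8)\nproject(demo)\n' > CMakeLists.sample.txt

# Rewrite the declaration to a version range, as install.sh does for open_jtalk
sed -i -E 's/cmake_minimum_required\(VERSION[^\)]*\)/cmake_minimum_required(VERSION 3.5...3.31)/' CMakeLists.sample.txt

# The file now declares the range while the rest is untouched
cat CMakeLists.sample.txt
```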


@@ -3,7 +3,7 @@ scipy
tensorboard
librosa==0.9.2
numba==0.56.4
pytorch-lightning
pytorch-lightning>2.0
gradio>=4.0,<=4.24.0
ffmpeg-python
onnxruntime; sys_platform == 'darwin'
@@ -26,7 +26,6 @@ jieba_fast
jieba
split-lang
fast_langdetect>=0.3.0
Faster_Whisper
wordsegment
rotary_embedding_torch
ToJyutping
@@ -38,4 +37,9 @@ python_mecab_ko; sys_platform != 'win32'
fastapi<0.112.2
x_transformers
torchmetrics<=1.5
attrdict
pydantic<=2.10.6
ctranslate2>=4.0,<5
huggingface_hub>=0.13
tokenizers>=0.13,<1
av>=11
tqdm