Merge 319a09fa2d54335ab4893190c860c77beed76a83 into 2d9193b0d3c0eae0c3a14d8c68a839f1bae157dc

2026-06-03 20:40:30 +08:00 · 2026-02-17 04:35:59 +00:00 · 2026-02-17 04:35:59 +00:00 · 60a25737d2
commit 60a25737d2
parent 2d9193b0d3 319a09fa2d
131 changed files with 10190 additions and 1911 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,6 +1,6 @@
 GPT_SoVITS/pretrained_models/*
-tools/asr/models/*
-tools/uvr5/uvr5_weights/*
+gsv_tools/asr/models/*
+gsv_tools/uvr5/uvr5_weights/*

 .git
 .DS_Store
@ -21,6 +21,8 @@ cfg.json
 speakers.json
 ref_audios

+pylock.toml
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 **/__pycache__/
--- a/.github/build_windows_packages.ps1
+++ b/.github/build_windows_packages.ps1
@ -92,13 +92,13 @@ Write-Host "[INFO] Download G2PWModel..."
 DownloadAndUnzip $G2PW_URL "GPT_SoVITS\text"

 Write-Host "[INFO] Download UVR5 model..."
-DownloadAndUnzip $UVR5_URL "tools\uvr5"
+DownloadAndUnzip $UVR5_URL "gsv_tools\uvr5"

 Write-Host "[INFO] Downloading funasr..."
 $funasrUrl = "https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/funasr.zip"
 $funasrZip = "$tmpDir\funasr.zip"
 Invoke-WebRequest -Uri $funasrUrl -OutFile $funasrZip
-Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\tools\asr\models" -Force
+Expand-Archive -Path $funasrZip -DestinationPath "$srcDir\gsv_tools\asr\models" -Force
 Remove-Item $funasrZip

 Write-Host "[INFO] Download ffmpeg..."
--- a/.gitignore
+++ b/.gitignore
@ -1,24 +1,3 @@
-.DS_Store
-.vscode
-__pycache__
-*.pyc
-env
-runtime
-.idea
-output
-logs
-SoVITS_weights*/
-GPT_weights*/
-TEMP
-weight.json
-ffmpeg*
-ffprobe*
-cfg.json
-speakers.json
-ref_audios
-tools/AP_BWE_main/24kto48k/*
-!tools/AP_BWE_main/24kto48k/readme.txt
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@ -193,3 +172,28 @@ cython_debug/

 # PyPI configuration file
 .pypirc
+
+# GPT-SoVITS
+
+.DS_Store
+.vscode
+__pycache__
+*.pyc
+pylock.toml
+env
+runtime
+.idea
+output
+logs
+SoVITS_weights*/
+GPT_weights*/
+TEMP
+weight.json
+ffmpeg*
+ffprobe*
+cfg.json
+speakers.json
+ref_audios
+gsv_tools/AP_BWE_main/24kto48k/*
+!gsv_tools/AP_BWE_main/24kto48k/readme.txt
+!gsv_tools/uvr5/lib
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,13 +3,18 @@ ci:

 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.15.1
  hooks:
    # Run the linter.
    - id: ruff
      types_or: [ python, pyi ]
-      args: [ --fix , "--exit-zero" ]
+      args: [ --fix , "--unsafe-fixes", "--exit-zero" ]
    # Run the formatter.
    - id: ruff-format
      types_or: [ python, pyi ]
-      args: [ --line-length, "120", --target-version, "py311" ]
+
+- repo: https://github.com/astral-sh/uv-pre-commit
+  # uv version.
+  rev: 0.10.3
+  hooks:
+    - id: uv-lock
--- a/Colab-Inference.ipynb
+++ b/Colab-Inference.ipynb
@ -53,7 +53,7 @@
        "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
        "    :\n",
        "else\n",
-        "    conda create -n GPTSoVITS python=3.10 -y\n",
+        "    conda create -n GPTSoVITS python=3.11 -y\n",
        "fi\n",
        "\n",
        "source activate GPTSoVITS\n",
--- a/Colab-WebUI.ipynb
+++ b/Colab-WebUI.ipynb
@ -52,7 +52,7 @@
        "if conda env list | awk '{print $1}' | grep -Fxq \"GPTSoVITS\"; then\n",
        "    :\n",
        "else\n",
-        "    conda create -n GPTSoVITS python=3.10 -y\n",
+        "    conda create -n GPTSoVITS python=3.11 -y\n",
        "fi\n",
        "\n",
        "source activate GPTSoVITS\n",
--- a/Docker/miniforge_install.sh
+++ b/Docker/miniforge_install.sh
@ -57,21 +57,20 @@ source "$HOME/.bashrc"

 "$HOME/conda/bin/conda" install python=3.12 -y

-"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y
+"$HOME/conda/bin/conda" install gcc=11 gxx ffmpeg uv cmake make unzip $SYSROOT_PKG "libstdcxx-ng>=11" -y
+
+cd workspace

 if [ "$CUDA_VERSION" = "12.8" ]; then
-    "$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu128
-    "$HOME/conda/bin/conda" install cuda-nvcc=12.8 -y
+    "$HOME/conda/bin/uv" pip install ".[cu128]" --no-cache-dir --python "$HOME/conda/bin/python"
 elif [ "$CUDA_VERSION" = "12.6" ]; then
-    "$HOME/conda/bin/pip" install torch torchcodec --no-cache-dir --index-url https://download.pytorch.org/whl/cu126
-    "$HOME/conda/bin/conda" install cuda-nvcc=12.6 -y
+    "$HOME/conda/bin/uv" pip install ".[cu126]" --no-cache-dir --python "$HOME/conda/bin/python"
 fi

 export PATH="$HOME/conda/bin:$PATH"

-"$HOME/conda/bin/pip" install psutil ninja packaging wheel "setuptools>=42" einops
-"$HOME/conda/bin/pip" install flash-attn -i https://xxxxrt666.github.io/PIP-Index/ --no-build-isolation
-"$HOME/conda/bin/pip" cache purge
+"$HOME/conda/bin/uv" pip install ".[flash-attn]" --python "$HOME/conda/bin/python"
+"$HOME/conda/bin/uv" cache clean

 rm $LOG_PATH

--- a/Dockerfile
+++ b/Dockerfile
@ -15,7 +15,7 @@ SHELL ["/bin/bash", "-c"]

 WORKDIR /workspace/GPT-SoVITS

-COPY Docker /workspace/GPT-SoVITS/Docker/
+COPY . /workspace/GPT-SoVITS

 ARG LITE=false
 ENV LITE=${LITE}
@ -26,12 +26,6 @@ ENV WORKFLOW=${WORKFLOW}
 ARG TARGETPLATFORM
 ENV TARGETPLATFORM=${TARGETPLATFORM}

-COPY extra-req.txt /workspace/GPT-SoVITS/
-
-COPY requirements.txt /workspace/GPT-SoVITS/
-
-COPY install.sh /workspace/GPT-SoVITS/
-
 RUN bash Docker/install_wrapper.sh

 EXPOSE 9871 9872 9873 9874 9880
@ -40,21 +34,13 @@ ENV PYTHONPATH="/workspace/GPT-SoVITS"

 RUN conda init bash && echo "conda activate base" >> ~/.bashrc

-WORKDIR /workspace
-
-RUN rm -rf /workspace/GPT-SoVITS
-
-WORKDIR /workspace/GPT-SoVITS
-
-COPY . /workspace/GPT-SoVITS
-
 CMD ["/bin/bash", "-c", "\
  rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
  rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
-  rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
-  rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
+  rm -rf /workspace/GPT-SoVITS/gsv_tools/asr/models && \
+  rm -rf /workspace/GPT-SoVITS/gsv_tools/uvr5/uvr5_weights && \
  ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
  ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
-  ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
-  ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
+  ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/gsv_tools/asr/models && \
+  ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/gsv_tools/uvr5/uvr5_weights && \
  exec bash"]
--- a/GPT_SoVITS/BigVGAN/README.md
+++ b/GPT_SoVITS/BigVGAN/README.md
@ -9,6 +9,7 @@
 <center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800"></center>

 ## News
+
 - **Sep 2024 (v2.4):**
  - We have updated the pretrained checkpoints trained for 5M steps. This is final release of the BigVGAN-v2 checkpoints.

@ -31,7 +32,7 @@
 The codebase has been tested on Python `3.10` and PyTorch `2.3.1` conda packages with either `pytorch-cuda=12.1` or `pytorch-cuda=11.8`. Below is an example command to create the conda environment:

 ```shell
-conda create -n bigvgan python=3.10 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
+conda create -n bigvgan python=3.11 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
 conda activate bigvgan
 ```

@ -186,17 +187,17 @@ If you see `[Fail] test CUDA fused vs. plain torch BigVGAN inference`, it means
 We provide the [pretrained models on Hugging Face Collections](https://huggingface.co/collections/nvidia/bigvgan-66959df3d97fd7d98d97dc9a).
 One can download the checkpoints of the generator weight (named `bigvgan_generator.pt`) and its discriminator/optimizer states (named `bigvgan_discriminator_optimizer.pt`) within the listed model repositories.

-| Model Name                                                                                               | Sampling Rate | Mel band | fmax  | Upsampling Ratio | Params | Dataset                    | Steps | Fine-Tuned |
-|:--------------------------------------------------------------------------------------------------------:|:-------------:|:--------:|:-----:|:----------------:|:------:|:--------------------------:|:-----:|:----------:|
-| [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x)             | 44 kHz        | 128      | 22050 | 512              | 122M   | Large-scale Compilation    | 5M    | No         |
-| [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x)             | 44 kHz        | 128      | 22050 | 256              | 112M   | Large-scale Compilation    | 5M    | No         |
-| [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x)             | 24 kHz        | 100      | 12000 | 256              | 112M   | Large-scale Compilation    | 5M    | No         |
-| [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x)               | 22 kHz        | 80       | 11025 | 256              | 112M   | Large-scale Compilation    | 5M    | No         |
-| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) | 22 kHz        | 80       | 8000  | 256              | 112M   | Large-scale Compilation    | 5M    | No         |
-| [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band)                             | 24 kHz        | 100      | 12000 | 256              | 112M   | LibriTTS                   | 5M    | No         |
-| [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band)                   | 24 kHz        | 100      | 12000 | 256              | 14M    | LibriTTS                   | 5M    | No         |
-| [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band)                               | 22 kHz        | 80       | 8000  | 256              | 112M   | LibriTTS + VCTK + LJSpeech | 5M    | No         |
-| [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band)                     | 22 kHz        | 80       | 8000  | 256              | 14M    | LibriTTS + VCTK + LJSpeech | 5M    | No         |
+|                                                Model Name                                                | Sampling Rate | Mel band | fmax  | Upsampling Ratio | Params |          Dataset           | Steps | Fine-Tuned |
+| :------------------------------------------------------------------------------------------------------: | :-----------: | :------: | :---: | :--------------: | :----: | :------------------------: | :---: | :--------: |
+|       [bigvgan_v2_44khz_128band_512x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_512x)       |    44 kHz     |   128    | 22050 |       512        |  122M  |  Large-scale Compilation   |  5M   |     No     |
+|       [bigvgan_v2_44khz_128band_256x](https://huggingface.co/nvidia/bigvgan_v2_44khz_128band_256x)       |    44 kHz     |   128    | 22050 |       256        |  112M  |  Large-scale Compilation   |  5M   |     No     |
+|       [bigvgan_v2_24khz_100band_256x](https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x)       |    24 kHz     |   100    | 12000 |       256        |  112M  |  Large-scale Compilation   |  5M   |     No     |
+|        [bigvgan_v2_22khz_80band_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_256x)        |    22 kHz     |    80    | 11025 |       256        |  112M  |  Large-scale Compilation   |  5M   |     No     |
+| [bigvgan_v2_22khz_80band_fmax8k_256x](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) |    22 kHz     |    80    | 8000  |       256        |  112M  |  Large-scale Compilation   |  5M   |     No     |
+|               [bigvgan_24khz_100band](https://huggingface.co/nvidia/bigvgan_24khz_100band)               |    24 kHz     |   100    | 12000 |       256        |  112M  |          LibriTTS          |  5M   |     No     |
+|          [bigvgan_base_24khz_100band](https://huggingface.co/nvidia/bigvgan_base_24khz_100band)          |    24 kHz     |   100    | 12000 |       256        |  14M   |          LibriTTS          |  5M   |     No     |
+|                [bigvgan_22khz_80band](https://huggingface.co/nvidia/bigvgan_22khz_80band)                |    22 kHz     |    80    | 8000  |       256        |  112M  | LibriTTS + VCTK + LJSpeech |  5M   |     No     |
+|           [bigvgan_base_22khz_80band](https://huggingface.co/nvidia/bigvgan_base_22khz_80band)           |    22 kHz     |    80    | 8000  |       256        |  14M   | LibriTTS + VCTK + LJSpeech |  5M   |     No     |

 The paper results are based on the original 24kHz BigVGAN models (`bigvgan_24khz_100band` and `bigvgan_base_24khz_100band`) trained on LibriTTS dataset.
 We also provide 22kHz BigVGAN models with band-limited setup (i.e., fmax=8000) for TTS applications.
@ -219,37 +220,37 @@ When training BigVGAN-v2 from scratch with small batch size, it can potentially

 Below are the objective results of the 24kHz model (`bigvgan_v2_24khz_100band_256x`) obtained from the LibriTTS `dev` sets. BigVGAN-v2 shows noticeable improvements of the metrics. The model also exhibits reduced perceptual artifacts, especially for non-speech audio.

-| Model      | Dataset                 | Steps | PESQ(↑)   | M-STFT(↓)  | MCD(↓)     | Periodicity(↓) | V/UV F1(↑) |
-|:----------:|:-----------------------:|:-----:|:---------:|:----------:|:----------:|:--------------:|:----------:|
-| BigVGAN    | LibriTTS                | 1M    | 4.027     | 0.7997     | 0.3745     | 0.1018         | 0.9598     |
-| BigVGAN    | LibriTTS                | 5M    | 4.256     | 0.7409     | 0.2988     | 0.0809         | 0.9698     |
-| BigVGAN-v2 | Large-scale Compilation | 3M    | 4.359     | 0.7134     | 0.3060     | 0.0621         | 0.9777     |
-| BigVGAN-v2 | Large-scale Compilation | 5M    | **4.362** | **0.7026** | **0.2903** | **0.0593**     | **0.9793** |
+|   Model    |         Dataset         | Steps |  PESQ(↑)  | M-STFT(↓)  |   MCD(↓)   | Periodicity(↓) | V/UV F1(↑) |
+| :--------: | :---------------------: | :---: | :-------: | :--------: | :--------: | :------------: | :--------: |
+|  BigVGAN   |        LibriTTS         |  1M   |   4.027   |   0.7997   |   0.3745   |     0.1018     |   0.9598   |
+|  BigVGAN   |        LibriTTS         |  5M   |   4.256   |   0.7409   |   0.2988   |     0.0809     |   0.9698   |
+| BigVGAN-v2 | Large-scale Compilation |  3M   |   4.359   |   0.7134   |   0.3060   |     0.0621     |   0.9777   |
+| BigVGAN-v2 | Large-scale Compilation |  5M   | **4.362** | **0.7026** | **0.2903** |   **0.0593**   | **0.9793** |

 ## Speed Benchmark

 Below are the speed and VRAM usage benchmark results of BigVGAN from `tests/test_cuda_vs_torch_model.py`, using `bigvgan_v2_24khz_100band_256x` as a reference model.

-| GPU                        | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
-|:--------------------------:|:-------------:|:---------------:|:-----------:|:----------------:|:---------:|
-| NVIDIA A100                | 256           | False           | 1672.1      | 69.7x            | 1.3       |
-|                            |               | True            | 3916.5      | 163.2x           | 1.3       |
-|                            | 2048          | False           | 1899.6      | 79.2x            | 1.7       |
-|                            |               | True            | 5330.1      | 222.1x           | 1.7       |
-|                            | 16384         | False           | 1973.8      | 82.2x            | 5.0       |
-|                            |               | True            | 5761.7      | 240.1x           | 4.4       |
-| NVIDIA GeForce RTX 3080    | 256           | False           | 841.1       | 35.0x            | 1.3       |
-|                            |               | True            | 1598.1      | 66.6x            | 1.3       |
-|                            | 2048          | False           | 929.9       | 38.7x            | 1.7       |
-|                            |               | True            | 1971.3      | 82.1x            | 1.6       |
-|                            | 16384         | False           | 943.4       | 39.3x            | 5.0       |
-|                            |               | True            | 2026.5      | 84.4x            | 3.9       |
-| NVIDIA GeForce RTX 2080 Ti | 256           | False           | 515.6       | 21.5x            | 1.3       |
-|                            |               | True            | 811.3       | 33.8x            | 1.3       |
-|                            | 2048          | False           | 576.5       | 24.0x            | 1.7       |
-|                            |               | True            | 1023.0      | 42.6x            | 1.5       |
-|                            | 16384         | False           | 589.4       | 24.6x            | 5.0       |
-|                            |               | True            | 1068.1      | 44.5x            | 3.2       |
+|            GPU             | num_mel_frame | use_cuda_kernel | Speed (kHz) | Real-time Factor | VRAM (GB) |
+| :------------------------: | :-----------: | :-------------: | :---------: | :--------------: | :-------: |
+|        NVIDIA A100         |      256      |      False      |   1672.1    |      69.7x       |    1.3    |
+|                            |               |      True       |   3916.5    |      163.2x      |    1.3    |
+|                            |     2048      |      False      |   1899.6    |      79.2x       |    1.7    |
+|                            |               |      True       |   5330.1    |      222.1x      |    1.7    |
+|                            |     16384     |      False      |   1973.8    |      82.2x       |    5.0    |
+|                            |               |      True       |   5761.7    |      240.1x      |    4.4    |
+|  NVIDIA GeForce RTX 3080   |      256      |      False      |    841.1    |      35.0x       |    1.3    |
+|                            |               |      True       |   1598.1    |      66.6x       |    1.3    |
+|                            |     2048      |      False      |    929.9    |      38.7x       |    1.7    |
+|                            |               |      True       |   1971.3    |      82.1x       |    1.6    |
+|                            |     16384     |      False      |    943.4    |      39.3x       |    5.0    |
+|                            |               |      True       |   2026.5    |      84.4x       |    3.9    |
+| NVIDIA GeForce RTX 2080 Ti |      256      |      False      |    515.6    |      21.5x       |    1.3    |
+|                            |               |      True       |    811.3    |      33.8x       |    1.3    |
+|                            |     2048      |      False      |    576.5    |      24.0x       |    1.7    |
+|                            |               |      True       |   1023.0    |      42.6x       |    1.5    |
+|                            |     16384     |      False      |    589.4    |      24.6x       |    5.0    |
+|                            |               |      True       |   1068.1    |      44.5x       |    3.2    |

 ## Acknowledgements

--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -10,10 +10,10 @@ from copy import deepcopy
 import torchaudio
 from tqdm import tqdm

+
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 import os
-from typing import List, Tuple, Union

 import ffmpeg
 import librosa
@ -25,23 +25,24 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from BigVGAN.bigvgan import BigVGAN
 from feature_extractor.cnhubert import CNHubert
 from module.mel_processing import mel_spectrogram_torch, spectrogram_torch
-from module.models import SynthesizerTrn, SynthesizerTrnV3, Generator
+from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
 from peft import LoraConfig, get_peft_model
 from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
+from sv import SV
 from transformers import AutoModelForMaskedLM, AutoTokenizer

-from tools.audio_sr import AP_BWE
-from tools.i18n.i18n import I18nAuto, scan_language_list
+from gsv_tools.audio_sr import AP_BWE
+from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
 from TTS_infer_pack.text_segmentation_method import splits
 from TTS_infer_pack.TextPreprocessor import TextPreprocessor
-from sv import SV
+

 resample_transform_dict = {}


 def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
-    key = "%s-%s-%s" % (sr0, sr1, str(device))
+    key = f"{sr0}-{sr1}-{str(device)}"
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
@ -64,33 +65,36 @@ def denorm_spec(x):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min


-mel_fn = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1024,
-        "win_size": 1024,
-        "hop_size": 256,
-        "num_mels": 100,
-        "sampling_rate": 24000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
+def mel_fn(x):
+    """Compute a mel spectrogram of ``x`` with ``mel_spectrogram_torch``.
+
+    The call uses a fixed configuration: ``n_fft`` and ``win_size`` of 1024,
+    ``hop_size`` of 256, ``num_mels`` of 100, ``sampling_rate`` of 24000,
+    ``fmin`` of 0, no ``fmax`` and ``center`` disabled.
+    """
+    mel_kwargs = {
+        "n_fft": 1024,
+        "win_size": 1024,
+        "hop_size": 256,
+        "num_mels": 100,
+        "sampling_rate": 24000,
+        "fmin": 0,
+        "fmax": None,
+        "center": False,
+    }
+    return mel_spectrogram_torch(x, **mel_kwargs)

-mel_fn_v4 = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1280,
-        "win_size": 1280,
-        "hop_size": 320,
-        "num_mels": 100,
-        "sampling_rate": 32000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
+
+def mel_fn_v4(x):
+    """Compute a mel spectrogram of ``x`` with ``mel_spectrogram_torch``.
+
+    The call uses a fixed configuration: ``n_fft`` and ``win_size`` of 1280,
+    ``hop_size`` of 320, ``num_mels`` of 100, ``sampling_rate`` of 32000,
+    ``fmin`` of 0, no ``fmax`` and ``center`` disabled.
+    """
+    mel_kwargs = {
+        "n_fft": 1280,
+        "win_size": 1280,
+        "hop_size": 320,
+        "num_mels": 100,
+        "sampling_rate": 32000,
+        "fmin": 0,
+        "fmax": None,
+        "center": False,
+    }
+    return mel_spectrogram_torch(x, **mel_kwargs)


 def speed_change(input_audio: np.ndarray, speed: float, sr: int):
@ -132,7 +136,7 @@ class DictToAttrRecursive(dict):
    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
-        super(DictToAttrRecursive, self).__setitem__(key, value)
+        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
@ -276,12 +280,12 @@ class TTS_Config:
    v2_languages: list = ["auto", "auto_yue", "en", "zh", "ja", "yue", "ko", "all_zh", "all_ja", "all_yue", "all_ko"]
    languages: list = v2_languages
    mute_tokens: dict = {
-        "v1" : 486,
-        "v2" : 486,
+        "v1": 486,
+        "v2": 486,
        "v2Pro": 486,
        "v2ProPlus": 486,
-        "v3" : 486,
-        "v4" : 486,
+        "v3": 486,
+        "v4": 486,
    }
    mute_emb_sim_matrix: torch.Tensor = None
    # "all_zh",#全部按中文识别
@ -296,7 +300,7 @@ class TTS_Config:
    # "auto",#多语种启动切分识别语种
    # "auto_yue",#多语种启动切分识别语种

-    def __init__(self, configs: Union[dict, str] = None):
+    def __init__(self, configs: dict | str = None):
        # 设置默认配置文件路径
        configs_base_path: str = "GPT_SoVITS/configs/"
        os.makedirs(configs_base_path, exist_ok=True)
@ -325,7 +329,7 @@ class TTS_Config:

        self.is_half = self.configs.get("is_half", False)
        if str(self.device) == "cpu" and self.is_half:
-            print(f"Warning: Half precision is not supported on CPU, set is_half to False.")
+            print("Warning: Half precision is not supported on CPU, set is_half to False.")
            self.is_half = False

        version = self.configs.get("version", None)
@ -369,7 +373,7 @@ class TTS_Config:
        else:
            print(i18n("路径不存在,使用默认配置"))
            self.save_configs(configs_path)
-        with open(configs_path, "r", encoding="utf-8") as f:
+        with open(configs_path, encoding="utf-8") as f:
            configs = yaml.load(f, Loader=yaml.FullLoader)

        return configs
@ -419,14 +423,14 @@ class TTS_Config:


 class TTS:
-    def __init__(self, configs: Union[dict, str, TTS_Config]):
+    def __init__(self, configs: dict | str | TTS_Config):
        if isinstance(configs, TTS_Config):
            self.configs = configs
        else:
            self.configs: TTS_Config = TTS_Config(configs)

        self.t2s_model: Text2SemanticLightningModule = None
-        self.vits_model: Union[SynthesizerTrn, SynthesizerTrnV3] = None
+        self.vits_model: SynthesizerTrn | SynthesizerTrnV3 = None
        self.bert_tokenizer: AutoTokenizer = None
        self.bert_model: AutoModelForMaskedLM = None
        self.cnhuhbert_model: CNHubert = None
@ -497,8 +501,8 @@ class TTS:
            self.init_sv_model()
        path_sovits = self.configs.default_configs[model_version]["vits_weights_path"]

-        if if_lora_v3 == True and os.path.exists(path_sovits) == False:
-            info = path_sovits + i18n("SoVITS %s 底模缺失，无法加载相应 LoRA 权重" % model_version)
+        if if_lora_v3 and not os.path.exists(path_sovits):
+            info = path_sovits + i18n(f"SoVITS {model_version} 底模缺失，无法加载相应 LoRA 权重")
            raise FileExistsError(info)

        # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
@ -558,7 +562,7 @@ class TTS:

        self.is_v2pro = model_version in {"v2Pro", "v2ProPlus"}

-        if if_lora_v3 == False:
+        if not if_lora_v3:
            print(
                f"Loading VITS weights from {weights_path}. {vits_model.load_state_dict(dict_s2['weight'], strict=False)}"
            )
@ -589,8 +593,6 @@ class TTS:

        self.configs.save_configs()

-
-
    def init_t2s_weights(self, weights_path: str):
        print(f"Loading Text2Semantic weights from {weights_path}")
        self.configs.t2s_weights_path = weights_path
@ -622,7 +624,7 @@ class TTS:
                self.empty_cache()

            self.vocoder = BigVGAN.from_pretrained(
-                "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
+                f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
                use_cuda_kernel=False,
            )  # if True, RuntimeError: Ninja is required to load C++ extensions
            # remove weight norm in the model and set to eval mode
@ -655,7 +657,7 @@ class TTS:
            )
            self.vocoder.remove_weight_norm()
            state_dict_g = torch.load(
-                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+                f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
                map_location="cpu",
                weights_only=False,
            )
@ -668,7 +670,7 @@ class TTS:
            self.vocoder_configs["overlapped_len"] = 12

        self.vocoder = self.vocoder.eval()
-        if self.configs.is_half == True:
+        if self.configs.is_half:
            self.vocoder = self.vocoder.half().to(self.configs.device)
        else:
            self.vocoder = self.vocoder.to(self.configs.device)
@ -798,7 +800,7 @@ class TTS:
        )
        if self.configs.is_half:
            spec = spec.half()
-        if self.is_v2pro == True:
+        if self.is_v2pro:
            audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
            if self.configs.is_half:
                audio = audio.half()
@ -832,7 +834,7 @@ class TTS:
            prompt_semantic = codes[0, 0].to(self.configs.device)
            self.prompt_cache["prompt_semantic"] = prompt_semantic

-    def batch_sequences(self, sequences: List[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
+    def batch_sequences(self, sequences: list[torch.Tensor], axis: int = 0, pad_value: int = 0, max_length: int = None):
        seq = sequences[0]
        ndim = seq.dim()
        if axis < 0:
@ -846,7 +848,7 @@ class TTS:
            max_length = max(seq_lengths) if max_length < max(seq_lengths) else max_length

        padded_sequences = []
-        for seq, length in zip(sequences, seq_lengths):
+        for seq, length in zip(sequences, seq_lengths, strict=False):
            padding = [0] * axis + [0, max_length - length] + [0] * (ndim - axis - 1)
            padded_seq = torch.nn.functional.pad(seq, padding, value=pad_value)
            padded_sequences.append(padded_seq)
@ -898,7 +900,7 @@ class TTS:
                    batch_index_list.append([])
                batch_index_list[-1].append(i)

-        for batch_idx, index_list in enumerate(batch_index_list):
+        for _batch_idx, index_list in enumerate(batch_index_list):
            item_list = [data[idx] for idx in index_list]
            phones_list = []
            phones_len_list = []
@ -1051,7 +1053,7 @@ class TTS:
        fragment_interval = inputs.get("fragment_interval", 0.3)
        seed = inputs.get("seed", -1)
        seed = -1 if seed in ["", None] else seed
-        actual_seed = set_seed(seed)
+        set_seed(seed)
        parallel_infer = inputs.get("parallel_infer", True)
        repetition_penalty = inputs.get("repetition_penalty", 1.35)
        sample_steps = inputs.get("sample_steps", 32)
@ -1060,7 +1062,7 @@ class TTS:
        overlap_length = inputs.get("overlap_length", 2)
        min_chunk_length = inputs.get("min_chunk_length", 16)
        fixed_length_chunk = inputs.get("fixed_length_chunk", False)
-        chunk_split_thershold = 0.0 # 该值代表语义token与mute token的余弦相似度阈值，若大于该阈值，则视为可切分点。
+        chunk_split_thershold = 0.0  # 该值代表语义token与mute token的余弦相似度阈值，若大于该阈值，则视为可切分点。

        if parallel_infer and not streaming_mode:
            print(i18n("并行推理模式已开启"))
@ -1093,7 +1095,6 @@ class TTS:
            print(i18n("分段返回模式/流式推理模式不支持分桶处理，已自动关闭分桶处理"))
            split_bucket = False

-
        if split_bucket and speed_factor == 1.0 and not (self.configs.use_vocoder and parallel_infer):
            print(i18n("分桶处理模式已开启"))
        elif speed_factor != 1.0:
@ -1171,7 +1172,7 @@ class TTS:
        if not (return_fragment or streaming_mode):
            data = self.text_preprocessor.preprocess(text, text_lang, text_split_method, self.configs.version)
            if len(data) == 0:
-                yield 16000, np.zeros(int(16000), dtype=np.int16)
+                yield 16000, np.zeros(16000, dtype=np.int16)
                return

            batch_index_list: list = None
@ -1237,9 +1238,9 @@ class TTS:
                    if item is None:
                        continue

-                batch_phones: List[torch.LongTensor] = item["phones"]
+                batch_phones: list[torch.LongTensor] = item["phones"]
                # batch_phones:torch.LongTensor = item["phones"]
-                batch_phones_len: torch.LongTensor = item["phones_len"]
+                item["phones_len"]
                all_phoneme_ids: torch.LongTensor = item["all_phones"]
                all_phoneme_lens: torch.LongTensor = item["all_phones_len"]
                all_bert_features: torch.LongTensor = item["all_bert_features"]
@ -1255,7 +1256,7 @@ class TTS:
                    )

                refer_audio_spec = []
-                
+
                sv_emb = [] if self.is_v2pro else None
                for spec, audio_tensor in self.prompt_cache["refer_spec"]:
                    spec = spec.to(dtype=self.precision, device=self.configs.device)
@ -1281,7 +1282,6 @@ class TTS:
                    t4 = time.perf_counter()
                    t_34 += t4 - t3

-
                    batch_audio_fragment = []

                    # ## vits并行推理 method 1
@ -1301,7 +1301,9 @@ class TTS:
                        if speed_factor == 1.0:
                            print(f"{i18n('并行合成中')}...")
                            # ## vits并行推理 method 2
-                            pred_semantic_list = [item[-idx:] for item, idx in zip(pred_semantic_list, idx_list)]
+                            pred_semantic_list = [
+                                item[-idx:] for item, idx in zip(pred_semantic_list, idx_list, strict=False)
+                            ]
                            upsample_rate = math.prod(self.vits_model.upsample_rates)
                            audio_frag_idx = [
                                pred_semantic_list[i].shape[0] * 2 * upsample_rate
@ -1314,8 +1316,8 @@ class TTS:
                            _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)

                            _batch_audio_fragment = self.vits_model.decode(
-                                    all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
-                                ).detach()[0, 0, :]
+                                all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
+                            ).detach()[0, 0, :]

                            audio_frag_end_idx.insert(0, 0)
                            batch_audio_fragment = [
@ -1330,14 +1332,18 @@ class TTS:
                                    pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                                )  # .unsqueeze(0)#mq要多unsqueeze一次
                                audio_fragment = self.vits_model.decode(
-                                        _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
-                                    ).detach()[0, 0, :]
+                                    _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
+                                ).detach()[0, 0, :]
                                batch_audio_fragment.append(audio_fragment)  ###试试重建不带上prompt部分
                    else:
                        if parallel_infer:
                            print(f"{i18n('并行合成中')}...")
                            audio_fragments = self.using_vocoder_synthesis_batched_infer(
-                                idx_list, pred_semantic_list, batch_phones, speed=speed_factor, sample_steps=sample_steps
+                                idx_list,
+                                pred_semantic_list,
+                                batch_phones,
+                                speed=speed_factor,
+                                sample_steps=sample_steps,
                            )
                            batch_audio_fragment.extend(audio_fragments)
                        else:
@ -1356,7 +1362,7 @@ class TTS:
                    #     item.to(dtype=self.precision, device=self.configs.device)
                    #     for item in self.prompt_cache["refer_spec"]
                    # ]
-                    semantic_token_generator =self.t2s_model.model.infer_panel(
+                    semantic_token_generator = self.t2s_model.model.infer_panel(
                        all_phoneme_ids[0].unsqueeze(0),
                        all_phoneme_lens,
                        prompt,
@ -1381,30 +1387,34 @@ class TTS:
                        # if speed_factor == 1.0:
                        #     upsample_rate = math.prod(self.vits_model.upsample_rates)*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1)
                        # else:
-                        upsample_rate = math.prod(self.vits_model.upsample_rates)*((2 if self.vits_model.semantic_frame_rate == "25hz" else 1)/speed_factor)
+                        upsample_rate = math.prod(self.vits_model.upsample_rates) * (
+                            (2 if self.vits_model.semantic_frame_rate == "25hz" else 1) / speed_factor
+                        )
                    else:
                        # if speed_factor == 1.0:
                        #     upsample_rate = self.vocoder_configs["upsample_rate"]*(3.875 if self.configs.version == "v3" else 4)
                        # else:
-                        upsample_rate = self.vocoder_configs["upsample_rate"]*((3.875 if self.configs.version == "v3" else 4)/speed_factor)
+                        upsample_rate = self.vocoder_configs["upsample_rate"] * (
+                            (3.875 if self.configs.version == "v3" else 4) / speed_factor
+                        )

                    last_audio_chunk = None
                    # last_tokens = None
                    last_latent = None
                    previous_tokens = []
                    overlap_len = overlap_length
-                    overlap_size = math.ceil(overlap_length*upsample_rate)
+                    overlap_size = math.ceil(overlap_length * upsample_rate)
                    for semantic_tokens, is_final in semantic_token_generator:
                        if semantic_tokens is None and last_audio_chunk is not None:
                            yield self.audio_postprocess(
-                                    [[last_audio_chunk[-overlap_size:]]],
-                                    output_sr,
-                                    None,
-                                    speed_factor,
-                                    False,
-                                    0.0,
-                                    super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
-                                )
+                                [[last_audio_chunk[-overlap_size:]]],
+                                output_sr,
+                                None,
+                                speed_factor,
+                                False,
+                                0.0,
+                                super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
+                            )
                            break

                        _semantic_tokens = semantic_tokens
@ -1415,11 +1425,10 @@ class TTS:
                        _semantic_tokens = torch.cat(previous_tokens, dim=-1)

                        if not is_first_chunk and semantic_tokens.shape[-1] < 10:
-                            overlap_len = overlap_length+(10-semantic_tokens.shape[-1])
+                            overlap_len = overlap_length + (10 - semantic_tokens.shape[-1])
                        else:
                            overlap_len = overlap_length

-
                        if not self.configs.use_vocoder:
                            token_padding_length = 0
                            # token_padding_length = int(phones.shape[-1]*2)-_semantic_tokens.shape[-1]
@ -1429,58 +1438,64 @@ class TTS:
                            #     token_padding_length = 0

                            audio_chunk, latent, latent_mask = self.vits_model.decode_streaming(
-                                                    _semantic_tokens.unsqueeze(0), 
-                                                    phones, refer_audio_spec, 
-                                                    speed=speed_factor,
-                                                    sv_emb=sv_emb,
-                                                    result_length=semantic_tokens.shape[-1]+overlap_len if not is_first_chunk else None,
-                                                    overlap_frames=last_latent[:,:,-overlap_len*(2 if self.vits_model.semantic_frame_rate == "25hz" else 1):] \
-                                                    if last_latent is not None else None,
-                                                    padding_length=token_padding_length
-                                                )
-                            audio_chunk=audio_chunk.detach()[0, 0, :]
+                                _semantic_tokens.unsqueeze(0),
+                                phones,
+                                refer_audio_spec,
+                                speed=speed_factor,
+                                sv_emb=sv_emb,
+                                result_length=semantic_tokens.shape[-1] + overlap_len if not is_first_chunk else None,
+                                overlap_frames=last_latent[
+                                    :, :, -overlap_len * (2 if self.vits_model.semantic_frame_rate == "25hz" else 1) :
+                                ]
+                                if last_latent is not None
+                                else None,
+                                padding_length=token_padding_length,
+                            )
+                            audio_chunk = audio_chunk.detach()[0, 0, :]
                        else:
                            raise RuntimeError(i18n("SoVits V3/4模型不支持流式推理模式"))
-                        
-                        if overlap_len>overlap_length:
-                            audio_chunk=audio_chunk[-int((overlap_length+semantic_tokens.shape[-1])*upsample_rate):]
+
+                        if overlap_len > overlap_length:
+                            audio_chunk = audio_chunk[
+                                -int((overlap_length + semantic_tokens.shape[-1]) * upsample_rate) :
+                            ]

                        audio_chunk_ = audio_chunk
                        if is_first_chunk and not is_final:
                            is_first_chunk = False
                            audio_chunk_ = audio_chunk_[:-overlap_size]
-                        elif is_first_chunk and is_final: 
+                        elif is_first_chunk and is_final:
                            is_first_chunk = False
                        elif not is_first_chunk and not is_final:
                            audio_chunk_ = self.sola_algorithm([last_audio_chunk, audio_chunk_], overlap_size)
                            audio_chunk_ = (
-                                audio_chunk_[last_audio_chunk.shape[0]-overlap_size:-overlap_size] if not is_final \
-                                    else audio_chunk_[last_audio_chunk.shape[0]-overlap_size:]
-                                    )
+                                audio_chunk_[last_audio_chunk.shape[0] - overlap_size : -overlap_size]
+                                if not is_final
+                                else audio_chunk_[last_audio_chunk.shape[0] - overlap_size :]
+                            )

                        last_latent = latent
                        last_audio_chunk = audio_chunk
                        yield self.audio_postprocess(
-                                [[audio_chunk_]],
-                                output_sr,
-                                None,
-                                speed_factor,
-                                False,
-                                0.0,
-                                super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
-                            )
-                        
-                        if is_first_package: 
-                            print(f"first_package_delay: {time.perf_counter()-t0:.3f}")
+                            [[audio_chunk_]],
+                            output_sr,
+                            None,
+                            speed_factor,
+                            False,
+                            0.0,
+                            super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
+                        )
+
+                        if is_first_package:
+                            print(f"first_package_delay: {time.perf_counter() - t0:.3f}")
                            is_first_package = False

-
-                    yield output_sr, np.zeros(int(output_sr*fragment_interval), dtype=np.int16)
+                    yield output_sr, np.zeros(int(output_sr * fragment_interval), dtype=np.int16)

                t5 = time.perf_counter()
                t_45 += t5 - t4
                if return_fragment:
-                    print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t4 - t3, t5 - t4))
+                    print(f"{t1 - t0:.3f}\t{t2 - t1:.3f}\t{t4 - t3:.3f}\t{t5 - t4:.3f}")
                    yield self.audio_postprocess(
                        [batch_audio_fragment],
                        output_sr,
@ -1490,7 +1505,8 @@ class TTS:
                        fragment_interval,
                        super_sampling if self.configs.use_vocoder and self.configs.version == "v3" else False,
                    )
-                elif streaming_mode:...
+                elif streaming_mode:
+                    ...
                else:
                    audio.append(batch_audio_fragment)

@ -1499,7 +1515,7 @@ class TTS:
                    return

            if not (return_fragment or streaming_mode):
-                print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t_34, t_45))
+                print(f"{t1 - t0:.3f}\t{t2 - t1:.3f}\t{t_34:.3f}\t{t_45:.3f}")
                if len(audio) == 0:
                    yield output_sr, np.zeros(int(output_sr), dtype=np.int16)
                    return
@ -1516,7 +1532,7 @@ class TTS:
        except Exception as e:
            traceback.print_exc()
            # 必须返回一个空音频, 否则会导致显存不释放。
-            yield 16000, np.zeros(int(16000), dtype=np.int16)
+            yield 16000, np.zeros(16000, dtype=np.int16)
            # 重置模型, 否则会导致显存释放不完全。
            del self.t2s_model
            del self.vits_model
@ -1540,15 +1556,15 @@ class TTS:

    def audio_postprocess(
        self,
-        audio: List[torch.Tensor],
+        audio: list[torch.Tensor],
        sr: int,
        batch_index_list: list = None,
        speed_factor: float = 1.0,
        split_bucket: bool = True,
        fragment_interval: float = 0.3,
        super_sampling: bool = False,
-    ) -> Tuple[int, np.ndarray]:
-        if fragment_interval>0:
+    ) -> tuple[int, np.ndarray]:
+        if fragment_interval > 0:
            zero_wav = torch.zeros(
                int(self.configs.sampling_rate * fragment_interval), dtype=self.precision, device=self.configs.device
            )
@ -1558,7 +1574,9 @@ class TTS:
                max_audio = torch.abs(audio_fragment).max()  # 简单防止16bit爆音
                if max_audio > 1:
                    audio_fragment /= max_audio
-                audio_fragment: torch.Tensor = torch.cat([audio_fragment, zero_wav], dim=0) if fragment_interval>0 else audio_fragment
+                audio_fragment: torch.Tensor = (
+                    torch.cat([audio_fragment, zero_wav], dim=0) if fragment_interval > 0 else audio_fragment
+                )
                audio[i][j] = audio_fragment

        if split_bucket:
@ -1589,7 +1607,6 @@ class TTS:

        audio = (audio * 32768).astype(np.int16)

-
        # try:
        #     if speed_factor != 1.0:
        #         audio = speed_change(audio, speed=speed_factor, sr=int(sr))
@ -1665,12 +1682,12 @@ class TTS:

    def using_vocoder_synthesis_batched_infer(
        self,
-        idx_list: List[int],
-        semantic_tokens_list: List[torch.Tensor],
-        batch_phones: List[torch.Tensor],
+        idx_list: list[int],
+        semantic_tokens_list: list[torch.Tensor],
+        batch_phones: list[torch.Tensor],
        speed: float = 1.0,
        sample_steps: int = 32,
-    ) -> List[torch.Tensor]:
+    ) -> list[torch.Tensor]:
        prompt_semantic_tokens = self.prompt_cache["prompt_semantic"].unsqueeze(0).unsqueeze(0).to(self.configs.device)
        prompt_phones = torch.LongTensor(self.prompt_cache["phones"]).unsqueeze(0).to(self.configs.device)
        raw_entry = self.prompt_cache["refer_spec"][0]
@ -1778,40 +1795,32 @@ class TTS:

        return audio_fragments

-    def sola_algorithm(
-        self,
-        audio_fragments: List[torch.Tensor],
-        overlap_len: int,
-        search_len:int= 320
-    ):
+    def sola_algorithm(self, audio_fragments: list[torch.Tensor], overlap_len: int, search_len: int = 320):
        # overlap_len-=search_len

        dtype = audio_fragments[0].dtype
-        
+
        for i in range(len(audio_fragments) - 1):
            f1 = audio_fragments[i].float()
            f2 = audio_fragments[i + 1].float()
            w1 = f1[-overlap_len:]
-            w2 = f2[:overlap_len+search_len]
+            w2 = f2[: overlap_len + search_len]
            # w2 = w2[-w2.shape[-1]//2:]
            # assert w1.shape == w2.shape
            corr_norm = F.conv1d(w2.view(1, 1, -1), w1.view(1, 1, -1)).view(-1)

-            corr_den = F.conv1d(w2.view(1, 1, -1)**2, torch.ones_like(w1).view(1, 1, -1)).view(-1)+ 1e-8
-            idx = (corr_norm/corr_den.sqrt()).argmax()
+            corr_den = F.conv1d(w2.view(1, 1, -1) ** 2, torch.ones_like(w1).view(1, 1, -1)).view(-1) + 1e-8
+            idx = (corr_norm / corr_den.sqrt()).argmax()

            print(f"seg_idx: {idx}")

            # idx = corr.argmax()
-            f1_ = f1[: -overlap_len]
+            f1_ = f1[:-overlap_len]
            audio_fragments[i] = f1_

            f2_ = f2[idx:]
            window = torch.hann_window((overlap_len) * 2, device=f1.device, dtype=f1.dtype)
-            f2_[: overlap_len] = (
-                window[: overlap_len] * f2_[: overlap_len]
-                + window[overlap_len :] * f1[-overlap_len :]
-            )
+            f2_[:overlap_len] = window[:overlap_len] * f2_[:overlap_len] + window[overlap_len:] * f1[-overlap_len:]

            # window = torch.sin(torch.arange((overlap_len - idx), device=f1.device) * np.pi / (overlap_len - idx))
            # f2_[: (overlap_len - idx)] = (
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@ -4,20 +4,21 @@ import threading

 from tqdm import tqdm

+
 now_dir = os.getcwd()
 sys.path.append(now_dir)

 import re
-import torch
-from text.LangSegmenter import LangSegmenter
-from text import chinese
-from typing import Dict, List, Tuple
-from text.cleaner import clean_text
-from text import cleaned_text_to_sequence
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method

-from tools.i18n.i18n import I18nAuto, scan_language_list
+import torch
+from text import cleaned_text_to_sequence
+from text.cleaner import clean_text
+from text.LangSegmenter import LangSegmenter
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
+from TTS_infer_pack.text_segmentation_method import get_method as get_seg_method, split_big_text, splits
+

 language = os.environ.get("language", "Auto")
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -56,7 +57,7 @@ class TextPreprocessor:
        self.device = device
        self.bert_lock = threading.RLock()

-    def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> List[Dict]:
+    def preprocess(self, text: str, lang: str, text_split_method: str, version: str = "v2") -> list[dict]:
        print(f"############ {i18n('切分文本')} ############")
        text = self.replace_consecutive_punctuation(text)
        texts = self.pre_seg_text(text, lang, text_split_method)
@ -98,7 +99,7 @@ class TextPreprocessor:
            # 解决输入目标文本的空行导致报错的问题
            if len(text.strip()) == 0:
                continue
-            if not re.sub("\W+", "", text):
+            if not re.sub(r"\W+", "", text):
                # 检测一下，如果是纯符号，就跳过。
                continue
            if text[-1] not in splits:
@ -116,30 +117,30 @@ class TextPreprocessor:

    def segment_and_extract_feature_for_text(
        self, text: str, language: str, version: str = "v1"
-    ) -> Tuple[list, torch.Tensor, str]:
+    ) -> tuple[list, torch.Tensor, str]:
        return self.get_phones_and_bert(text, language, version)

    def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
        with self.bert_lock:
-            text = re.sub(r' {2,}', ' ', text)
+            text = re.sub(r" {2,}", " ", text)
            textlist = []
            langlist = []
            if language == "all_zh":
-                for tmp in LangSegmenter.getTexts(text,"zh"):
+                for tmp in LangSegmenter.getTexts(text, "zh"):
                    langlist.append(tmp["lang"])
                    textlist.append(tmp["text"])
            elif language == "all_yue":
-                for tmp in LangSegmenter.getTexts(text,"zh"):
+                for tmp in LangSegmenter.getTexts(text, "zh"):
                    if tmp["lang"] == "zh":
                        tmp["lang"] = "yue"
                    langlist.append(tmp["lang"])
                    textlist.append(tmp["text"])
            elif language == "all_ja":
-                for tmp in LangSegmenter.getTexts(text,"ja"):
+                for tmp in LangSegmenter.getTexts(text, "ja"):
                    langlist.append(tmp["lang"])
                    textlist.append(tmp["text"])
            elif language == "all_ko":
-                for tmp in LangSegmenter.getTexts(text,"ko"):
+                for tmp in LangSegmenter.getTexts(text, "ko"):
                    langlist.append(tmp["lang"])
                    textlist.append(tmp["text"])
            elif language == "en":
@ -158,7 +159,9 @@ class TextPreprocessor:
            else:
                for tmp in LangSegmenter.getTexts(text):
                    if langlist:
-                        if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+                        if (tmp["lang"] == "en" and langlist[-1] == "en") or (
+                            tmp["lang"] != "en" and langlist[-1] != "en"
+                        ):
                            textlist[-1] += tmp["text"]
                            continue
                    if tmp["lang"] == "en":
@ -236,4 +239,4 @@ class TextPreprocessor:
        punctuations = "".join(re.escape(p) for p in punctuation)
        pattern = f"([{punctuations}])([{punctuations}])+"
        result = re.sub(pattern, r"\1", text)
-        return result
+        return result
--- a/GPT_SoVITS/inference_cli.py
+++ b/GPT_SoVITS/inference_cli.py
@ -1,9 +1,11 @@
 import argparse
 import os
+
 import soundfile as sf

-from tools.i18n.i18n import I18nAuto
 from GPT_SoVITS.inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav
+from gsv_tools.i18n.i18n import I18nAuto
+

 i18n = I18nAuto()

@ -19,11 +21,11 @@ def synthesize(
    output_path,
 ):
    # Read reference text
-    with open(ref_text_path, "r", encoding="utf-8") as file:
+    with open(ref_text_path, encoding="utf-8") as file:
        ref_text = file.read()

    # Read target text
-    with open(target_text_path, "r", encoding="utf-8") as file:
+    with open(target_text_path, encoding="utf-8") as file:
        target_text = file.read()

    # Change model weights
--- a/GPT_SoVITS/inference_gui.py
+++ b/GPT_SoVITS/inference_gui.py
@ -1,15 +1,29 @@
 import os
 import sys
-from PyQt5.QtCore import QEvent
-from PyQt5.QtWidgets import QApplication, QMainWindow, QLabel, QLineEdit, QPushButton, QTextEdit
-from PyQt5.QtWidgets import QGridLayout, QVBoxLayout, QWidget, QFileDialog, QStatusBar, QComboBox
-import soundfile as sf

-from tools.i18n.i18n import I18nAuto
+import soundfile as sf
+from PyQt5.QtCore import QEvent
+from PyQt5.QtWidgets import (
+    QApplication,
+    QComboBox,
+    QFileDialog,
+    QGridLayout,
+    QLabel,
+    QLineEdit,
+    QMainWindow,
+    QPushButton,
+    QStatusBar,
+    QTextEdit,
+    QVBoxLayout,
+    QWidget,
+)
+
+from gsv_tools.i18n.i18n import I18nAuto
+

 i18n = I18nAuto()

-from inference_webui import gpt_path, sovits_path, change_gpt_weights, change_sovits_weights, get_tts_wav
+from inference_webui import change_gpt_weights, change_sovits_weights, get_tts_wav, gpt_path, sovits_path


 class GPTSoVITSGUI(QMainWindow):
@ -24,11 +38,11 @@ class GPTSoVITSGUI(QMainWindow):

        self.setStyleSheet("""
            QWidget {
-                background-color: #a3d3b1; 
+                background-color: #a3d3b1;
            }

            QTabWidget::pane {
-                background-color: #a3d3b1;  
+                background-color: #a3d3b1;
            }

            QTabWidget::tab-bar {
@ -36,29 +50,29 @@ class GPTSoVITSGUI(QMainWindow):
            }

            QTabBar::tab {
-                background: #8da4bf; 
-                color: #ffffff;  
+                background: #8da4bf;
+                color: #ffffff;
                padding: 8px;
            }

            QTabBar::tab:selected {
-                background: #2a3f54; 
+                background: #2a3f54;
            }

            QLabel {
-                color: #000000;  
+                color: #000000;
            }

            QPushButton {
-                background-color: #4CAF50; 
-                color: white;  
+                background-color: #4CAF50;
+                color: white;
                padding: 8px;
                border: 1px solid #4CAF50;
                border-radius: 4px;
            }

            QPushButton:hover {
-                background-color: #45a049;  
+                background-color: #45a049;
                border: 1px solid #45a049;
                box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.1);
            }
@ -239,14 +253,14 @@ class GPTSoVITSGUI(QMainWindow):
    def upload_ref_text(self):
        file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
        if file_path:
-            with open(file_path, "r", encoding="utf-8") as file:
+            with open(file_path, encoding="utf-8") as file:
                content = file.read()
                self.ref_text_input.setText(content)

    def upload_target_text(self):
        file_path, _ = QFileDialog.getOpenFileName(self, "选择文本文件", "", "Text Files (*.txt)")
        if file_path:
-            with open(file_path, "r", encoding="utf-8") as file:
+            with open(file_path, encoding="utf-8") as file:
                content = file.read()
                self.target_text_input.setText(content)

--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -6,23 +6,27 @@
 全部按英文识别
 全部按日文识别
 """
-import psutil
+
 import os

+import psutil
+
+
 def set_high_priority():
    """把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
    if os.name != "nt":
-        return # 仅 Windows 有效
+        return  # 仅 Windows 有效
    p = psutil.Process(os.getpid())
    try:
        p.nice(psutil.HIGH_PRIORITY_CLASS)
        print("已将进程优先级设为 High")
    except psutil.AccessDenied:
        print("权限不足，无法修改优先级（请用管理员运行）")
+
+
 set_high_priority()
 import json
 import logging
-import os
 import re
 import sys
 import traceback
@ -32,6 +36,7 @@ import torch
 import torchaudio
 from text.LangSegmenter import LangSegmenter

+
 logging.getLogger("markdown_it").setLevel(logging.ERROR)
 logging.getLogger("urllib3").setLevel(logging.ERROR)
 logging.getLogger("httpcore").setLevel(logging.ERROR)
@ -46,9 +51,11 @@ version = model_version = os.environ.get("version", "v2")

 from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path

+
 SoVITS_names, GPT_names = get_weights_names()
 from config import pretrained_sovits_name

+
 path_sovits_v3 = pretrained_sovits_name["v3"]
 path_sovits_v4 = pretrained_sovits_name["v4"]
 is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@ -60,7 +67,7 @@ else:
    with open("./weight.json", "w", encoding="utf-8") as file:
        json.dump({"GPT": {}, "SoVITS": {}}, file)

-with open("./weight.json", "r", encoding="utf-8") as file:
+with open("./weight.json", encoding="utf-8") as file:
    weight_data = file.read()
    weight_data = json.loads(weight_data)
    gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
@ -96,6 +103,7 @@ import numpy as np
 from feature_extractor import cnhubert
 from transformers import AutoModelForMaskedLM, AutoTokenizer

+
 cnhubert.cnhubert_base_path = cnhubert_base_path

 import random
@ -123,8 +131,9 @@ from peft import LoraConfig, get_peft_model
 from text import cleaned_text_to_sequence
 from text.cleaner import clean_text

-from tools.assets import css, js, top_html
-from tools.i18n.i18n import I18nAuto, scan_language_list
+from gsv_tools.assets import css, js, top_html
+from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
+

 language = os.environ.get("language", "Auto")
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -162,7 +171,7 @@ dict_language = dict_language_v1 if version == "v1" else dict_language_v2

 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 bert_model = AutoModelForMaskedLM.from_pretrained(bert_path)
-if is_half == True:
+if is_half:
    bert_model = bert_model.half().to(device)
 else:
    bert_model = bert_model.to(device)
@ -202,7 +211,7 @@ class DictToAttrRecursive(dict):
    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
-        super(DictToAttrRecursive, self).__setitem__(key, value)
+        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
@ -213,7 +222,7 @@ class DictToAttrRecursive(dict):


 ssl_model = cnhubert.get_model()
-if is_half == True:
+if is_half:
    ssl_model = ssl_model.half().to(device)
 else:
    ssl_model = ssl_model.to(device)
@ -223,6 +232,7 @@ else:
 # symbol_version-model_version-if_lora_v3
 from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new

+
 v3v4set = {"v3", "v4"}


@ -234,8 +244,8 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
    print(sovits_path, version, model_version, if_lora_v3)
    is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
    path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
-    if if_lora_v3 == True and is_exist == False:
-        info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失，无法加载相应 LoRA 权重")
+    if if_lora_v3 and not is_exist:
+        info = path_sovits + f"SoVITS {model_version}" + i18n("底模缺失，无法加载相应 LoRA 权重")
        gr.Warning(info)
        raise FileExistsError(info)
    dict_language = dict_language_v1 if version == "v1" else dict_language_v2
@ -314,17 +324,17 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
            del vq_model.enc_q
        except:
            pass
-    if is_half == True:
+    if is_half:
        vq_model = vq_model.half().to(device)
    else:
        vq_model = vq_model.to(device)
    vq_model.eval()
-    if if_lora_v3 == False:
-        print("loading sovits_%s" % model_version, vq_model.load_state_dict(dict_s2["weight"], strict=False))
+    if not if_lora_v3:
+        print(f"loading sovits_{model_version}", vq_model.load_state_dict(dict_s2["weight"], strict=False))
    else:
        path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
        print(
-            "loading sovits_%spretrained_G" % model_version,
+            f"loading sovits_{model_version}pretrained_G",
            vq_model.load_state_dict(load_sovits_new(path_sovits)["weight"], strict=False),
        )
        lora_rank = dict_s2["lora_rank"]
@ -335,7 +345,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
            init_lora_weights=True,
        )
        vq_model.cfm = get_peft_model(vq_model.cfm, lora_config)
-        print("loading sovits_%s_lora%s" % (model_version, lora_rank))
+        print(f"loading sovits_{model_version}_lora{lora_rank}")
        vq_model.load_state_dict(dict_s2["weight"], strict=False)
        vq_model.cfm = vq_model.cfm.merge_and_unload()
        # torch.save(vq_model.state_dict(),"merge_win.pth")
@ -383,7 +393,7 @@ def change_gpt_weights(gpt_path):
    max_sec = config["data"]["max_sec"]
    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
    t2s_model.load_state_dict(dict_s1["weight"])
-    if is_half == True:
+    if is_half:
        t2s_model = t2s_model.half()
    t2s_model = t2s_model.to(device)
    t2s_model.eval()
@ -401,6 +411,7 @@ change_gpt_weights(gpt_path)
 os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 import torch

+
 now_dir = os.getcwd()


@ -442,7 +453,7 @@ def init_bigvgan():
    from BigVGAN import bigvgan

    bigvgan_model = bigvgan.BigVGAN.from_pretrained(
-        "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
+        f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
        use_cuda_kernel=False,
    )  # if True, RuntimeError: Ninja is required to load C++ extensions
    # remove weight norm in the model and set to eval mode
@ -450,7 +461,7 @@ def init_bigvgan():
    bigvgan_model = bigvgan_model.eval()
    clean_hifigan_model()
    clean_sv_cn_model()
-    if is_half == True:
+    if is_half:
        bigvgan_model = bigvgan_model.half().to(device)
    else:
        bigvgan_model = bigvgan_model.to(device)
@ -472,14 +483,14 @@ def init_hifigan():
    hifigan_model.eval()
    hifigan_model.remove_weight_norm()
    state_dict_g = torch.load(
-        "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+        f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
        map_location="cpu",
        weights_only=False,
    )
    print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
    clean_bigvgan_model()
    clean_sv_cn_model()
-    if is_half == True:
+    if is_half:
        hifigan_model = hifigan_model.half().to(device)
    else:
        hifigan_model = hifigan_model.to(device)
@ -508,7 +519,7 @@ resample_transform_dict = {}

 def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
-    key = "%s-%s-%s" % (sr0, sr1, str(device))
+    key = f"{sr0}-{sr1}-{str(device)}"
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
@ -544,7 +555,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
        center=False,
    )
    spec = spec.to(dtype)
-    if is_v2pro == True:
+    if is_v2pro:
        audio = resample(audio, sr1, 16000, device).to(dtype)
    return spec, audio

@ -556,7 +567,7 @@ def clean_text_inf(text, language, version):
    return phones, word2ph, norm_text


-dtype = torch.float16 if is_half == True else torch.float32
+dtype = torch.float16 if is_half else torch.float32


 def get_bert_inf(phones, word2ph, norm_text, language):
@ -566,7 +577,7 @@ def get_bert_inf(phones, word2ph, norm_text, language):
    else:
        bert = torch.zeros(
            (1024, len(phones)),
-            dtype=torch.float16 if is_half == True else torch.float32,
+            dtype=torch.float16 if is_half else torch.float32,
        ).to(device)

    return bert
@ -595,29 +606,26 @@ def get_first(text):
    return text


-from text import chinese
-
-
 def get_phones_and_bert(text, language, version, final=False):
-    text = re.sub(r' {2,}', ' ', text)
+    text = re.sub(r" {2,}", " ", text)
    textlist = []
    langlist = []
    if language == "all_zh":
-        for tmp in LangSegmenter.getTexts(text,"zh"):
+        for tmp in LangSegmenter.getTexts(text, "zh"):
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "all_yue":
-        for tmp in LangSegmenter.getTexts(text,"zh"):
+        for tmp in LangSegmenter.getTexts(text, "zh"):
            if tmp["lang"] == "zh":
                tmp["lang"] = "yue"
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "all_ja":
-        for tmp in LangSegmenter.getTexts(text,"ja"):
+        for tmp in LangSegmenter.getTexts(text, "ja"):
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "all_ko":
-        for tmp in LangSegmenter.getTexts(text,"ko"):
+        for tmp in LangSegmenter.getTexts(text, "ko"):
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "en":
@ -669,6 +677,7 @@ def get_phones_and_bert(text, language, version, final=False):

 from module.mel_processing import mel_spectrogram_torch, spectrogram_torch

+
 spec_min = -12
 spec_max = 2

@ -681,32 +690,36 @@ def denorm_spec(x):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min


-mel_fn = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1024,
-        "win_size": 1024,
-        "hop_size": 256,
-        "num_mels": 100,
-        "sampling_rate": 24000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
-mel_fn_v4 = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1280,
-        "win_size": 1280,
-        "hop_size": 320,
-        "num_mels": 100,
-        "sampling_rate": 32000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
+def mel_fn(x):
+    return mel_spectrogram_torch(
+        x,
+        **{
+            "n_fft": 1024,
+            "win_size": 1024,
+            "hop_size": 256,
+            "num_mels": 100,
+            "sampling_rate": 24000,
+            "fmin": 0,
+            "fmax": None,
+            "center": False,
+        },
+    )
+
+
+def mel_fn_v4(x):
+    return mel_spectrogram_torch(
+        x,
+        **{
+            "n_fft": 1280,
+            "win_size": 1280,
+            "hop_size": 320,
+            "num_mels": 100,
+            "sampling_rate": 32000,
+            "fmin": 0,
+            "fmax": None,
+            "center": False,
+        },
+    )


 def merge_short_text_in_array(texts, threshold):
@ -732,8 +745,8 @@ sr_model = None

 def audio_sr(audio, sr):
    global sr_model
-    if sr_model == None:
-        from tools.audio_sr import AP_BWE
+    if sr_model is None:
+        from gsv_tools.audio_sr import AP_BWE

        try:
            sr_model = AP_BWE(device, DictToAttrRecursive)
@ -801,10 +814,10 @@ def get_tts_wav(
    print(i18n("实际输入的目标文本:"), text)
    zero_wav = np.zeros(
        int(hps.data.sampling_rate * pause_second),
-        dtype=np.float16 if is_half == True else np.float32,
+        dtype=np.float16 if is_half else np.float32,
    )
    zero_wav_torch = torch.from_numpy(zero_wav)
-    if is_half == True:
+    if is_half:
        zero_wav_torch = zero_wav_torch.half().to(device)
    else:
        zero_wav_torch = zero_wav_torch.to(device)
@ -815,7 +828,7 @@ def get_tts_wav(
                gr.Warning(i18n("参考音频在3~10秒范围外，请更换！"))
                raise OSError(i18n("参考音频在3~10秒范围外，请更换！"))
            wav16k = torch.from_numpy(wav16k)
-            if is_half == True:
+            if is_half:
                wav16k = wav16k.half().to(device)
            else:
                wav16k = wav16k.to(device)
@ -871,7 +884,7 @@ def get_tts_wav(
        t2 = ttime()
        # cache_key="%s-%s-%s-%s-%s-%s-%s-%s"%(ref_wav_path,prompt_text,prompt_language,text,text_language,top_k,top_p,temperature)
        # print(cache.keys(),if_freeze)
-        if i_text in cache and if_freeze == True:
+        if i_text in cache and if_freeze:
            pred_semantic = cache[i_text]
        else:
            with torch.no_grad():
@ -896,7 +909,7 @@ def get_tts_wav(
            refers = []
            if is_v2pro:
                sv_emb = []
-                if sv_cn_model == None:
+                if sv_cn_model is None:
                    init_sv_cn()
            if inp_refs:
                for path in inp_refs:
@ -965,10 +978,10 @@ def get_tts_wav(
            cfm_res = torch.cat(cfm_resss, 2)
            cfm_res = denorm_spec(cfm_res)
            if model_version == "v3":
-                if bigvgan_model == None:
+                if bigvgan_model is None:
                    init_bigvgan()
            else:  # v4
-                if hifigan_model == None:
+                if hifigan_model is None:
                    init_hifigan()
            vocoder_model = bigvgan_model if model_version == "v3" else hifigan_model
            with torch.inference_mode():
@ -982,7 +995,7 @@ def get_tts_wav(
        t4 = ttime()
        t.extend([t2 - t1, t3 - t2, t4 - t3])
        t1 = ttime()
-    print("%.3f\t%.3f\t%.3f\t%.3f" % (t[0], sum(t[1::3]), sum(t[2::3]), sum(t[3::3])))
+    print(f"{t[0]:.3f}\t{sum(t[1::3]):.3f}\t{sum(t[2::3]):.3f}\t{sum(t[3::3]):.3f}")
    audio_opt = torch.cat(audio_opt, 0)  # np.concatenate
    if model_version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
        opt_sr = 32000
@ -990,7 +1003,7 @@ def get_tts_wav(
        opt_sr = 24000
    else:
        opt_sr = 48000  # v4
-    if if_sr == True and opt_sr == 24000:
+    if if_sr and opt_sr == 24000:
        print(i18n("音频超分中"))
        audio_opt, opt_sr = audio_sr(audio_opt.unsqueeze(0), opt_sr)
        max_audio = np.abs(audio_opt).max()
@ -1062,7 +1075,7 @@ def cut2(inp):

 def cut3(inp):
    inp = inp.strip("\n")
-    opts = ["%s" % item for item in inp.strip("。").split("。")]
+    opts = [f"{item}" for item in inp.strip("。").split("。")]
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)

@ -1077,7 +1090,7 @@ def cut4(inp):
 # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
 def cut5(inp):
    inp = inp.strip("\n")
-    punds = {",", ".", ";", "?", "!", "、", "，", "。", "？", "！", ";", "：", "…"}
+    punds = {",", ".", ";", "?", "!", "、", "，", "。", "？", "！", "：", "…"}
    mergeitems = []
    items = []

@ -1101,7 +1114,7 @@ def cut5(inp):

 def custom_sort_key(s):
    # 使用正则表达式提取字符串中的数字部分和非数字部分
-    parts = re.split("(\d+)", s)
+    parts = re.split(r"(\d+)", s)
    # 将数字部分转换为整数，非数字部分保持不变
    parts = [int(part) if part.isdigit() else part for part in parts]
    return parts
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@ -6,32 +6,37 @@
 全部按英文识别
 全部按日文识别
 """
-import psutil
+
 import os

+import psutil
+
+
 def set_high_priority():
    """把当前 Python 进程设为 HIGH_PRIORITY_CLASS"""
    if os.name != "nt":
-        return # 仅 Windows 有效
+        return  # 仅 Windows 有效
    p = psutil.Process(os.getpid())
    try:
        p.nice(psutil.HIGH_PRIORITY_CLASS)
        print("已将进程优先级设为 High")
    except psutil.AccessDenied:
        print("权限不足，无法修改优先级（请用管理员运行）")
+
+
 set_high_priority()
 import json
 import logging
-import os
 import random
 import re
 import sys

 import torch

+
 now_dir = os.getcwd()
 sys.path.append(now_dir)
-sys.path.append("%s/GPT_SoVITS" % (now_dir))
+sys.path.append(f"{now_dir}/GPT_SoVITS")

 logging.getLogger("markdown_it").setLevel(logging.ERROR)
 logging.getLogger("urllib3").setLevel(logging.ERROR)
@ -60,8 +65,9 @@ import gradio as gr
 from TTS_infer_pack.text_segmentation_method import get_method
 from TTS_infer_pack.TTS import NO_PROMPT_ERROR, TTS, TTS_Config

-from tools.assets import css, js, top_html
-from tools.i18n.i18n import I18nAuto, scan_language_list
+from gsv_tools.assets import css, js, top_html
+from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
+

 language = os.environ.get("language", "Auto")
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
@ -114,9 +120,11 @@ cut_method = {

 from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path

+
 SoVITS_names, GPT_names = get_weights_names()
 from config import pretrained_sovits_name

+
 path_sovits_v3 = pretrained_sovits_name["v3"]
 path_sovits_v4 = pretrained_sovits_name["v4"]
 is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@ -203,7 +211,7 @@ def inference(

 def custom_sort_key(s):
    # 使用正则表达式提取字符串中的数字部分和非数字部分
-    parts = re.split("(\d+)", s)
+    parts = re.split(r"(\d+)", s)
    # 将数字部分转换为整数，非数字部分保持不变
    parts = [int(part) if part.isdigit() else part for part in parts]
    return parts
@ -215,7 +223,7 @@ else:
    with open("./weight.json", "w", encoding="utf-8") as file:
        json.dump({"GPT": {}, "SoVITS": {}}, file)

-with open("./weight.json", "r", encoding="utf-8") as file:
+with open("./weight.json", encoding="utf-8") as file:
    weight_data = file.read()
    weight_data = json.loads(weight_data)
    gpt_path = os.environ.get("gpt_path", weight_data.get("GPT", {}).get(version, GPT_names[-1]))
@ -227,6 +235,7 @@ with open("./weight.json", "r", encoding="utf-8") as file:

 from process_ckpt import get_sovits_version_from_path_fast

+
 v3v4set = {"v3", "v4"}


@ -238,8 +247,8 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
    # print(sovits_path,version, model_version, if_lora_v3)
    is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
    path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
-    if if_lora_v3 == True and is_exist == False:
-        info = path_sovits + "SoVITS %s" % model_version + i18n("底模缺失，无法加载相应 LoRA 权重")
+    if if_lora_v3 and not is_exist:
+        info = path_sovits + f"SoVITS {model_version}" + i18n("底模缺失，无法加载相应 LoRA 权重")
        gr.Warning(info)
        raise FileExistsError(info)
    dict_language = dict_language_v1 if version == "v1" else dict_language_v2
--- a/GPT_SoVITS/module/data_utils.py
+++ b/GPT_SoVITS/module/data_utils.py
@ -1,14 +1,16 @@
 import os
 import random
 import traceback
+
 import torch
+import torch.nn.functional as F
 import torch.utils.data
+from text import cleaned_text_to_sequence
 from tqdm import tqdm

-from module.mel_processing import spectrogram_torch, spec_to_mel_torch
-from text import cleaned_text_to_sequence
-import torch.nn.functional as F
-from tools.my_utils import load_audio
+from gsv_tools.my_utils import load_audio
+from module.mel_processing import spec_to_mel_torch, spectrogram_torch
+

 version = os.environ.get("version", None)

@ -23,22 +25,22 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):

    def __init__(self, hparams, version=None, val=False):
        exp_dir = hparams.exp_dir
-        self.path2 = "%s/2-name2text.txt" % exp_dir
-        self.path4 = "%s/4-cnhubert" % exp_dir
-        self.path5 = "%s/5-wav32k" % exp_dir
+        self.path2 = f"{exp_dir}/2-name2text.txt"
+        self.path4 = f"{exp_dir}/4-cnhubert"
+        self.path5 = f"{exp_dir}/5-wav32k"
        assert os.path.exists(self.path2)
        assert os.path.exists(self.path4)
        assert os.path.exists(self.path5)
        self.is_v2Pro = version in {"v2Pro", "v2ProPlus"}
        if self.is_v2Pro:
-            self.path7 = "%s/7-sv_cn" % exp_dir
+            self.path7 = f"{exp_dir}/7-sv_cn"
            assert os.path.exists(self.path7)
        names4 = set([name[:-3] for name in list(os.listdir(self.path4))])  # 去除.pt后缀
        names5 = set(os.listdir(self.path5))
        if self.is_v2Pro:
            names6 = set([name[:-3] for name in list(os.listdir(self.path7))])  # 去除.pt后缀
        self.phoneme_data = {}
-        with open(self.path2, "r", encoding="utf8") as f:
+        with open(self.path2, encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")

        for line in lines:
@ -85,7 +87,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
                skipped_phone += 1
                continue

-            size = os.path.getsize("%s/%s" % (self.path5, audiopath))
+            size = os.path.getsize(f"{self.path5}/{audiopath}")
            duration = size / self.sampling_rate / 2

            if duration == 0:
@ -110,15 +112,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
        audiopath, phoneme_ids = audiopath_sid_text
        text = torch.FloatTensor(phoneme_ids)
        try:
-            spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
+            spec, wav = self.get_audio(f"{self.path5}/{audiopath}")
            with torch.no_grad():
-                ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
+                ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
                if ssl.shape[-1] != spec.shape[-1]:
                    typee = ssl.dtype
                    ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
                ssl.requires_grad = False
                if self.is_v2Pro:
-                    sv_emb = torch.load("%s/%s.pt" % (self.path7, audiopath), map_location="cpu")
+                    sv_emb = torch.load(f"{self.path7}/{audiopath}.pt", map_location="cpu")
        except:
            traceback.print_exc()
            spec = torch.zeros(1025, 100)
@ -285,16 +287,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):

    def __init__(self, hparams, val=False):
        exp_dir = hparams.exp_dir
-        self.path2 = "%s/2-name2text.txt" % exp_dir
-        self.path4 = "%s/4-cnhubert" % exp_dir
-        self.path5 = "%s/5-wav32k" % exp_dir
+        self.path2 = f"{exp_dir}/2-name2text.txt"
+        self.path4 = f"{exp_dir}/4-cnhubert"
+        self.path5 = f"{exp_dir}/5-wav32k"
        assert os.path.exists(self.path2)
        assert os.path.exists(self.path4)
        assert os.path.exists(self.path5)
        names4 = set([name[:-3] for name in list(os.listdir(self.path4))])  # 去除.pt后缀
        names5 = set(os.listdir(self.path5))
        self.phoneme_data = {}
-        with open(self.path2, "r", encoding="utf8") as f:
+        with open(self.path2, encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")

        for line in lines:
@ -339,7 +341,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
                skipped_phone += 1
                continue

-            size = os.path.getsize("%s/%s" % (self.path5, audiopath))
+            size = os.path.getsize(f"{self.path5}/{audiopath}")
            duration = size / self.sampling_rate / 2

            if duration == 0:
@ -376,9 +378,9 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
        audiopath, phoneme_ids = audiopath_sid_text
        text = torch.FloatTensor(phoneme_ids)
        try:
-            spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
+            spec, mel = self.get_audio(f"{self.path5}/{audiopath}")
            with torch.no_grad():
-                ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
+                ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
                if ssl.shape[-1] != spec.shape[-1]:
                    typee = ssl.dtype
                    ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
@ -523,16 +525,16 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):

    def __init__(self, hparams, val=False):
        exp_dir = hparams.exp_dir
-        self.path2 = "%s/2-name2text.txt" % exp_dir
-        self.path4 = "%s/4-cnhubert" % exp_dir
-        self.path5 = "%s/5-wav32k" % exp_dir
+        self.path2 = f"{exp_dir}/2-name2text.txt"
+        self.path4 = f"{exp_dir}/4-cnhubert"
+        self.path5 = f"{exp_dir}/5-wav32k"
        assert os.path.exists(self.path2)
        assert os.path.exists(self.path4)
        assert os.path.exists(self.path5)
        names4 = set([name[:-3] for name in list(os.listdir(self.path4))])  # 去除.pt后缀
        names5 = set(os.listdir(self.path5))
        self.phoneme_data = {}
-        with open(self.path2, "r", encoding="utf8") as f:
+        with open(self.path2, encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")

        for line in lines:
@ -577,7 +579,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
                skipped_phone += 1
                continue

-            size = os.path.getsize("%s/%s" % (self.path5, audiopath))
+            size = os.path.getsize(f"{self.path5}/{audiopath}")
            duration = size / self.sampling_rate / 2

            if duration == 0:
@ -614,9 +616,9 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
        audiopath, phoneme_ids = audiopath_sid_text
        text = torch.FloatTensor(phoneme_ids)
        try:
-            spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
+            spec, mel = self.get_audio(f"{self.path5}/{audiopath}")
            with torch.no_grad():
-                ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
+                ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
                if ssl.shape[-1] != spec.shape[-1]:
                    typee = ssl.dtype
                    ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
@ -734,16 +736,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):

    def __init__(self, hparams, val=False):
        exp_dir = hparams.exp_dir
-        self.path2 = "%s/2-name2text.txt" % exp_dir
-        self.path4 = "%s/4-cnhubert" % exp_dir
-        self.path5 = "%s/5-wav32k" % exp_dir
+        self.path2 = f"{exp_dir}/2-name2text.txt"
+        self.path4 = f"{exp_dir}/4-cnhubert"
+        self.path5 = f"{exp_dir}/5-wav32k"
        assert os.path.exists(self.path2)
        assert os.path.exists(self.path4)
        assert os.path.exists(self.path5)
        names4 = set([name[:-3] for name in list(os.listdir(self.path4))])  # 去除.pt后缀
        names5 = set(os.listdir(self.path5))
        self.phoneme_data = {}
-        with open(self.path2, "r", encoding="utf8") as f:
+        with open(self.path2, encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")

        for line in lines:
@ -788,7 +790,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
                skipped_phone += 1
                continue

-            size = os.path.getsize("%s/%s" % (self.path5, audiopath))
+            size = os.path.getsize(f"{self.path5}/{audiopath}")
            duration = size / self.sampling_rate / 2

            if duration == 0:
@ -825,9 +827,9 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
        audiopath, phoneme_ids = audiopath_sid_text
        text = torch.FloatTensor(phoneme_ids)
        try:
-            spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
+            spec, mel, wav = self.get_audio(f"{self.path5}/{audiopath}")
            with torch.no_grad():
-                ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
+                ssl = torch.load(f"{self.path4}/{audiopath}.pt", map_location="cpu")
                if ssl.shape[-1] != spec.shape[-1]:
                    typee = ssl.dtype
                    ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
--- a/GPT_SoVITS/prepare_datasets/1-get-text.py
+++ b/GPT_SoVITS/prepare_datasets/1-get-text.py
@ -1,7 +1,6 @@
-# -*- coding: utf-8 -*-
-
 import os

+
 inp_text = os.environ.get("inp_text")
 inp_wav_dir = os.environ.get("inp_wav_dir")
 exp_name = os.environ.get("exp_name")
@ -13,13 +12,12 @@ opt_dir = os.environ.get("opt_dir")
 bert_pretrained_dir = os.environ.get("bert_pretrained_dir")
 import torch

+
 is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
 version = os.environ.get("version", None)
-import traceback
 import os.path
-from text.cleaner import clean_text
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-from tools.my_utils import clean_path
+import shutil
+import traceback

 # inp_text=sys.argv[1]
 # inp_wav_dir=sys.argv[2]
@ -29,23 +27,26 @@ from tools.my_utils import clean_path
 # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]#i_gpu
 # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
 # bert_pretrained_dir="/data/docker/liujing04/bert-vits2/Bert-VITS2-master20231106/bert/chinese-roberta-wwm-ext-large"
-
 from time import time as ttime
-import shutil
+
+from text.cleaner import clean_text
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+from gsv_tools.my_utils import clean_path


 def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
    # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
-    tmp_path = "%s%s.pth" % (ttime(), i_part)
+    tmp_path = f"{ttime()}{i_part}.pth"
    torch.save(fea, tmp_path)
-    shutil.move(tmp_path, "%s/%s" % (dir, name))
+    shutil.move(tmp_path, f"{dir}/{name}")


-txt_path = "%s/2-name2text-%s.txt" % (opt_dir, i_part)
-if os.path.exists(txt_path) == False:
-    bert_dir = "%s/3-bert" % (opt_dir)
+txt_path = f"{opt_dir}/2-name2text-{i_part}.txt"
+if not os.path.exists(txt_path):
+    bert_dir = f"{opt_dir}/3-bert"
    os.makedirs(opt_dir, exist_ok=True)
    os.makedirs(bert_dir, exist_ok=True)
    if torch.cuda.is_available():
@ -60,7 +61,7 @@ if os.path.exists(txt_path) == False:
        raise FileNotFoundError(bert_pretrained_dir)
    tokenizer = AutoTokenizer.from_pretrained(bert_pretrained_dir)
    bert_model = AutoModelForMaskedLM.from_pretrained(bert_pretrained_dir)
-    if is_half == True:
+    if is_half:
        bert_model = bert_model.half().to(device)
    else:
        bert_model = bert_model.to(device)
@ -90,8 +91,8 @@ if os.path.exists(txt_path) == False:
                name = os.path.basename(name)
                print(name)
                phones, word2ph, norm_text = clean_text(text.replace("%", "-").replace("￥", ","), lan, version)
-                path_bert = "%s/%s.pt" % (bert_dir, name)
-                if os.path.exists(path_bert) == False and lan == "zh":
+                path_bert = f"{bert_dir}/{name}.pt"
+                if not os.path.exists(path_bert) and lan == "zh":
                    bert_feature = get_bert_feature(norm_text, word2ph)
                    assert bert_feature.shape[-1] == len(phones)
                    # torch.save(bert_feature, path_bert)
@ -104,7 +105,7 @@ if os.path.exists(txt_path) == False:

    todo = []
    res = []
-    with open(inp_text, "r", encoding="utf8") as f:
+    with open(inp_text, encoding="utf8") as f:
        lines = f.read().strip("\n").split("\n")

    language_v1_to_language_v2 = {
@ -138,6 +139,6 @@ if os.path.exists(txt_path) == False:
    process(todo, res)
    opt = []
    for name, phones, word2ph, norm_text in res:
-        opt.append("%s\t%s\t%s\t%s" % (name, phones, word2ph, norm_text))
+        opt.append(f"{name}\t{phones}\t{word2ph}\t{norm_text}")
    with open(txt_path, "w", encoding="utf8") as f:
        f.write("\n".join(opt) + "\n")
--- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py
@ -1,7 +1,6 @@
-# -*- coding: utf-8 -*-
-
-import sys
 import os
+import sys
+

 inp_text = os.environ.get("inp_text")
 inp_wav_dir = os.environ.get("inp_wav_dir")
@ -12,20 +11,24 @@ if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 from feature_extractor import cnhubert

+
 opt_dir = os.environ.get("opt_dir")
 cnhubert.cnhubert_base_path = os.environ.get("cnhubert_base_dir")
 import torch

+
 is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()

 import traceback
+
+import librosa
 import numpy as np
 from scipy.io import wavfile
-import librosa
+

 now_dir = os.getcwd()
 sys.path.append(now_dir)
-from tools.my_utils import load_audio, clean_path
+import shutil

 # from config import cnhubert_base_path
 # cnhubert.cnhubert_base_path=cnhubert_base_path
@ -37,22 +40,22 @@ from tools.my_utils import load_audio, clean_path
 # os.environ["CUDA_VISIBLE_DEVICES"]=sys.argv[6]
 # cnhubert.cnhubert_base_path=sys.argv[7]
 # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name
-
 from time import time as ttime
-import shutil
+
+from gsv_tools.my_utils import clean_path, load_audio


 def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
    # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
-    tmp_path = "%s%s.pth" % (ttime(), i_part)
+    tmp_path = f"{ttime()}{i_part}.pth"
    torch.save(fea, tmp_path)
-    shutil.move(tmp_path, "%s/%s" % (dir, name))
+    shutil.move(tmp_path, f"{dir}/{name}")


-hubert_dir = "%s/4-cnhubert" % (opt_dir)
-wav32dir = "%s/5-wav32k" % (opt_dir)
+hubert_dir = f"{opt_dir}/4-cnhubert"
+wav32dir = f"{opt_dir}/5-wav32k"
 os.makedirs(opt_dir, exist_ok=True)
 os.makedirs(hubert_dir, exist_ok=True)
 os.makedirs(wav32dir, exist_ok=True)
@ -67,7 +70,7 @@ else:
    device = "cpu"
 model = cnhubert.get_model()
 # is_half=False
-if is_half == True:
+if is_half:
    model = model.half().to(device)
 else:
    model = model.to(device)
@ -76,36 +79,36 @@ nan_fails = []


 def name2go(wav_name, wav_path):
-    hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
+    hubert_path = f"{hubert_dir}/{wav_name}.pt"
    if os.path.exists(hubert_path):
        return
    tmp_audio = load_audio(wav_path, 32000)
    tmp_max = np.abs(tmp_audio).max()
    if tmp_max > 2.2:
-        print("%s-filtered,%s" % (wav_name, tmp_max))
+        print(f"{wav_name}-filtered,{tmp_max}")
        return
    tmp_audio32 = (tmp_audio / tmp_max * (maxx * alpha * 32768)) + ((1 - alpha) * 32768) * tmp_audio
    tmp_audio32b = (tmp_audio / tmp_max * (maxx * alpha * 1145.14)) + ((1 - alpha) * 1145.14) * tmp_audio
    tmp_audio = librosa.resample(tmp_audio32b, orig_sr=32000, target_sr=16000)  # 不是重采样问题
    tensor_wav16 = torch.from_numpy(tmp_audio)
-    if is_half == True:
+    if is_half:
        tensor_wav16 = tensor_wav16.half().to(device)
    else:
        tensor_wav16 = tensor_wav16.to(device)
    ssl = model.model(tensor_wav16.unsqueeze(0))["last_hidden_state"].transpose(1, 2).cpu()  # torch.Size([1, 768, 215])
    if np.isnan(ssl.detach().numpy()).sum() != 0:
        nan_fails.append((wav_name, wav_path))
-        print("nan filtered:%s" % wav_name)
+        print(f"nan filtered:{wav_name}")
        return
    wavfile.write(
-        "%s/%s" % (wav32dir, wav_name),
+        f"{wav32dir}/{wav_name}",
        32000,
        tmp_audio32.astype("int16"),
    )
    my_save(ssl, hubert_path)


-with open(inp_text, "r", encoding="utf8") as f:
+with open(inp_text, encoding="utf8") as f:
    lines = f.read().strip("\n").split("\n")

 for line in lines[int(i_part) :: int(all_parts)]:
@ -113,9 +116,9 @@ for line in lines[int(i_part) :: int(all_parts)]:
        # wav_name,text=line.split("\t")
        wav_name, spk_name, language, text = line.split("|")
        wav_name = clean_path(wav_name)
-        if inp_wav_dir != "" and inp_wav_dir != None:
+        if inp_wav_dir != "" and inp_wav_dir is not None:
            wav_name = os.path.basename(wav_name)
-            wav_path = "%s/%s" % (inp_wav_dir, wav_name)
+            wav_path = f"{inp_wav_dir}/{wav_name}"

        else:
            wav_path = wav_name
@ -124,7 +127,7 @@ for line in lines[int(i_part) :: int(all_parts)]:
    except:
        print(line, traceback.format_exc())

-if len(nan_fails) > 0 and is_half == True:
+if len(nan_fails) > 0 and is_half:
    is_half = False
    model = model.float()
    for wav in nan_fails:
--- a/GPT_SoVITS/prepare_datasets/2-get-sv.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-sv.py
@ -1,7 +1,6 @@
-# -*- coding: utf-8 -*-
-
-import sys
 import os
+import sys
+

 inp_text = os.environ.get("inp_text")
 inp_wav_dir = os.environ.get("inp_wav_dir")
@ -15,32 +14,37 @@ opt_dir = os.environ.get("opt_dir")
 sv_path = os.environ.get("sv_path")
 import torch

+
 is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()

 import traceback
+
 import torchaudio

+
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
-from tools.my_utils import clean_path
-from time import time as ttime
 import shutil
-from ERes2NetV2 import ERes2NetV2
+from time import time as ttime
+
 import kaldi as Kaldi
+from ERes2NetV2 import ERes2NetV2
+
+from gsv_tools.my_utils import clean_path


 def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
    # tmp_path="%s/%s%s.pth"%(dir,ttime(),i_part)
-    tmp_path = "%s%s.pth" % (ttime(), i_part)
+    tmp_path = f"{ttime()}{i_part}.pth"
    torch.save(fea, tmp_path)
-    shutil.move(tmp_path, "%s/%s" % (dir, name))
+    shutil.move(tmp_path, f"{dir}/{name}")


-sv_cn_dir = "%s/7-sv_cn" % (opt_dir)
-wav32dir = "%s/5-wav32k" % (opt_dir)
+sv_cn_dir = f"{opt_dir}/7-sv_cn"
+wav32dir = f"{opt_dir}/5-wav32k"
 os.makedirs(opt_dir, exist_ok=True)
 os.makedirs(sv_cn_dir, exist_ok=True)
 os.makedirs(wav32dir, exist_ok=True)
@ -63,7 +67,7 @@ class SV:
        embedding_model.eval()
        self.embedding_model = embedding_model
        self.res = torchaudio.transforms.Resample(32000, 16000).to(device)
-        if is_half == False:
+        if not is_half:
            self.embedding_model = self.embedding_model.to(device)
        else:
            self.embedding_model = self.embedding_model.half().to(device)
@ -72,7 +76,7 @@ class SV:
    def compute_embedding3(self, wav):  # (1,x)#-1~1
        with torch.no_grad():
            wav = self.res(wav)
-            if self.is_half == True:
+            if self.is_half:
                wav = wav.half()
            feat = torch.stack(
                [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
@ -85,10 +89,10 @@ sv = SV(device, is_half)


 def name2go(wav_name, wav_path):
-    sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
+    sv_cn_path = f"{sv_cn_dir}/{wav_name}.pt"
    if os.path.exists(sv_cn_path):
        return
-    wav_path = "%s/%s" % (wav32dir, wav_name)
+    wav_path = f"{wav32dir}/{wav_name}"
    wav32k, sr0 = torchaudio.load(wav_path)
    assert sr0 == 32000
    wav32k = wav32k.to(device)
@ -96,16 +100,16 @@ def name2go(wav_name, wav_path):
    my_save(emb, sv_cn_path)


-with open(inp_text, "r", encoding="utf8") as f:
+with open(inp_text, encoding="utf8") as f:
    lines = f.read().strip("\n").split("\n")

 for line in lines[int(i_part) :: int(all_parts)]:
    try:
        wav_name, spk_name, language, text = line.split("|")
        wav_name = clean_path(wav_name)
-        if inp_wav_dir != "" and inp_wav_dir != None:
+        if inp_wav_dir != "" and inp_wav_dir is not None:
            wav_name = os.path.basename(wav_name)
-            wav_path = "%s/%s" % (inp_wav_dir, wav_name)
+            wav_path = f"{inp_wav_dir}/{wav_name}"

        else:
            wav_path = wav_name
--- a/GPT_SoVITS/prepare_datasets/3-get-semantic.py
+++ b/GPT_SoVITS/prepare_datasets/3-get-semantic.py
@ -1,5 +1,6 @@
 import os

+
 inp_text = os.environ.get("inp_text")
 exp_name = os.environ.get("exp_name")
 i_part = os.environ.get("i_part")
@ -28,20 +29,25 @@ else:
    version = "v3"
 import torch

+
 is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
-import traceback
 import sys
+import traceback
+

 now_dir = os.getcwd()
 sys.path.append(now_dir)
 import logging
+
 import utils

+
 if version != "v3":
    from module.models import SynthesizerTrn
 else:
    from module.models import SynthesizerTrnV3 as SynthesizerTrn
-from tools.my_utils import clean_path
+from gsv_tools.my_utils import clean_path
+

 logging.getLogger("numba").setLevel(logging.WARNING)
 # from config import pretrained_s2G
@ -54,9 +60,9 @@ logging.getLogger("numba").setLevel(logging.WARNING)
 # opt_dir="/data/docker/liujing04/gpt-vits/fine_tune_dataset/%s"%exp_name


-hubert_dir = "%s/4-cnhubert" % (opt_dir)
-semantic_path = "%s/6-name2semantic-%s.tsv" % (opt_dir, i_part)
-if os.path.exists(semantic_path) == False:
+hubert_dir = f"{opt_dir}/4-cnhubert"
+semantic_path = f"{opt_dir}/6-name2semantic-{i_part}.tsv"
+if not os.path.exists(semantic_path):
    os.makedirs(opt_dir, exist_ok=True)

    if torch.cuda.is_available():
@ -73,7 +79,7 @@ if os.path.exists(semantic_path) == False:
        version=version,
        **hps.model,
    )
-    if is_half == True:
+    if is_half:
        vq_model = vq_model.half().to(device)
    else:
        vq_model = vq_model.to(device)
@ -87,19 +93,19 @@ if os.path.exists(semantic_path) == False:
    )

    def name2go(wav_name, lines):
-        hubert_path = "%s/%s.pt" % (hubert_dir, wav_name)
-        if os.path.exists(hubert_path) == False:
+        hubert_path = f"{hubert_dir}/{wav_name}.pt"
+        if not os.path.exists(hubert_path):
            return
        ssl_content = torch.load(hubert_path, map_location="cpu")
-        if is_half == True:
+        if is_half:
            ssl_content = ssl_content.half().to(device)
        else:
            ssl_content = ssl_content.to(device)
        codes = vq_model.extract_latent(ssl_content)
        semantic = " ".join([str(i) for i in codes[0, 0, :].tolist()])
-        lines.append("%s\t%s" % (wav_name, semantic))
+        lines.append(f"{wav_name}\t{semantic}")

-    with open(inp_text, "r", encoding="utf8") as f:
+    with open(inp_text, encoding="utf8") as f:
        lines = f.read().strip("\n").split("\n")

    lines1 = []
--- a/GPT_SoVITS/process_ckpt.py
+++ b/GPT_SoVITS/process_ckpt.py
@ -1,10 +1,13 @@
+import os
+import shutil
 import traceback
 from collections import OrderedDict
 from time import time as ttime
-import shutil
-import os
+
 import torch
-from tools.i18n.i18n import I18nAuto
+
+from gsv_tools.i18n.i18n import I18nAuto
+

 i18n = I18nAuto()

@ -12,13 +15,14 @@ i18n = I18nAuto()
 def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
-    tmp_path = "%s.pth" % (ttime())
+    tmp_path = f"{ttime()}.pth"
    torch.save(fea, tmp_path)
-    shutil.move(tmp_path, "%s/%s" % (dir, name))
+    shutil.move(tmp_path, f"{dir}/{name}")


 from io import BytesIO

+
 model_version2byte = {
    "v3": b"03",
    "v4": b"04",
@ -47,14 +51,14 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
                continue
            opt["weight"][key] = ckpt[key].half()
        opt["config"] = hps
-        opt["info"] = "%sepoch_%siteration" % (epoch, steps)
+        opt["info"] = f"{epoch}epoch_{steps}iteration"
        if lora_rank:
            opt["lora_rank"] = lora_rank
-            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
-        elif model_version != None and "Pro" in model_version:
-            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
+            my_save2(opt, f"{hps.save_weight_dir}/{name}.pth", model_version)
+        elif model_version is not None and "Pro" in model_version:
+            my_save2(opt, f"{hps.save_weight_dir}/{name}.pth", model_version)
        else:
-            my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
+            my_save(opt, f"{hps.save_weight_dir}/{name}.pth")
        return "Success."
    except:
        return traceback.format_exc()
--- a/README.md
+++ b/README.md
@ -1,3 +1,5 @@
+#
+
 <div align="center">

 <h1>GPT-SoVITS-WebUI</h1>
@ -7,8 +9,6 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>

 <a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

-<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
-
 [![Python](https://img.shields.io/badge/python-3.10--3.12-blue?style=for-the-badge&logo=python)](https://www.python.org)
 [![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)

@ -27,7 +27,11 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>

 ---

-## Features:
+<div align="center">
+
+## Features
+
+</div>

 1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.

@ -41,51 +45,56 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>

 Unseen speakers few-shot fine-tuning demo:

-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>

 **RTF(inference speed) of GPT-SoVITS v2 ProPlus**:
 0.028 tested in 4060Ti, 0.014 tested in 4090 (1400words~=4min, inference time is 3.36s), 0.526 in M4 CPU. You can test our [huggingface demo](https://lj1995-gpt-sovits-proplus.hf.space/) (half H200) to experience high-speed inference .

-请不要尬黑GPT-SoVITS推理速度慢，谢谢！
-
 **User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**

+<div align="center">
+
 ## Installation

-For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
+For users in China, you can [Click Here to use](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) AutoDL Cloud Docker to experience the full functionality online.

 ### Tested Environments

-| Python Version | PyTorch Version  | Device        |
-| -------------- | ---------------- | ------------- |
-| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
-| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
-| Python 3.9     | PyTorch 2.2.2    | CPU           |
+| Python Version | PyTorch Version | Device        |
+| -------------- | --------------- | ------------- |
+| Python 3.10    | PyTorch 2.8.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.9.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.7.0   | CUDA 12.8     |
+| Python 3.11    | PyTorch 2.10.0  | CUDA 12.8     |
+| Python 3.10    | PyTorch 2.8.0   | Apple silicon |
+| Python 3.11    | PyTorch 2.9.0   | Apple silicon |
+| Python 3.12    | PyTorch 2.10.0  | Apple silicon |
+| Python 3.10    | PyTorch 2.9.0   | CPU           |
+
+</div>

 ### Windows

-If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) and double-click on _go-webui.bat_ to start GPT-SoVITS-WebUI.
+If you are a Windows user (tested with win>=10), you can [download the integrated package](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main) and double-click on `go-webui.bat` to start GPT-SoVITS-WebUI.

 **Users in China can [download the package here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**

 Install the program by running the following commands:

 ```pwsh
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+pwsh -F install.ps1 --help
 ```

 ### Linux

+Install the program by running the following commands:
+
 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### macOS
@ -95,54 +104,51 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
 Install the program by running the following commands:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### Install Manually

-#### Install Dependences
+Install the program by running the following commands:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+conda install uv ffmpeg -c conda-forge

-pip install -r extra-req.txt --no-deps
-pip install -r requirements.txt
+uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
+uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
+uv pip install ".[flash-attn]"
 ```

-#### Install FFmpeg
+### Pretrained Models

-##### Conda Users
+**If `install.sh` runs successfully, you may skip steps 1, 2, and 3.**

-```bash
-conda activate GPTSoVits
-conda install ffmpeg
-```
+**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**

-##### Ubuntu/Debian Users
+1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.

-```bash
-sudo apt install ffmpeg
-sudo apt install libsox-dev
-```
+2. Download G2PW models from [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`. (Chinese TTS Only)

-##### Windows Users
+3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `gsv_tools/uvr5/uvr5_weights`.
+   - If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `gsv_tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.

-Download and place [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) and [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) in the GPT-SoVITS root
+   - The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_band_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.

-Install [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe)
+4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `gsv_tools/asr/models`.

-##### MacOS Users
+5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `gsv_tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have a similar effect with a smaller disk footprint.

-```bash
-brew install ffmpeg
-```
+<div align="center">

-### Running GPT-SoVITS with Docker
+## Running GPT-SoVITS with Docker

-#### Docker Image Selection
+</div>
+
+### Docker Image Selection

 Due to rapid development in the codebase and a slower Docker image release cycle, please:

@ -153,15 +159,15 @@ Due to rapid development in the codebase and a slower Docker image release cycle
 - Docker Compose will mount **all files** in the current directory. Please switch to the project root directory and **pull the latest code** before using the Docker image
 - Optionally, build the image locally using the provided Dockerfile for the most up-to-date changes

-#### Environment Variables
+### Environment Variables

 - `is_half`: Controls whether half-precision (fp16) is enabled. Set to `true` if your GPU supports it to reduce memory usage.

-#### Shared Memory Configuration
+### Shared Memory Configuration

 On Windows (Docker Desktop), the default shared memory size is small and may cause unexpected behavior. Increase `shm_size` (e.g., to `16g`) in your Docker Compose file based on your available system memory.

-#### Choosing a Service
+### Choosing a Service

 The `docker-compose.yaml` defines two services:

@ -174,7 +180,7 @@ To run a specific service with Docker Compose, use:
 docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
 ```

-#### Building the Docker Image Locally
+### Building the Docker Image Locally

 If you want to build the image yourself, use:

@ -182,7 +188,7 @@ If you want to build the image yourself, use:
 bash docker_build.sh --cuda <12.6|12.8> [--lite]
 ```

-#### Accessing the Running Container (Bash Shell)
+### Accessing the Running Container (Bash Shell)

 Once the container is running in the background, you can access it using:

@ -190,31 +196,15 @@ Once the container is running in the background, you can access it using:
 docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
 ```

-## Pretrained Models
-
-**If `install.sh` runs successfully, you may skip No.1,2,3**
-
-**Users in China can [download all these models here](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
-
-1. Download pretrained models from [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) and place them in `GPT_SoVITS/pretrained_models`.
-
-2. Download G2PW models from [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip), unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.(Chinese TTS Only)
-
-3. For UVR5 (Vocals/Accompaniment Separation & Reverberation Removal, additionally), download models from [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) and place them in `tools/uvr5/uvr5_weights`.
-
-   - If you want to use `bs_roformer` or `mel_band_roformer` models for UVR5, you can manually download the model and corresponding configuration file, and put them in `tools/uvr5/uvr5_weights`. **Rename the model file and configuration file, ensure that the model and configuration files have the same and corresponding names except for the suffix**. In addition, the model and configuration file names **must include `roformer`** in order to be recognized as models of the roformer class.
-
-   - The suggestion is to **directly specify the model type** in the model name and configuration file name, such as `mel_mand_roformer`, `bs_roformer`. If not specified, the features will be compared from the configuration file to determine which type of model it is. For example, the model `bs_roformer_ep_368_sdr_12.9628.ckpt` and its corresponding configuration file `bs_roformer_ep_368_sdr_12.9628.yaml` are a pair, `kim_mel_band_roformer.ckpt` and `kim_mel_band_roformer.yaml` are also a pair.
-
-4. For Chinese ASR (additionally), download models from [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files), and [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) and place them in `tools/asr/models`.
-
-5. For English or Japanese ASR (additionally), download models from [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) and place them in `tools/asr/models`. Also, [other models](https://huggingface.co/Systran) may have the similar effect with smaller disk footprint.
+<div align="center">

 ## Dataset Format

-The TTS annotation .list file format:
+</div>

-```
+The TTS annotation `.list` file format:
+
+```text

 vocal_path|speaker_name|language|text

@ -230,20 +220,23 @@ Language dictionary:

 Example:

-```
+```text

 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.

 ```

+<div align="center">
+
 ## Finetune and inference

+</div>
+
 ### Open WebUI

 #### Integrated Package Users

 Double-click `go-webui.bat`or use `go-webui.ps1`
-if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-v1.ps1`

 #### Others

@ -251,21 +244,13 @@ if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-
 python webui.py <language(optional)>
 ```

-if you want to switch to V1,then
-
-```bash
-python webui.py v1 <language(optional)>
-```
-
-Or maunally switch version in WebUI
-
 ### Finetune

 #### Path Auto-filling is now supported

 1. Fill in the audio path
 2. Slice the audio into small chunks
-3. Denoise(optinal)
+3. Denoise (optional)
 4. ASR
 5. Proofreading ASR transcriptions
 6. Go to the next Tab, then finetune the model
@ -274,7 +259,7 @@ Or maunally switch version in WebUI

 #### Integrated Package Users

-Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
+Double-click `go-webui.bat` or use `go-webui.ps1`, then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`

 #### Others

@ -290,8 +275,12 @@ python webui.py

 then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`

+<div align="center">
+
 ## V2 Release Notes

+</div>
+
 New Features:

 1. Support Korean and Cantonese
@ -304,18 +293,12 @@ New Features:

   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-Use v2 from v1 environment:
-
-1. `pip install -r requirements.txt` to update some packages
-
-2. Clone the latest codes from github.
-
-3. Download v2 pretrained models from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) and put them into `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`.
-
-   Chinese v2 additional: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip)(Download G2PW models, unzip and rename to `G2PWModel`, and then place them in `GPT_SoVITS/text`.)
+<div align="center">

 ## V3 Release Notes

+</div>
+
 New Features:

 1. The timbre similarity is higher, requiring less training data to approximate the target speaker (the timbre similarity is significantly improved using the base model directly without fine-tuning).
@ -324,111 +307,59 @@ New Features:

   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-Use v3 from v2 environment:
-
-1. `pip install -r requirements.txt` to update some packages
-
-2. Clone the latest codes from github.
-
-3. Download v3 pretrained models (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
-
-   additional: for Audio Super Resolution model, you can read [how to download](./tools/AP_BWE_main/24kto48k/readme.txt)
+<div align="center">

 ## V4 Release Notes

+</div>
+
 New Features:

 1. Version 4 fixes the issue of metallic artifacts in Version 3 caused by non-integer multiple upsampling, and natively outputs 48k audio to prevent muffled sound (whereas Version 3 only natively outputs 24k audio). The author considers Version 4 a direct replacement for Version 3, though further testing is still needed.
   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-Use v4 from v1/v2/v3 environment:
-
-1. `pip install -r requirements.txt` to update some packages
-
-2. Clone the latest codes from github.
-
-3. Download v4 pretrained models (gsv-v4-pretrained/s2v4.pth, and gsv-v4-pretrained/vocoder.pth) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
+<div align="center">

 ## V2Pro Release Notes

+</div>
+
 New Features:

 1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)

-2.v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timebre of v3/v4 lean more toward the reference audio rather than the overall training set.
+2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.

-Use v2Pro from v1/v2/v3/v4 environment:
-
-1. `pip install -r requirements.txt` to update some packages
-
-2. Clone the latest codes from github.
-
-3. Download v2Pro pretrained models (v2Pro/s2Dv2Pro.pth, v2Pro/s2Gv2Pro.pth, v2Pro/s2Dv2ProPlus.pth, v2Pro/s2Gv2ProPlus.pth, and sv/pretrained_eres2netv2w24s4ep4.ckpt) from [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) and put them into `GPT_SoVITS/pretrained_models`.
+<div align="center">

 ## Todo List

- [x] **High Priority:**
+</div>

-  - [x] Localization in Japanese and English.
-  - [x] User guide.
-  - [x] Japanese and English dataset fine tune training.
+- [x] **High Priority:**
+  - [x] Localization in Japanese and English
+  - [x] User guide
+  - [x] Japanese and English dataset fine tune training

 - [ ] **Features:**
-  - [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min).
-  - [x] TTS speaking speed control.
-  - [ ] ~~Enhanced TTS emotion control.~~ Maybe use pretrained finetuned preset GPT models for better emotion.
-  - [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent).
-  - [x] Improve English and Japanese text frontend.
-  - [ ] Develop tiny and larger-sized TTS models.
-  - [x] Colab scripts.
-  - [x] Try expand training dataset (2k hours -> 10k hours).
+  - [x] Zero-shot voice conversion (5s) / few-shot voice conversion (1min)
+  - [x] TTS speaking speed control
+  - [ ] ~~Enhanced TTS emotion control~~ Maybe use pretrained finetuned preset GPT models for better emotion
+  - [ ] Experiment with changing SoVITS token inputs to probability distribution of GPT vocabs (transformer latent)
+  - [x] Improve English and Japanese text frontend
+  - [ ] Develop tiny and larger-sized TTS models
+  - [x] Colab scripts
+  - [x] Try expand training dataset (2k hours -> 10k hours)
  - [x] better sovits base model (enhanced audio quality)
  - [ ] model mix

-## (Additional) Method for running from the command line
-
-Use the command line to open the WebUI for UVR5
-
-```bash
-python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-```
-
-<!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
-```
-python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
-``` -->
-
-This is how the audio segmentation of the dataset is done using the command line
-
-```bash
-python audio_slicer.py \
-    --input_path "<path_to_original_audio_file_or_directory>" \
-    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
-    --threshold <volume_threshold> \
-    --min_length <minimum_duration_of_each_subclip> \
-    --min_interval <shortest_time_gap_between_adjacent_subclips>
-    --hop_size <step_size_for_computing_volume_curve>
-```
-
-This is how dataset ASR processing is done using the command line(Only Chinese)
-
-```bash
-python tools/asr/funasr_asr.py -i <input> -o <output>
-```
-
-ASR processing is performed through Faster_Whisper(ASR marking except Chinese)
-
-(No progress bars, GPU performance may cause time delays)
-
-```bash
-python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
-```
-
-A custom list save path is enabled
+<div align="center">

 ## Credits

+</div>
+
 Special thanks to the following projects and contributors:

 ### Theoretical Research
@ -439,7 +370,7 @@ Special thanks to the following projects and contributors:
 - [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
 - [contentvec](https://github.com/auspicious3000/contentvec/)
 - [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
+- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)

@ -471,8 +402,12 @@ Special thanks to the following projects and contributors:

 Thankful to @Naozumi520 for providing the Cantonese training set and for the guidance on Cantonese-related knowledge.

+<div align="center">
+
 ## Thanks to all contributors for their efforts

+</div>
+
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
 </a>
--- a/api.py
+++ b/api.py
@ -145,33 +145,36 @@ import os
 import re
 import sys

+
 now_dir = os.getcwd()
 sys.path.append(now_dir)
-sys.path.append("%s/GPT_SoVITS" % (now_dir))
+sys.path.append(f"{now_dir}/GPT_SoVITS")

+import logging
 import signal
-from text.LangSegmenter import LangSegmenter
+import subprocess
+from io import BytesIO
 from time import time as ttime
+
+import librosa
+import numpy as np
+import soundfile as sf
 import torch
 import torchaudio
-import librosa
-import soundfile as sf
-from fastapi import FastAPI, Request, Query
-from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
-from transformers import AutoModelForMaskedLM, AutoTokenizer
-import numpy as np
+from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from fastapi import FastAPI, Query, Request
+from fastapi.responses import JSONResponse, StreamingResponse
 from feature_extractor import cnhubert
-from io import BytesIO
+from module.mel_processing import spectrogram_torch
 from module.models import Generator, SynthesizerTrn, SynthesizerTrnV3
 from peft import LoraConfig, get_peft_model
-from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 from text import cleaned_text_to_sequence
 from text.cleaner import clean_text
-from module.mel_processing import spectrogram_torch
+from text.LangSegmenter import LangSegmenter
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
 import config as global_config
-import logging
-import subprocess


 class DefaultRefer:
@ -239,14 +242,14 @@ def init_bigvgan():
    from BigVGAN import bigvgan

    bigvgan_model = bigvgan.BigVGAN.from_pretrained(
-        "%s/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x" % (now_dir,),
+        f"{now_dir}/GPT_SoVITS/pretrained_models/models--nvidia--bigvgan_v2_24khz_100band_256x",
        use_cuda_kernel=False,
    )  # if True, RuntimeError: Ninja is required to load C++ extensions
    # remove weight norm in the model and set to eval mode
    bigvgan_model.remove_weight_norm()
    bigvgan_model = bigvgan_model.eval()

-    if is_half == True:
+    if is_half:
        bigvgan_model = bigvgan_model.half().to(device)
    else:
        bigvgan_model = bigvgan_model.to(device)
@ -268,12 +271,12 @@ def init_hifigan():
    hifigan_model.eval()
    hifigan_model.remove_weight_norm()
    state_dict_g = torch.load(
-        "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
+        f"{now_dir}/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth",
        map_location="cpu",
        weights_only=False,
    )
    print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
-    if is_half == True:
+    if is_half:
        hifigan_model = hifigan_model.half().to(device)
    else:
        hifigan_model = hifigan_model.to(device)
@ -292,7 +295,7 @@ resample_transform_dict = {}

 def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
-    key = "%s-%s-%s" % (sr0, sr1, str(device))
+    key = f"{sr0}-{sr1}-{str(device)}"
    if key not in resample_transform_dict:
        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
    return resample_transform_dict[key](audio_tensor)
@ -300,6 +303,7 @@ def resample(audio_tensor, sr0, sr1, device):

 from module.mel_processing import mel_spectrogram_torch

+
 spec_min = -12
 spec_max = 2

@ -312,32 +316,36 @@ def denorm_spec(x):
    return (x + 1) / 2 * (spec_max - spec_min) + spec_min


-mel_fn = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1024,
-        "win_size": 1024,
-        "hop_size": 256,
-        "num_mels": 100,
-        "sampling_rate": 24000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
-mel_fn_v4 = lambda x: mel_spectrogram_torch(
-    x,
-    **{
-        "n_fft": 1280,
-        "win_size": 1280,
-        "hop_size": 320,
-        "num_mels": 100,
-        "sampling_rate": 32000,
-        "fmin": 0,
-        "fmax": None,
-        "center": False,
-    },
-)
+def mel_fn(x):
+    return mel_spectrogram_torch(
+        x,
+        **{
+            "n_fft": 1024,
+            "win_size": 1024,
+            "hop_size": 256,
+            "num_mels": 100,
+            "sampling_rate": 24000,
+            "fmin": 0,
+            "fmax": None,
+            "center": False,
+        },
+    )
+
+
+def mel_fn_v4(x):
+    return mel_spectrogram_torch(
+        x,
+        **{
+            "n_fft": 1280,
+            "win_size": 1280,
+            "hop_size": 320,
+            "num_mels": 100,
+            "sampling_rate": 32000,
+            "fmin": 0,
+            "fmax": None,
+            "center": False,
+        },
+    )


 sr_model = None
@ -345,8 +353,8 @@ sr_model = None

 def audio_sr(audio, sr):
    global sr_model
-    if sr_model == None:
-        from tools.audio_sr import AP_BWE
+    if sr_model is None:
+        from gsv_tools.audio_sr import AP_BWE

        try:
            sr_model = AP_BWE(device, DictToAttrRecursive)
@ -390,8 +398,8 @@ def get_sovits_weights(sovits_path):
    is_exist = is_exist_s2gv3 if model_version == "v3" else is_exist_s2gv4
    path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4

-    if if_lora_v3 == True and is_exist == False:
-        logger.info("SoVITS %s 底模缺失，无法加载相应 LoRA 权重" % model_version)
+    if if_lora_v3 and not is_exist:
+        logger.info(f"SoVITS {model_version} 底模缺失，无法加载相应 LoRA 权重")

    dict_s2 = load_sovits_new(sovits_path)
    hps = dict_s2["config"]
@ -408,7 +416,7 @@ def get_sovits_weights(sovits_path):
    if model_version not in {"v3", "v4"}:
        if "Pro" in model_version:
            hps.model.version = model_version
-            if sv_cn_model == None:
+            if sv_cn_model is None:
                init_sv_cn()

        vq_model = SynthesizerTrn(
@ -437,12 +445,12 @@ def get_sovits_weights(sovits_path):
            del vq_model.enc_q
        except:
            pass
-    if is_half == True:
+    if is_half:
        vq_model = vq_model.half().to(device)
    else:
        vq_model = vq_model.to(device)
    vq_model.eval()
-    if if_lora_v3 == False:
+    if not if_lora_v3:
        vq_model.load_state_dict(dict_s2["weight"], strict=False)
    else:
        path_sovits = path_sovits_v3 if model_version == "v3" else path_sovits_v4
@ -480,7 +488,7 @@ def get_gpt_weights(gpt_path):
    max_sec = config["data"]["max_sec"]
    t2s_model = Text2SemanticLightningModule(config, "****", is_train=False)
    t2s_model.load_state_dict(dict_s1["weight"])
-    if is_half == True:
+    if is_half:
        t2s_model = t2s_model.half()
    t2s_model = t2s_model.to(device)
    t2s_model.eval()
@ -533,35 +541,32 @@ def get_bert_inf(phones, word2ph, norm_text, language):
    else:
        bert = torch.zeros(
            (1024, len(phones)),
-            dtype=torch.float16 if is_half == True else torch.float32,
+            dtype=torch.float16 if is_half else torch.float32,
        ).to(device)

    return bert


-from text import chinese
-
-
 def get_phones_and_bert(text, language, version, final=False):
-    text = re.sub(r' {2,}', ' ', text)
+    text = re.sub(r" {2,}", " ", text)
    textlist = []
    langlist = []
    if language == "all_zh":
-        for tmp in LangSegmenter.getTexts(text,"zh"):
+        for tmp in LangSegmenter.getTexts(text, "zh"):
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "all_yue":
-        for tmp in LangSegmenter.getTexts(text,"zh"):
+        for tmp in LangSegmenter.getTexts(text, "zh"):
            if tmp["lang"] == "zh":
                tmp["lang"] = "yue"
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "all_ja":
-        for tmp in LangSegmenter.getTexts(text,"ja"):
+        for tmp in LangSegmenter.getTexts(text, "ja"):
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "all_ko":
-        for tmp in LangSegmenter.getTexts(text,"ko"):
+        for tmp in LangSegmenter.getTexts(text, "ko"):
            langlist.append(tmp["lang"])
            textlist.append(tmp["text"])
    elif language == "en":
@ -606,7 +611,7 @@ def get_phones_and_bert(text, language, version, final=False):
    if not final and len(phones) < 6:
        return get_phones_and_bert("." + text, language, version, final=True)

-    return phones, bert.to(torch.float16 if is_half == True else torch.float32), norm_text
+    return phones, bert.to(torch.float16 if is_half else torch.float32), norm_text


 class DictToAttrRecursive(dict):
@ -627,7 +632,7 @@ class DictToAttrRecursive(dict):
    def __setattr__(self, key, value):
        if isinstance(value, dict):
            value = DictToAttrRecursive(value)
-        super(DictToAttrRecursive, self).__setitem__(key, value)
+        super().__setitem__(key, value)
        super().__setattr__(key, value)

    def __delattr__(self, item):
@ -662,7 +667,7 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
        center=False,
    )
    spec = spec.to(dtype)
-    if is_v2pro == True:
+    if is_v2pro:
        audio = resample(audio, sr1, 16000, device).to(dtype)
    return spec, audio

@ -715,11 +720,11 @@ def pack_ogg(audio_bytes, data, rate):
        pack_ogg_thread.join()
    except RuntimeError as e:
        # If changing the thread stack size is unsupported, a RuntimeError is raised.
-        print("RuntimeError: {}".format(e))
+        print(f"RuntimeError: {e}")
        print("Changing the thread stack size is unsupported.")
    except ValueError as e:
        # If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
-        print("ValueError: {}".format(e))
+        print(f"ValueError: {e}")
        print("The specified stack size is invalid.")

    return audio_bytes
@ -794,7 +799,7 @@ def cut_text(text, punc):
        punds = r"[" + "".join(punc_list) + r"]"
        text = text.strip("\n")
        items = re.split(f"({punds})", text)
-        mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
+        mergeitems = ["".join(group) for group in zip(items[::2], items[1::2], strict=False)]
        # 在句子不存在符号或句尾无符号的时候保证文本完整
        if len(items) % 2 == 1:
            mergeitems.append(items[-1])
@ -861,18 +866,18 @@ def get_tts_wav(
    if if_sr and version != "v3":
        if_sr = False

-    t0 = ttime()
+    ttime()
    prompt_text = prompt_text.strip("\n")
    if prompt_text[-1] not in splits:
        prompt_text += "。" if prompt_language != "en" else "."
    prompt_language, text = prompt_language, text.strip("\n")
-    dtype = torch.float16 if is_half == True else torch.float32
-    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half == True else np.float32)
+    dtype = torch.float16 if is_half else torch.float32
+    zero_wav = np.zeros(int(hps.data.sampling_rate * 0.3), dtype=np.float16 if is_half else np.float32)
    with torch.no_grad():
        wav16k, sr = librosa.load(ref_wav_path, sr=16000)
        wav16k = torch.from_numpy(wav16k)
        zero_wav_torch = torch.from_numpy(zero_wav)
-        if is_half == True:
+        if is_half:
            wav16k = wav16k.half().to(device)
            zero_wav_torch = zero_wav_torch.half().to(device)
        else:
@ -889,7 +894,7 @@ def get_tts_wav(
            refers = []
            if is_v2pro:
                sv_emb = []
-                if sv_cn_model == None:
+                if sv_cn_model is None:
                    init_sv_cn()
            if inp_refs:
                for path in inp_refs:
@ -908,7 +913,7 @@ def get_tts_wav(
        else:
            refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)

-    t1 = ttime()
+    ttime()
    # os.environ['version'] = version
    prompt_language = dict_language[prompt_language.lower()]
    text_language = dict_language[text_language.lower()]
@ -930,7 +935,7 @@ def get_tts_wav(
        all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
        bert = bert.to(device).unsqueeze(0)
        all_phoneme_len = torch.tensor([all_phoneme_ids.shape[-1]]).to(device)
-        t2 = ttime()
+        ttime()
        with torch.no_grad():
            pred_semantic, idx = t2s_model.model.infer_panel(
                all_phoneme_ids,
@ -944,7 +949,7 @@ def get_tts_wav(
                early_stop_num=hz * max_sec,
            )
            pred_semantic = pred_semantic[:, -idx:].unsqueeze(0)
-        t3 = ttime()
+        ttime()

        if version not in {"v3", "v4"}:
            if is_v2pro:
@ -1014,10 +1019,10 @@ def get_tts_wav(
            cfm_res = torch.cat(cfm_resss, 2)
            cfm_res = denorm_spec(cfm_res)
            if version == "v3":
-                if bigvgan_model == None:
+                if bigvgan_model is None:
                    init_bigvgan()
            else:  # v4
-                if hifigan_model == None:
+                if hifigan_model is None:
                    init_hifigan()
            vocoder_model = bigvgan_model if version == "v3" else hifigan_model
            with torch.inference_mode():
@ -1030,7 +1035,7 @@ def get_tts_wav(
        audio_opt.append(audio)
        audio_opt.append(zero_wav)
        audio_opt = np.concatenate(audio_opt, 0)
-        t4 = ttime()
+        ttime()

        if version in {"v1", "v2", "v2Pro", "v2ProPlus"}:
            sr = 32000
@ -1128,7 +1133,7 @@ def handle(
        if not default_refer.is_ready():
            return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)

-    if cut_punc == None:
+    if cut_punc is None:
        text = cut_text(text, default_cut_punc)
    else:
        text = cut_text(text, cut_punc)
--- a/api_v2.py
+++ b/api_v2.py
@ -104,27 +104,31 @@ RESP:
 import os
 import sys
 import traceback
-from typing import Generator, Union
+from collections.abc import Generator
+

 now_dir = os.getcwd()
 sys.path.append(now_dir)
-sys.path.append("%s/GPT_SoVITS" % (now_dir))
+sys.path.append(f"{now_dir}/GPT_SoVITS")

 import argparse
-import subprocess
-import wave
 import signal
+import subprocess
+import threading
+import wave
+from io import BytesIO
+
 import numpy as np
 import soundfile as sf
-from fastapi import FastAPI, Response
-from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn
-from io import BytesIO
-from tools.i18n.i18n import I18nAuto
-from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
-from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
+from fastapi import FastAPI, Response
+from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel
-import threading
+
+from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
+from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+from gsv_tools.i18n.i18n import I18nAuto
+

 # print(sys.path)
 i18n = I18nAuto()
@ -169,7 +173,7 @@ class TTS_Request(BaseModel):
    fragment_interval: float = 0.3
    seed: int = -1
    media_type: str = "wav"
-    streaming_mode: Union[bool, int] = False
+    streaming_mode: bool | int = False
    parallel_infer: bool = True
    repetition_penalty: float = 1.35
    sample_steps: int = 32
@ -199,8 +203,6 @@ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
        with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
            audio_file.write(data)

-
-
    # See: https://docs.python.org/3/library/threading.html
    # The stack size of this thread is at least 32768
    # If stack overflow error still occurs, just modify the `stack_size`.
@ -214,11 +216,11 @@ def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
        pack_ogg_thread.join()
    except RuntimeError as e:
        # If changing the thread stack size is unsupported, a RuntimeError is raised.
-        print("RuntimeError: {}".format(e))
+        print(f"RuntimeError: {e}")
        print("Changing the thread stack size is unsupported.")
    except ValueError as e:
        # If the specified stack size is invalid, a ValueError is raised and the stack size is unmodified.
-        print("ValueError: {}".format(e))
+        print(f"ValueError: {e}")
        print("The specified stack size is invalid.")

    return io_buffer
@ -306,7 +308,7 @@ def check_params(req: dict):
    text: str = req.get("text", "")
    text_lang: str = req.get("text_lang", "")
    ref_audio_path: str = req.get("ref_audio_path", "")
-    streaming_mode: bool = req.get("streaming_mode", False)
+    req.get("streaming_mode", False)
    media_type: str = req.get("media_type", "wav")
    prompt_lang: str = req.get("prompt_lang", "")
    text_split_method: str = req.get("text_split_method", "cut5")
@ -384,7 +386,7 @@ async def tts_handle(req: dict):
    check_res = check_params(req)
    if check_res is not None:
        return check_res
-    
+
    if streaming_mode == 0:
        streaming_mode = False
        return_fragment = False
@ -403,7 +405,10 @@ async def tts_handle(req: dict):
        fixed_length_chunk = True

    else:
-        return JSONResponse(status_code=400, content={"message": f"the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"})
+        return JSONResponse(
+            status_code=400,
+            content={"message": "the value of streaming_mode must be 0, 1, 2, 3(int) or true/false(bool)"},
+        )

    req["streaming_mode"] = streaming_mode
    req["return_fragment"] = return_fragment
@ -413,7 +418,6 @@ async def tts_handle(req: dict):

    streaming_mode = streaming_mode or return_fragment

-
    try:
        tts_generator = tts_pipeline.run(req)

@ -475,7 +479,7 @@ async def tts_get_endpoint(
    repetition_penalty: float = 1.35,
    sample_steps: int = 32,
    super_sampling: bool = False,
-    streaming_mode: Union[bool, int] = False,
+    streaming_mode: bool | int = False,
    overlap_length: int = 2,
    min_chunk_length: int = 16,
 ):
--- a/config.py
+++ b/config.py
@ -4,7 +4,8 @@ import sys

 import torch

-from tools.i18n.i18n import I18nAuto
+from gsv_tools.i18n.i18n import I18nAuto
+

 i18n = I18nAuto(language=os.environ.get("language", "Auto"))

@ -77,7 +78,7 @@ GPT_weight_version2root = {

 def custom_sort_key(s):
    # 使用正则表达式提取字符串中的数字部分和非数字部分
-    parts = re.split("(\d+)", s)
+    parts = re.split(r"(\d+)", s)
    # 将数字部分转换为整数，非数字部分保持不变
    parts = [int(part) if part.isdigit() else part for part in parts]
    return parts
@ -93,7 +94,7 @@ def get_weights_names():
            continue
        for name in os.listdir(path):
            if name.endswith(".pth"):
-                SoVITS_names.append("%s/%s" % (path, name))
+                SoVITS_names.append(f"{path}/{name}")
    if not SoVITS_names:
        SoVITS_names = [""]
    GPT_names = []
@ -105,7 +106,7 @@ def get_weights_names():
            continue
        for name in os.listdir(path):
            if name.endswith(".ckpt"):
-                GPT_names.append("%s/%s" % (path, name))
+                GPT_names.append(f"{path}/{name}")
    SoVITS_names = sorted(SoVITS_names, key=custom_sort_key)
    GPT_names = sorted(GPT_names, key=custom_sort_key)
    if not GPT_names:
@ -161,7 +162,7 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
    is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
    if mem_gb < 4 or sm_version < 5.3:
        return cpu, torch.float32, 0.0, 0.0
-    if sm_version == 6.1 or is_16_series == True:
+    if sm_version == 6.1 or is_16_series:
        return cuda, torch.float32, sm_version, mem_gb
    if sm_version > 6.1:
        return cuda, torch.float16, sm_version, mem_gb
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@ -30,8 +30,8 @@ services:
      - "9880:9880"
    volumes:
      - .:/workspace/GPT-SoVITS
-      - tools/asr/models:/workspace/models/asr_models
-      - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
+      - gsv_tools/asr/models:/workspace/models/asr_models
+      - gsv_tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
    environment:
      - is_half=true
    tty: true
@ -68,8 +68,8 @@ services:
      - "9880:9880"
    volumes:
      - .:/workspace/GPT-SoVITS
-      - tools/asr/models:/workspace/models/asr_models
-      - tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
+      - gsv_tools/asr/models:/workspace/models/asr_models
+      - gsv_tools/uvr5/uvr5_weights:/workspace/models/uvr5_weights
    environment:
      - is_half=true
    tty: true
--- a/docker_build.sh
+++ b/docker_build.sh
@ -26,7 +26,7 @@ print_help() {
    echo "  -h, --help          Show this help message and exit"
    echo ""
    echo "Examples:"
-    echo "  bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
+    echo "  bash docker_build.sh --cuda 12.6 --lite"
 }

 # Show help if no arguments provided
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@ -409,7 +409,7 @@

 - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
  - 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
-  - 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
+  - 类型: 新功能 (特性参阅 [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
  - 提交: RVC-Boss
 - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
  - 内容: 更新项目多语言文档.
@ -622,5 +622,3 @@
  - 内容: 提升推理进程优先级（修复win11下可能GPU利用率受限的问题）
  - 类型: 修复
  - 提交: XianYue0125
-
-
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@ -1,3 +1,5 @@
+#
+
 <div align="center">

 <h1>GPT-SoVITS-WebUI</h1>
@ -19,15 +21,18 @@
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)

-
 [**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)

 </div>

 ---

+<div align="center">
+
 ## 功能

+</div>
+
 1. **零样本文本到语音 (TTS):** 输入 5 秒的声音样本, 即刻体验文本到语音转换.

 2. **少样本 TTS:** 仅需 1 分钟的训练数据即可微调模型, 提升声音相似度和真实感.
@ -44,40 +49,49 @@

 **用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**

+<div align="center">
+
 ## 安装

 中国地区的用户可[点击此处](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official)使用 AutoDL 云端镜像进行体验.

 ### 测试通过的环境

-| Python Version | PyTorch Version  | Device        |
-| -------------- | ---------------- | ------------- |
-| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
-| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
-| Python 3.9     | PyTorch 2.2.2    | CPU           |
+| Python Version | PyTorch Version | Device        |
+| -------------- | --------------- | ------------- |
+| Python 3.10    | PyTorch 2.8.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.9.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.7.0   | CUDA 12.8     |
+| Python 3.11    | PyTorch 2.10.0  | CUDA 12.8     |
+| Python 3.10    | PyTorch 2.8.0   | Apple silicon |
+| Python 3.11    | PyTorch 2.9.0   | Apple silicon |
+| Python 3.12    | PyTorch 2.10.0  | Apple silicon |
+| Python 3.10    | PyTorch 2.9.0   | CPU           |
+
+</div>

 ### Windows

-如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.
+如果你是 Windows 用户 (已在 win>=10 上测试), 可以下载[整合包](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main), 解压后双击 go-webui.bat 即可启动 GPT-SoVITS-WebUI.

 **中国地区的用户可以[在此处下载整合包](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#KTvnO).**

+运行以下的命令来安装本项目:
+
 ```pwsh
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+pwsh -F install.ps1 --help
 ```

 ### Linux

+运行以下的命令来安装本项目:
+
 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### macOS
@ -87,54 +101,51 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
 运行以下的命令来安装本项目:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### 手动安装

-#### 安装依赖
+运行以下的命令来安装本项目:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+conda install uv ffmpeg -c conda-forge

-pip install -r extra-req.txt --no-deps
-pip install -r requirements.txt
+uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
+uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
+uv pip install ".[flash-attn]"
 ```

-#### 安装 FFmpeg
+### 预训练模型

-##### Conda 用户
+**若成功运行`install.sh`可跳过 No.1,2,3**

-```bash
-conda activate GPTSoVits
-conda install ffmpeg
-```
+**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**

-##### Ubuntu/Debian 用户
+1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.

-```bash
-sudo apt install ffmpeg
-sudo apt install libsox-dev
-```
+2. 从 [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)

-##### Windows 用户
+3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `gsv_tools/uvr5/uvr5_weights` 目录中.
+   - 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `gsv_tools/uvr5/uvr5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.

-下载并将 [ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) 和 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) 放置在 GPT-SoVITS 根目录下
+   - 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_band_roformer`、`bs_roformer`.如果未指定, 将从配置文件中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.

-安装 [Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 环境
+4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `gsv_tools/asr/models` 目录中.

-##### MacOS 用户
+5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `gsv_tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.

-```bash
-brew install ffmpeg
-```
+<div align="center">

-### 运行 GPT-SoVITS (使用 Docker)
+## 运行 GPT-SoVITS (使用 Docker)

-#### Docker 镜像选择
+</div>
+
+### Docker 镜像选择

 由于代码库更新频繁, 而 Docker 镜像的发布周期相对较慢, 请注意：

@ -145,15 +156,15 @@ brew install ffmpeg
 - Docker Compose 将会挂载当前目录的**所有文件**, 请在使用 Docker 镜像前先切换到项目根目录并**拉取代码更新**
 - 可选：为了获得最新的更改, 你可以使用提供的 Dockerfile 在本地构建镜像

-#### 环境变量
+### 环境变量

 - `is_half`：控制是否启用半精度(fp16). 如果你的 GPU 支持, 设置为 `true` 可以减少显存占用

-#### 共享内存配置
+### 共享内存配置

 在 Windows (Docker Desktop) 中, 默认共享内存大小较小, 可能导致运行异常. 请在 Docker Compose 文件中根据系统内存情况, 增大 `shm_size` (例如设置为 `16g`)

-#### 选择服务
+### 选择服务

 `docker-compose.yaml` 文件定义了两个主要服务类型：

@ -166,7 +177,7 @@ brew install ffmpeg
 docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
 ```

-#### 本地构建 Docker 镜像
+### 本地构建 Docker 镜像

 如果你希望自行构建镜像, 请使用以下命令：

@ -174,7 +185,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
 bash docker_build.sh --cuda <12.6|12.8> [--lite]
 ```

-#### 访问运行中的容器 (Bash Shell)
+### 访问运行中的容器 (Bash Shell)

 当容器在后台运行时, 你可以通过以下命令进入容器：

@ -182,32 +193,18 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
 docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
 ```

-## 预训练模型
-
-**若成功运行`install.sh`可跳过 No.1,2,3**
-
-**中国地区的用户可以[在此处下载这些模型](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e/dkxgpiy9zb96hob4#nVNhX).**
-
-1. 从 [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 下载预训练模型, 并将其放置在 `GPT_SoVITS/pretrained_models` 目录中.
-
-2. 从 [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 下载模型, 解压并重命名为 `G2PWModel`, 然后将其放置在 `GPT_SoVITS/text` 目录中. (仅限中文 TTS)
-
-3. 对于 UVR5 (人声/伴奏分离和混响移除, 额外功能), 从 [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 下载模型, 并将其放置在 `tools/uvr5/uvr5_weights` 目录中.
-
-   - 如果你在 UVR5 中使用 `bs_roformer` 或 `mel_band_roformer`模型, 你可以手动下载模型和相应的配置文件, 并将它们放在 `tools/UVR5/UVR5_weights` 中.**重命名模型文件和配置文件, 确保除后缀外**, 模型和配置文件具有相同且对应的名称.此外, 模型和配置文件名**必须包含"roformer"**, 才能被识别为 roformer 类的模型.
-
-   - 建议在模型名称和配置文件名中**直接指定模型类型**, 例如`mel_mand_roformer`、`bs_roformer`.如果未指定, 将从配置文中比对特征, 以确定它是哪种类型的模型.例如, 模型`bs_roformer_ep_368_sdr_12.9628.ckpt` 和对应的配置文件`bs_roformer_ep_368_sdr_12.9628.yaml` 是一对.`kim_mel_band_roformer.ckpt` 和 `kim_mel_band_roformer.yaml` 也是一对.
-
-4. 对于中文 ASR (额外功能), 从 [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 和 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 下载模型, 并将它们放置在 `tools/asr/models` 目录中.
-
-5. 对于英语或日语 ASR (额外功能), 从 [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 下载模型, 并将其放置在 `tools/asr/models` 目录中.此外, [其他模型](https://huggingface.co/Systran) 可能具有类似效果且占用更少的磁盘空间.
+<div align="center">

 ## 数据集格式

-文本到语音 (TTS) 注释 .list 文件格式:
+</div>
+
+文本到语音 (TTS) 注释 `.list` 文件格式:
+
+```text

-```
 vocal_path|speaker_name|language|text
+
 ```

 语言字典:
@ -220,18 +217,23 @@ vocal_path|speaker_name|language|text

 示例:

-```
+```text
+
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
+
 ```

+<div align="center">
+
 ## 微调与推理

+</div>
+
 ### 打开 WebUI

 #### 整合包用户

 双击`go-webui.bat`或者使用`go-webui.ps1`
-若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`

 #### 其他

@ -239,14 +241,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
 python webui.py <language(optional)>
 ```

-若想使用 V1,则
-
-```bash
-python webui.py v1 <language(optional)>
-```
-
-或者在 webUI 内动态切换
-
 ### 微调

 #### 现已支持自动填充路径
@ -278,8 +272,12 @@ python webui.py

 然后在 `1-GPT-SoVITS-TTS/1C-推理` 中打开推理 webUI

+<div align="center">
+
 ## V2 发布说明

+</div>
+
 新特性:

 1. 支持韩语及粤语
@ -292,18 +290,12 @@ python webui.py

   详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-从 v1 环境迁移至 v2
-
-1. 需要 pip 安装 requirements.txt 更新环境
-
-2. 需要克隆 github 上的最新代码
-
-3. 需要从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) 下载预训练模型文件放到 GPT_SoVITS/pretrained_models/gsv-v2final-pretrained 下
-
-   中文额外需要下载[G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (下载 G2PW 模型,解压并重命名为`G2PWModel`,将其放到`GPT_SoVITS/text`目录下)
+<div align="center">

 ## V3 更新说明

+</div>
+
 新模型特点:

 1. 音色相似度更像, 需要更少训练集来逼近本人 (不训练直接使用底模模式下音色相似性提升更大)
@ -312,33 +304,23 @@ python webui.py

   详见[wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-从 v2 环境迁移至 v3
-
-1. 需要 pip 安装 requirements.txt 更新环境
-
-2. 需要克隆 github 上的最新代码
-
-3. 从[huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)下载这些 v3 新增预训练模型 (s1v3.ckpt, s2Gv3.pth and models--nvidia--bigvgan_v2_24khz_100band_256x folder)将他们放到`GPT_SoVITS/pretrained_models`目录下
-
-   如果想用音频超分功能缓解 v3 模型生成 24k 音频觉得闷的问题, 需要下载额外的模型参数, 参考[how to download](../../tools/AP_BWE_main/24kto48k/readme.txt)
+<div align="center">

 ## V4 更新说明

+</div>
+
 新特性：

 1. **V4 版本修复了 V3 版本中由于非整数倍上采样导致的金属音问题, 并原生输出 48kHz 音频以避免声音闷糊 (而 V3 版本仅原生输出 24kHz 音频)**. 作者认为 V4 是对 V3 的直接替代, 但仍需进一步测试.
   [更多详情](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-从 V1/V2/V3 环境迁移至 V4：
-
-1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
-
-2. 从 GitHub 克隆最新代码.
-
-3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V4 预训练模型 (`gsv-v4-pretrained/s2v4.ckpt` 和 `gsv-v4-pretrained/vocoder.pth`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
+<div align="center">

 ## V2Pro 更新说明

+</div>
+
 新特性：

 1. **相比 V2 占用稍高显存, 性能超过 V4, 在保留 V2 硬件成本和推理速度优势的同时实现更高音质.**
@ -346,77 +328,35 @@ python webui.py

 2. V1/V2 与 V2Pro 系列具有相同特性, V3/V4 则具备相近功能. 对于平均音频质量较低的训练集, V1/V2/V2Pro 可以取得较好的效果, 但 V3/V4 无法做到. 此外, V3/V4 合成的声音更偏向参考音频, 而不是整体训练集的风格.

-从 V1/V2/V3/V4 环境迁移至 V2Pro：
-
-1. 执行 `pip install -r requirements.txt` 更新部分依赖包.
-
-2. 从 GitHub 克隆最新代码.
-
-3. 从 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) 下载 V2Pro 预训练模型 (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 和 `sv/pretrained_eres2netv2w24s4ep4.ckpt`), 并放入 `GPT_SoVITS/pretrained_models` 目录.
+<div align="center">

 ## 待办事项清单

- [x] **高优先级:**
+</div>

-  - [x] 日语和英语的本地化.
-  - [x] 用户指南.
-  - [x] 日语和英语数据集微调训练.
+- [x] **高优先级:**
+  - [x] 日语和英语的本地化
+  - [x] 用户指南
+  - [x] 日语和英语数据集微调训练

 - [ ] **功能:**
  - [x] 零样本声音转换 (5 秒) / 少样本声音转换 (1 分钟).
-  - [x] TTS 语速控制.
-  - [ ] ~~增强的 TTS 情感控制.~~
-  - [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布.
-  - [x] 改进英语和日语文本前端.
-  - [ ] 开发体积小和更大的 TTS 模型.
-  - [x] Colab 脚本.
-  - [x] 扩展训练数据集 (从 2k 小时到 10k 小时).
-  - [x] 更好的 sovits 基础模型 (增强的音频质量).
-  - [ ] 模型混合.
+  - [x] TTS 语速控制
+  - [ ] ~~增强的 TTS 情感控制~~
+  - [ ] 尝试将 SoVITS 令牌输入更改为词汇的概率分布
+  - [x] 改进英语和日语文本前端
+  - [ ] 开发体积小和更大的 TTS 模型
+  - [x] Colab 脚本
+  - [x] 扩展训练数据集 (从 2k 小时到 10k 小时)
+  - [x] 更好的 sovits 基础模型 (增强的音频质量)
+  - [ ] 模型混合

-## (附加) 命令行运行方式
-
-使用命令行打开 UVR5 的 WebUI
-
-```bash
-python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-```
-
-<!-- 如果打不开浏览器, 请按照下面的格式进行UVR处理, 这是使用mdxnet进行音频处理的方式
-````
-python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
-```` -->
-
-这是使用命令行完成数据集的音频切分的方式
-
-```bash
-python audio_slicer.py \
-    --input_path "<path_to_original_audio_file_or_directory>" \
-    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
-    --threshold <volume_threshold> \
-    --min_length <minimum_duration_of_each_subclip> \
-    --min_interval <shortest_time_gap_between_adjacent_subclips>
-    --hop_size <step_size_for_computing_volume_curve>
-```
-
-这是使用命令行完成数据集 ASR 处理的方式 (仅限中文)
-
-```bash
-python tools/asr/funasr_asr.py -i <input> -o <output>
-```
-
-通过 Faster_Whisper 进行 ASR 处理 (除中文之外的 ASR 标记)
-
-(没有进度条, GPU 性能可能会导致时间延迟)
-
-```bash
-python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
-```
-
-启用自定义列表保存路径
+<div align="center">

 ## 致谢

+</div>
+
 特别感谢以下项目和贡献者:

 ### 理论研究
@ -427,7 +367,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
 - [contentvec](https://github.com/auspicious3000/contentvec/)
 - [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
+- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)

@ -459,8 +399,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p

 感谢 @Naozumi520 提供粤语训练集, 并在粤语相关知识方面给予指导.

+<div align="center">
+
 ## 感谢所有贡献者的努力

+</div>
+
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
 </a>
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@ -1,3 +1,5 @@
+#
+
 <div align="center">

 <h1>GPT-SoVITS-WebUI</h1>
@ -19,14 +21,17 @@
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)

-
 [**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)

 </div>

 ---

-## 機能:
+<div align="center">
+
+## 機能
+
+</div>

 1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます.

@ -40,34 +45,49 @@

 声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:

-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>

 **ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**

+<div align="center">
+
 ## インストール

 ### テスト済みの環境

-| Python Version | PyTorch Version  | Device        |
-| -------------- | ---------------- | ------------- |
-| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
-| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
-| Python 3.9     | PyTorch 2.2.2    | CPU           |
+| Python Version | PyTorch Version | Device        |
+| -------------- | --------------- | ------------- |
+| Python 3.10    | PyTorch 2.8.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.9.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.7.0   | CUDA 12.8     |
+| Python 3.11    | PyTorch 2.10.0  | CUDA 12.8     |
+| Python 3.10    | PyTorch 2.8.0   | Apple silicon |
+| Python 3.11    | PyTorch 2.9.0   | Apple silicon |
+| Python 3.12    | PyTorch 2.10.0  | Apple silicon |
+| Python 3.10    | PyTorch 2.9.0   | CPU           |
+
+</div>

 ### Windows

-Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)し、解凍後に _go-webui.bat_ をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
+Windows ユーザー: (Windows 10 以降でテスト済み)、[統合パッケージをダウンロード](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main)し、解凍後に `go-webui.bat` をダブルクリックすると、GPT-SoVITS-WebUI が起動します.
+
+以下のコマンドを実行してこのプロジェクトをインストールします:
+
+```pwsh
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+pwsh -F install.ps1 --help
+```

 ### Linux

+以下のコマンドを実行してこのプロジェクトをインストールします:
+
 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### macOS
@ -77,54 +97,49 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
 以下のコマンドを実行してこのプロジェクトをインストールします:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### 手動インストール

-#### 依存関係をインストールします
+以下のコマンドを実行してこのプロジェクトをインストールします:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+conda install uv ffmpeg -c conda-forge

-pip install -r extra-req.txt --no-deps
-pip install -r requirements.txt
+uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
+uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
+uv pip install ".[flash-attn]"
 ```

-#### FFmpeg をインストールします
+### 事前訓練済みモデル

-##### Conda ユーザー
+**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**

-```bash
-conda activate GPTSoVits
-conda install ffmpeg
-```
+1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.

-##### Ubuntu/Debian ユーザー
+2. [G2PWModel.zip (HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip (ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)

-```bash
-sudo apt install ffmpeg
-sudo apt install libsox-dev
-```
+3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`gsv_tools/uvr5/uvr5_weights` ディレクトリに配置してください.
+   - UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`gsv_tools/uvr5/uvr5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.

-##### Windows ユーザー
+   - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_band_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです.

-[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) と [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) をダウンロードし、GPT-SoVITS のルートフォルダに置きます
+4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`gsv_tools/asr/models` ディレクトリに配置してください.

-[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 環境をインストールしてください
+5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`gsv_tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.

-##### MacOS ユーザー
+<div align="center">

-```bash
-brew install ffmpeg
-```
+## GPT-SoVITS の実行 (Docker 使用)

-### GPT-SoVITS の実行 (Docker 使用)
+</div>

-#### Docker イメージの選択
+### Docker イメージの選択

 コードベースの更新が頻繁である一方、Docker イメージのリリースは比較的遅いため、以下を確認してください：

@ -135,15 +150,15 @@ brew install ffmpeg
 - Docker Compose は現在のディレクトリ内の**すべてのファイル**をマウントします. Docker イメージを使用する前に、プロジェクトのルートディレクトリに移動し、**コードを最新の状態に更新**してください
 - オプション：最新の変更を反映させるため、提供されている Dockerfile を使ってローカルでイメージをビルドすることも可能です

-#### 環境変数
+### 環境変数

 - `is_half`：半精度 (fp16) を使用するかどうかを制御します. GPU が対応している場合、`true` に設定することでメモリ使用量を削減できます

-#### 共有メモリの設定
+### 共有メモリの設定

 Windows (Docker Desktop) では、デフォルトの共有メモリサイズが小さいため、予期しない動作が発生する可能性があります. Docker Compose ファイル内の `shm_size` を (例：`16g`) に増やすことをおすすめします

-#### サービスの選択
+### サービスの選択

 `docker-compose.yaml` ファイルには次の 2 種類のサービスが定義されています：

@ -156,7 +171,7 @@ Windows (Docker Desktop) では、デフォルトの共有メモリサイズが
 docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
 ```

-#### Docker イメージのローカルビルド
+### Docker イメージのローカルビルド

 自分でイメージをビルドするには、以下のコマンドを使ってください：

@ -164,7 +179,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
 bash docker_build.sh --cuda <12.6|12.8> [--lite]
 ```

-#### 実行中のコンテナへアクセス (Bash Shell)
+### 実行中のコンテナへアクセス (Bash Shell)

 コンテナがバックグラウンドで実行されている場合、以下のコマンドでシェルにアクセスできます：

@ -172,30 +187,18 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
 docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
 ```

-## 事前訓練済みモデル
-
-**`install.sh`が正常に実行された場合、No.1,2,3 はスキップしてかまいません.**
-
-1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) から事前訓練済みモデルをダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリに配置してください.
-
-2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) からモデルをダウンロードし、解凍して `G2PWModel` にリネームし、`GPT_SoVITS/text` ディレクトリに配置してください. (中国語 TTS のみ)
-
-3. UVR5 (ボーカル/伴奏 (BGM 等) 分離 & リバーブ除去の追加機能) の場合は、[UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) からモデルをダウンロードし、`tools/uvr5/uvr5_weights` ディレクトリに配置してください.
-
-   - UVR5 で bs_roformer または mel_band_roformer モデルを使用する場合、モデルと対応する設定ファイルを手動でダウンロードし、`tools/UVR5/UVR5_weights`フォルダに配置することができます.**モデルファイルと設定ファイルの名前は、拡張子を除いて同じであることを確認してください**.さらに、モデルと設定ファイルの名前には**「roformer」が含まれている必要があります**.これにより、roformer クラスのモデルとして認識されます.
-
-   - モデル名と設定ファイル名には、**直接モデルタイプを指定することをお勧めします**.例: mel_mand_roformer、bs_roformer.指定しない場合、設定文から特徴を照合して、モデルの種類を特定します.例えば、モデル`bs_roformer_ep_368_sdr_12.9628.ckpt`と対応する設定ファイル`bs_roformer_ep_368_sdr_12.9628.yaml`はペアです.同様に、`kim_mel_band_roformer.ckpt`と`kim_mel_band_roformer.yaml`もペアです.
-
-4. 中国語 ASR (追加機能) の場合は、[Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)、[Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)、および [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.
-
-5. 英語または日本語の ASR (追加機能) を使用する場合は、[Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) からモデルをダウンロードし、`tools/asr/models` ディレクトリに配置してください.また、[他のモデル](https://huggingface.co/Systran) は、より小さいサイズで高クオリティな可能性があります.
+<div align="center">

 ## データセット形式

-TTS アノテーション .list ファイル形式:
+</div>
+
+TTS アノテーション `.list` ファイル形式:
+
+```text

-```
 vocal_path|speaker_name|language|text
+
 ```

 言語辞書:
@ -206,18 +209,23 @@ vocal_path|speaker_name|language|text

 例:

-```
+```text
+
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
+
 ```

+<div align="center">
+
 ## 微調整と推論

+</div>
+
 ### WebUI を開く

 #### 統合パッケージ利用者

 `go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します.
-V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください.

 #### その他

@ -225,14 +233,6 @@ V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
 python webui.py <言語(オプション)>
 ```

-V1 に切り替えたい場合は
-
-```bash
-python webui.py v1 <言語(オプション)>
-```
-
-または WebUI で手動でバージョンを切り替えてください.
-
 ### 微調整

 #### パス自動補完のサポート
@ -248,7 +248,7 @@ python webui.py v1 <言語(オプション)>

 #### 統合パッケージ利用者

-`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
+`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.

 #### その他

@ -264,8 +264,12 @@ python webui.py

 その後、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.

+<div align="center">
+
 ## V2 リリースノート

+</div>
+
 新機能:

 1. 韓国語と広東語をサポート
@ -278,18 +282,12 @@ python webui.py

   [詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V1 環境から V2 を使用するには:
-
-1. `pip install -r requirements.txt`を使用していくつかのパッケージを更新
-
-2. 最新のコードを github からクローン
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)から V2 の事前学習モデルをダウンロードし、それらを`GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`に配置
-
-   中国語 V2 追加: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW モデルをダウンロードし、解凍して`G2PWModel`にリネームし、`GPT_SoVITS/text`に配置します)
+<div align="center">

 ## V3 リリースノート

+</div>
+
 新機能:

 1. 音色の類似性が向上し、ターゲットスピーカーを近似するために必要な学習データが少なくなりました (音色の類似性は、ファインチューニングなしでベースモデルを直接使用することで顕著に改善されます).
@ -298,33 +296,23 @@ V1 環境から V2 を使用するには:

   [詳細情報はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-v2 環境から v3 を使用する方法:
-
-1. `pip install -r requirements.txt` を実行して、いくつかのパッケージを更新します.
-
-2. GitHub から最新のコードをクローンします.
-
-3. v3 の事前学習済みモデル (s1v3.ckpt、s2Gv3.pth、models--nvidia--bigvgan_v2_24khz_100band_256x フォルダ) を[Huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) からダウンロードし、GPT_SoVITS/pretrained_models フォルダに配置します.
-
-   追加: 音声超解像モデルについては、[ダウンロード方法](../../tools/AP_BWE_main/24kto48k/readme.txt)を参照してください.
+<div align="center">

 ## V4 リリースノート

+</div>
+
 新機能:

 1. **V4 は、V3 で発生していた非整数倍アップサンプリングによる金属音の問題を修正し、音声がこもる問題を防ぐためにネイティブに 48kHz 音声を出力します（V3 はネイティブに 24kHz 音声のみ出力）**. 作者は V4 を V3 の直接的な置き換えとして推奨していますが、さらなるテストが必要です.
   [詳細はこちら](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V1/V2/V3 環境から V4 への移行方法:
-
-1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
-
-2. GitHub から最新のコードをクローンします.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V4 の事前学習済みモデル (`gsv-v4-pretrained/s2v4.ckpt` および `gsv-v4-pretrained/vocoder.pth`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
+<div align="center">

 ## V2Pro リリースノート

+</div>
+
 新機能:

 1. **V2 と比較してやや高いメモリ使用量ですが、ハードウェアコストと推論速度は維持しつつ、V4 よりも高い性能と音質を実現します. **
@ -332,18 +320,13 @@ V1/V2/V3 環境から V4 への移行方法:

 2. V1/V2 と V2Pro シリーズは類似した特徴を持ち、V3/V4 も同様の機能を持っています. 平均音質が低いトレーニングセットの場合、V1/V2/V2Pro は良好な結果を出すことができますが、V3/V4 では対応できません. また、V3/V4 の合成音声はトレーニング全体ではなく、より参考音声に寄った音質になります.

-V1/V2/V3/V4 環境から V2Pro への移行方法:
-
-1. `pip install -r requirements.txt` を実行して一部の依存パッケージを更新してください.
-
-2. GitHub から最新のコードをクローンします.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) から V2Pro の事前学習済みモデル (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, および `sv/pretrained_eres2netv2w24s4ep4.ckpt`) をダウンロードし、`GPT_SoVITS/pretrained_models` ディレクトリへ配置してください.
+<div align="center">

 ## Todo リスト

- [x] **優先度 高:**
+</div>

+- [x] **優先度 高:**
  - [x] 日本語と英語でのローカライズ.
  - [x] ユーザーガイド.
  - [x] 日本語データセットと英語データセットのファインチューニングトレーニング.
@ -356,53 +339,16 @@ V1/V2/V3/V4 環境から V2Pro への移行方法:
  - [x] 英語と日本語のテキストフロントエンドを改善.
  - [ ] 小型と大型の TTS モデルを開発する.
  - [x] Colab のスクリプト.
-  - [ ] トレーニングデータセットを拡張する (2k→10k).
+  - [x] トレーニングデータセットを拡張する (2k→10k).
  - [x] より良い sovits ベースモデル (音質向上)
  - [ ] モデルミックス

-## (追加の) コマンドラインから実行する方法
-
-コマンド ラインを使用して UVR5 の WebUI を開きます
-
-```bash
-python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-```
-
-<!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください.これはオーディオ処理に mdxnet を使用しています.
-```
-python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
-``` -->
-
-コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
-
-```bash
-python audio_slicer.py \
-    --input_path "<path_to_original_audio_file_or_directory>" \
-    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
-    --threshold <volume_threshold> \
-    --min_length <minimum_duration_of_each_subclip> \
-    --min_interval <shortest_time_gap_between_adjacent_subclips>
-    --hop_size <step_size_for_computing_volume_curve>
-```
-
-コマンドラインを使用してデータセット ASR 処理を行う方法です (中国語のみ)
-
-```bash
-python tools/asr/funasr_asr.py -i <input> -o <output>
-```
-
-ASR 処理は Faster_Whisper を通じて実行されます(中国語を除く ASR マーキング)
-
-(進行状況バーは表示されません.GPU のパフォーマンスにより時間遅延が発生する可能性があります)
-
-```bash
-python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
-```
-
-カスタムリストの保存パスが有効になっています
+<div align="center">

 ## クレジット

+</div>
+
 特に以下のプロジェクトと貢献者に感謝します:

 ### 理論研究
@ -413,7 +359,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
 - [contentvec](https://github.com/auspicious3000/contentvec/)
 - [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
+- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)

@ -445,8 +391,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p

@Naozumi520 さん、広東語のトレーニングセットの提供と、広東語に関する知識のご指導をいただき、感謝申し上げます.

+<div align="center">
+
 ## すべてのコントリビューターに感謝します

+</div>
+
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
 </a>
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@ -1,3 +1,5 @@
+#
+
 <div align="center">

 <h1>GPT-SoVITS-WebUI</h1>
@ -19,14 +21,17 @@
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 [![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)

-
 [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)

 </div>

 ---

-## 기능:
+<div align="center">
+
+## 기능
+
+</div>

 1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다.

@ -40,97 +45,101 @@

 보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모:

-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>

 **사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**

+<div align="center">
+
 ## 설치

 ### 테스트 통과 환경

-| Python Version | PyTorch Version  | Device        |
-| -------------- | ---------------- | ------------- |
-| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
-| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
-| Python 3.9     | PyTorch 2.2.2    | CPU           |
+| Python Version | PyTorch Version | Device        |
+| -------------- | --------------- | ------------- |
+| Python 3.10    | PyTorch 2.8.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.9.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.7.0   | CUDA 12.8     |
+| Python 3.11    | PyTorch 2.10.0  | CUDA 12.8     |
+| Python 3.10    | PyTorch 2.8.0   | Apple silicon |
+| Python 3.11    | PyTorch 2.9.0   | Apple silicon |
+| Python 3.12    | PyTorch 2.10.0  | Apple silicon |
+| Python 3.10    | PyTorch 2.9.0   | CPU           |
+
+</div>

 ### Windows

-Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true)한 후 압축을 풀고 _go-webui.bat_ 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
+Windows 사용자라면 (win>=10에서 테스트됨), [통합 패키지를 다운로드](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main)한 후 압축을 풀고 `go-webui.bat` 파일을 더블 클릭하면 GPT-SoVITS-WebUI를 시작할 수 있습니다.
+
+다음 명령어를 실행하여 이 프로젝트를 설치하세요:

 ```pwsh
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+pwsh -F install.ps1 --help
 ```

 ### Linux

+다음 명령어를 실행하여 이 프로젝트를 설치하세요:
+
 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### macOS

 **주의: Mac에서 GPU로 훈련된 모델은 다른 OS에서 훈련된 모델에 비해 품질이 낮습니다. 해당 문제를 해결하기 전까지 MacOS에선 CPU를 사용하여 훈련을 진행합니다.**

-다음 명령어를 실행하여 이 프로젝트를 설치하세요
+다음 명령어를 실행하여 이 프로젝트를 설치하세요:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### 수동 설치

-#### 의존성 설치
+다음 명령어를 실행하여 이 프로젝트를 설치하세요:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+conda install uv ffmpeg -c conda-forge

-pip install -r extra-req.txt --no-deps
-pip install -r requirements.txt
+uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
+uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
+uv pip install ".[flash-attn]"
 ```

-#### FFmpeg 설치
+### 사전 학습된 모델

-##### Conda 사용자
+**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**

-```bash
-conda activate GPTSoVits
-conda install ffmpeg
-```
+1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.

-##### Ubuntu/Debian 사용자
+2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)

-```bash
-sudo apt install ffmpeg
-sudo apt install libsox-dev
-```
+3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `gsv_tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
+   - UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `gsv_tools/uvr5/uvr5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다.

-##### Windows 사용자
+   - 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_band_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.

-[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe)와 [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe)를 GPT-SoVITS root 디렉토리에 넣습니다
+4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `gsv_tools/asr/models` 디렉토리에 배치하세요.

-[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) 설치
+5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `gsv_tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.

-##### MacOS 사용자
+<div align="center">

-```bash
-brew install ffmpeg
-```
+## GPT-SoVITS 실행하기 (Docker 사용)

-### GPT-SoVITS 실행하기 (Docker 사용)
+</div>

-#### Docker 이미지 선택
+### Docker 이미지 선택

 코드베이스가 빠르게 업데이트되는 반면 Docker 이미지 릴리스 주기는 느리기 때문에 다음을 참고하세요:

@ -141,15 +150,15 @@ brew install ffmpeg
 - Docker Compose는 현재 디렉터리의 **모든 파일**을 마운트합니다. Docker 이미지를 사용하기 전에 프로젝트 루트 디렉터리로 이동하여 코드를 **최신 상태로 업데이트**하세요
 - 선택 사항: 최신 변경사항을 반영하려면 제공된 Dockerfile을 사용하여 로컬에서 직접 이미지를 빌드할 수 있습니다

-#### 환경 변수
+### 환경 변수

 - `is_half`: 반정밀도(fp16) 사용 여부를 제어합니다. GPU가 지원하는 경우 `true`로 설정하면 메모리 사용량을 줄일 수 있습니다

-#### 공유 메모리 설정
+### 공유 메모리 설정

 Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기치 않은 동작이 발생할 수 있습니다. 시스템 메모리 상황에 따라 Docker Compose 파일에서 `shm_size`를 (예: `16g`)로 증가시키는 것이 좋습니다

-#### 서비스 선택
+### 서비스 선택

 `docker-compose.yaml` 파일에는 두 가지 서비스 유형이 정의되어 있습니다:

@ -162,7 +171,7 @@ Windows(Docker Desktop)에서는 기본 공유 메모리 크기가 작아 예기
 docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
 ```

-#### Docker 이미지 직접 빌드하기
+### Docker 이미지 직접 빌드하기

 직접 이미지를 빌드하려면 다음 명령어를 사용하세요:

@ -170,7 +179,7 @@ docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|
 bash docker_build.sh --cuda <12.6|12.8> [--lite]
 ```

-#### 실행 중인 컨테이너 접속하기 (Bash Shell)
+### 실행 중인 컨테이너 접속하기 (Bash Shell)

 컨테이너가 백그라운드에서 실행 중일 때 다음 명령어로 셸에 접속할 수 있습니다:

@ -178,30 +187,14 @@ bash docker_build.sh --cuda <12.6|12.8> [--lite]
 docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
 ```

-## 사전 학습된 모델
-
-**`install.sh`가 성공적으로 실행되면 No.1,2,3 은 건너뛰어도 됩니다.**
-
-1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) 에서 사전 학습된 모델을 다운로드하고, `GPT_SoVITS/pretrained_models` 디렉토리에 배치하세요.
-
-2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) 에서 모델을 다운로드하고 압축을 풀어 `G2PWModel`로 이름을 변경한 후, `GPT_SoVITS/text` 디렉토리에 배치하세요. (중국어 TTS 전용)
-
-3. UVR5 (보컬/반주 분리 & 잔향 제거 추가 기능)의 경우, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) 에서 모델을 다운로드하고 `tools/uvr5/uvr5_weights` 디렉토리에 배치하세요.
-
-   - UVR5에서 bs_roformer 또는 mel_band_roformer 모델을 사용할 경우, 모델과 해당 설정 파일을 수동으로 다운로드하여 `tools/UVR5/UVR5_weights` 폴더에 저장할 수 있습니다. **모델 파일과 설정 파일의 이름은 확장자를 제외하고 동일한 이름을 가지도록 해야 합니다**. 또한, 모델과 설정 파일 이름에는 **"roformer"**가 포함되어야 roformer 클래스의 모델로 인식됩니다.
-
-   - 모델 이름과 설정 파일 이름에 **모델 유형을 직접 지정하는 것이 좋습니다**. 예: mel_mand_roformer, bs_roformer. 지정하지 않으면 설정 파일을 기준으로 특성을 비교하여 어떤 유형의 모델인지를 판단합니다. 예를 들어, 모델 `bs_roformer_ep_368_sdr_12.9628.ckpt`와 해당 설정 파일 `bs_roformer_ep_368_sdr_12.9628.yaml`은 한 쌍입니다. `kim_mel_band_roformer.ckpt`와 `kim_mel_band_roformer.yaml`도 한 쌍입니다.
-
-4. 중국어 ASR (추가 기능)의 경우, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) 및 [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요.
-
-5. 영어 또는 일본어 ASR (추가 기능)의 경우, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) 에서 모델을 다운로드하고, `tools/asr/models` 디렉토리에 배치하세요. 또한, [다른 모델](https://huggingface.co/Systran) 은 더 적은 디스크 용량으로 비슷한 효과를 가질 수 있습니다.
-
 ## 데이터셋 형식

 텍스트 음성 합성(TTS) 주석 .list 파일 형식:

-```
+```text
+
 vocal_path|speaker_name|language|text
+
 ```

 언어 사전:
@ -209,21 +202,28 @@ vocal_path|speaker_name|language|text
 - 'zh': 중국어
 - 'ja': 일본어
 - 'en': 영어
+- 'ko': 한국어
+- 'yue': 광둥어

 예시:

-```
+```text
+
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
+
 ```

+<div align="center">
+
 ## 미세 조정 및 추론

+</div>
+
 ### WebUI 열기

 #### 통합 패키지 사용자

 `go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오.
-V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-v1.ps1`를 사용하십시오.

 #### 기타

@ -231,21 +231,13 @@ V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-
 python webui.py <언어(옵션)>
 ```

-V1으로 전환하려면,
-
-```bash
-python webui.py v1 <언어(옵션)>
-```
-
-또는 WebUI에서 수동으로 버전을 전환하십시오.
-
 ### 미세 조정

 #### 경로 자동 채우기가 지원됩니다

 1. 오디오 경로를 입력하십시오.
 2. 오디오를 작은 청크로 분할하십시오.
-3. 노이즈 제거(옵션)
+3. 노이즈 제거 (옵션)
 4. ASR 수행
 5. ASR 전사를 교정하십시오.
 6. 다음 탭으로 이동하여 모델을 미세 조정하십시오.
@ -254,7 +246,7 @@ python webui.py v1 <언어(옵션)>

 #### 통합 패키지 사용자

-`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
+`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.

 #### 기타

@ -270,8 +262,12 @@ python webui.py

 그런 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.

+<div align="center">
+
 ## V2 릴리스 노트

+</div>
+
 새로운 기능:

 1. 한국어 및 광둥어 지원
@ -284,18 +280,12 @@ python webui.py

   [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V1 환경에서 V2를 사용하려면:
-
-1. `pip install -r requirements.txt`를 사용하여 일부 패키지 업데이트
-
-2. github에서 최신 코드를 클론하십시오.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오.
-
-   중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
+<div align="center">

 ## V3 릴리스 노트

+</div>
+
 새로운 기능:

 1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.)
@ -304,33 +294,23 @@ V1 환경에서 V2를 사용하려면:

   [자세한 내용](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-v2 환경에서 v3 사용하기:
-
-1. `pip install -r requirements.txt`로 일부 패키지를 업데이트합니다.
-
-2. 최신 코드를 github 에서 클론합니다.
-
-3. v3 사전 훈련된 모델(s1v3.ckpt, s2Gv3.pth, 그리고 models--nvidia--bigvgan_v2_24khz_100band_256x 폴더)을 [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 다운로드하여 `GPT_SoVITS/pretrained_models` 폴더에 넣습니다.
-
-   추가: 오디오 슈퍼 해상도 모델에 대해서는 [다운로드 방법](../../tools/AP_BWE_main/24kto48k/readme.txt)을 참고하세요.
+<div align="center">

 ## V4 릴리스 노트

+</div>
+
 신규 기능:

 1. **V4는 V3에서 발생하는 비정수 배율 업샘플링으로 인한 금속성 잡음 문제를 수정했으며, 소리가 먹먹해지는 것을 방지하기 위해 기본적으로 48kHz 오디오를 출력합니다 (V3는 기본적으로 24kHz만 지원)**. 개발자는 V4를 V3의 직접적인 대체 버전으로 보고 있지만 추가 테스트가 필요합니다.
   [자세히 보기](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V1/V2/V3 환경에서 V4로 전환 방법:
-
-1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
-
-2. GitHub에서 최신 코드를 클론하세요.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V4 사전 학습 모델(`gsv-v4-pretrained/s2v4.ckpt` 및 `gsv-v4-pretrained/vocoder.pth`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
+<div align="center">

 ## V2Pro 릴리스 노트

+</div>
+
 신규 기능:

 1. **V2보다 약간 높은 VRAM 사용량이지만 성능은 V4보다 우수하며, V2 수준의 하드웨어 비용과 속도를 유지합니다**.
@ -338,24 +318,18 @@ V1/V2/V3 환경에서 V4로 전환 방법:

 2. V1/V2와 V2Pro 시리즈는 유사한 특징을 가지며, V3/V4도 비슷한 기능을 가지고 있습니다. 평균 음질이 낮은 학습 데이터셋에서는 V1/V2/V2Pro가 좋은 결과를 내지만 V3/V4는 그렇지 못합니다. 또한 V3/V4의 합성 음색은 전체 학습 데이터셋보다는 참고 음성에 더 가깝습니다.

-V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
-
-1. 일부 의존 패키지를 업데이트하기 위해 `pip install -r requirements.txt` 명령어를 실행하세요.
-
-2. GitHub에서 최신 코드를 클론하세요.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main)에서 V2Pro 사전 학습 모델(`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, 및 `sv/pretrained_eres2netv2w24s4ep4.ckpt`)을 다운로드하고 `GPT_SoVITS/pretrained_models` 디렉토리에 넣으세요.
+<div align="center">

 ## 할 일 목록

- [x] **최우선순위:**
+</div>

+- [x] **최우선순위:**
  - [x] 일본어 및 영어 지역화.
  - [x] 사용자 가이드.
  - [x] 일본어 및 영어 데이터셋 미세 조정 훈련.

 - [ ] **기능:**
-
  - [x] 제로샷 음성 변환 (5초) / 소량의 음성 변환 (1분).
  - [x] TTS 속도 제어.
  - [ ] ~~향상된 TTS 감정 제어.~~
@ -363,53 +337,16 @@ V1/V2/V3/V4 환경에서 V2Pro로 전환 방법:
  - [x] 영어 및 일본어 텍스트 프론트 엔드 개선.
  - [ ] 작은 크기와 큰 크기의 TTS 모델 개발.
  - [x] Colab 스크립트.
-  - [ ] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
+  - [x] 훈련 데이터셋 확장 (2k 시간에서 10k 시간).
  - [x] 더 나은 sovits 기본 모델 (향상된 오디오 품질).
  - [ ] 모델 블렌딩.

-## (추가적인) 명령줄에서 실행하는 방법
-
-명령줄을 사용하여 UVR5용 WebUI 열기
-
-```bash
-python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-```
-
-<!-- 브라우저를 열 수 없는 경우 UVR 처리를 위해 아래 형식을 따르십시오. 이는 오디오 처리를 위해 mdxnet을 사용하는 것입니다.
-```
-python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
-``` -->
-
-명령줄을 사용하여 데이터세트의 오디오 분할을 수행하는 방법은 다음과 같습니다.
-
-```bash
-python audio_slicer.py \
-    --input_path "<path_to_original_audio_file_or_directory>" \
-    --output_root "<directory_where_subdivided_audio_clips_will_be_saved>" \
-    --threshold <volume_threshold> \
-    --min_length <minimum_duration_of_each_subclip> \
-    --min_interval <shortest_time_gap_between_adjacent_subclips>
-    --hop_size <step_size_for_computing_volume_curve>
-```
-
-명령줄을 사용하여 데이터 세트 ASR 처리를 수행하는 방법입니다(중국어만 해당).
-
-```bash
-python tools/asr/funasr_asr.py -i <input> -o <output>
-```
-
-ASR 처리는 Faster_Whisper(중국어를 제외한 ASR 마킹)를 통해 수행됩니다.
-
-(진행률 표시줄 없음, GPU 성능으로 인해 시간 지연이 발생할 수 있음)
-
-```bash
-python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p <precision>
-```
-
-사용자 정의 목록 저장 경로가 활성화되었습니다.
+<div align="center">

 ## 감사의 말

+</div>
+
 다음 프로젝트와 기여자들에게 특별히 감사드립니다:

 ### 이론 연구
@ -420,7 +357,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 - [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
 - [contentvec](https://github.com/auspicious3000/contentvec/)
 - [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
+- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)

@ -452,8 +389,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p

@Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다.

-## 모든 기여자들에게 감사드립니다 ;)
+<div align="center">
+
+## 참여해주신 모든 분들께 감사드립니다
+
+</div>

 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
 </a>
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@ -1,3 +1,5 @@
+#
+
 <div align="center">

 <h1>GPT-SoVITS-WebUI</h1>
@ -25,7 +27,7 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz

 ---

-## Özellikler:
+## Özellikler

 1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin.

@ -39,40 +41,49 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz

 Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:

-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>

 **Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**

+<div align="center">
+
 ## Kurulum

 ### Test Edilmiş Ortamlar

-| Python Version | PyTorch Version  | Device        |
-| -------------- | ---------------- | ------------- |
-| Python 3.10    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.5.1    | CUDA 12.4     |
-| Python 3.11    | PyTorch 2.7.0    | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.8.0dev | CUDA 12.8     |
-| Python 3.9     | PyTorch 2.5.1    | Apple silicon |
-| Python 3.11    | PyTorch 2.7.0    | Apple silicon |
-| Python 3.9     | PyTorch 2.2.2    | CPU           |
+| Python Version | PyTorch Version | Device        |
+| -------------- | --------------- | ------------- |
+| Python 3.10    | PyTorch 2.8.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.9.0   | CUDA 12.6     |
+| Python 3.11    | PyTorch 2.7.0   | CUDA 12.8     |
+| Python 3.11    | PyTorch 2.10.0  | CUDA 12.8     |
+| Python 3.10    | PyTorch 2.8.0   | Apple silicon |
+| Python 3.11    | PyTorch 2.9.0   | Apple silicon |
+| Python 3.12    | PyTorch 2.10.0  | Apple silicon |
+| Python 3.10    | PyTorch 2.9.0   | CPU           |
+
+</div>

 ### Windows

-Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/resolve/main/GPT-SoVITS-v3lora-20250228.7z?download=true) ve _go-webui.bat_ dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
+Eğer bir Windows kullanıcısıysanız (win>=10 ile test edilmiştir), [entegre paketi indirin](https://huggingface.co/lj1995/GPT-SoVITS-windows-package/tree/main) ve `go-webui.bat` dosyasına çift tıklayarak GPT-SoVITS-WebUI'yi başlatın.
+
+Aşağıdaki komutları çalıştırarak programı yükleyin:

 ```pwsh
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-pwsh -F install.ps1 --Device <CU126|CU128|CPU> --Source <HF|HF-Mirror|ModelScope> [--DownloadUVR5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+pwsh -F install.ps1 --help
 ```

 ### Linux

+Aşağıdaki komutları çalıştırarak programı yükleyin:
+
 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### macOS
@ -82,54 +93,49 @@ bash install.sh --device <CU126|CU128|ROCM|CPU> --source <HF|HF-Mirror|ModelScop
 Aşağıdaki komutları çalıştırarak programı yükleyin:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
-bash install.sh --device <MPS|CPU> --source <HF|HF-Mirror|ModelScope> [--download-uvr5]
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+bash install.sh --help
 ```

 ### El ile Yükleme

-#### Bağımlılıkları Yükleme
+Aşağıdaki komutları çalıştırarak programı yükleyin:

 ```bash
-conda create -n GPTSoVits python=3.10
-conda activate GPTSoVits
+conda create -n GPTSoVITS python=3.11
+conda activate GPTSoVITS
+conda install uv ffmpeg -c conda-forge

-pip install -r extra-req.txt --no-deps
-pip install -r requirements.txt
+uv export --extra main -o pylock.toml -q --extra [mlx|cu126|cu128|rocm|cpu]
+uv pip sync pylock.toml --no-break-system-packages --preview-features pylock
+uv pip install ".[flash-attn]"
 ```

-#### FFmpeg'i Yükleme
+### Önceden Eğitilmiş Modeller

-##### Conda Kullanıcıları
+**Eğer `install.sh` başarıyla çalıştırılırsa, 1, 2 ve 3 numaralı adımları atlayabilirsiniz.**

-```bash
-conda activate GPTSoVits
-conda install ffmpeg
-```
+1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.

-##### Ubuntu/Debian Kullanıcıları
+2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)

-```bash
-sudo apt install ffmpeg
-sudo apt install libsox-dev
-```
+3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `gsv_tools/uvr5/uvr5_weights` dizinine yerleştirin.
+   - UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `gsv_tools/uvr5/uvr5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.

-##### Windows Kullanıcıları
+   - Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_band_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.

-[ffmpeg.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffmpeg.exe) ve [ffprobe.exe](https://huggingface.co/lj1995/VoiceConversionWebUI/blob/main/ffprobe.exe) dosyalarını indirin ve GPT-SoVITS kök dizinine yerleştirin
+4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `gsv_tools/asr/models` dizinine yerleştirin.

-[Visual Studio 2017](https://aka.ms/vs/17/release/vc_redist.x86.exe) ortamını yükleyin
+5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `gsv_tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir.

-##### MacOS Kullanıcıları
+<div align="center">

-```bash
-brew install ffmpeg
-```
+## GPT-SoVITS Çalıştırma (Docker Kullanarak)

-### GPT-SoVITS Çalıştırma (Docker Kullanarak)
+</div>

-#### Docker İmajı Seçimi
+### Docker İmajı Seçimi

 Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığı için lütfen şu adımları izleyin:

@ -140,15 +146,15 @@ Kod tabanı hızla geliştiği halde Docker imajları daha yavaş yayınlandığ
 - Docker Compose, mevcut dizindeki **tüm dosyaları** bağlayacaktır. Docker imajını kullanmadan önce lütfen proje kök dizinine geçin ve **en son kodu çekin**
 - Opsiyonel: En güncel değişiklikleri almak için, sağlanan Dockerfile ile yerel olarak imajı kendiniz oluşturabilirsiniz

-#### Ortam Değişkenleri
+### Ortam Değişkenleri

 - `is_half`: Yarı hassasiyet (fp16) kullanımını kontrol eder. GPU’nuz destekliyorsa, belleği azaltmak için `true` olarak ayarlayın.

-#### Paylaşılan Bellek Yapılandırması
+### Paylaşılan Bellek Yapılandırması

 Windows (Docker Desktop) ortamında, varsayılan paylaşılan bellek boyutu düşüktür ve bu beklenmedik hatalara neden olabilir. Sistem belleğinize göre Docker Compose dosyasındaki `shm_size` değerini (örneğin `16g`) artırmanız önerilir.

-#### Servis Seçimi
+### Servis Seçimi

 `docker-compose.yaml` dosyasında iki tür servis tanımlanmıştır:

@ -161,7 +167,7 @@ Belirli bir servisi Docker Compose ile çalıştırmak için şu komutu kullanı
 docker compose run --service-ports <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128>
 ```

-#### Docker İmajını Yerel Olarak Oluşturma
+### Docker İmajını Yerel Olarak Oluşturma

 Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:

@ -169,7 +175,7 @@ Docker imajını kendiniz oluşturmak isterseniz şu komutu kullanın:
 bash docker_build.sh --cuda <12.6|12.8> [--lite]
 ```

-#### Çalışan Konteynere Erişim (Bash Shell)
+### Çalışan Konteynere Erişim (Bash Shell)

 Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz:

@ -177,30 +183,18 @@ Konteyner arka planda çalışırken, aşağıdaki komutla içine girebilirsiniz
 docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GPT-SoVITS-CU128> bash
 ```

-## Önceden Eğitilmiş Modeller
-
-**Eğer `install.sh` başarıyla çalıştırılırsa, No.1,2,3 adımını atlayabilirsiniz.**
-
-1. [GPT-SoVITS Models](https://huggingface.co/lj1995/GPT-SoVITS) üzerinden önceden eğitilmiş modelleri indirip `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
-
-2. [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) üzerinden modeli indirip sıkıştırmayı açın ve `G2PWModel` olarak yeniden adlandırın, ardından `GPT_SoVITS/text` dizinine yerleştirin. (Sadece Çince TTS için)
-
-3. UVR5 (Vokal/Enstrümantal Ayrımı & Yankı Giderme) için, [UVR5 Weights](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main/uvr5_weights) üzerinden modelleri indirip `tools/uvr5/uvr5_weights` dizinine yerleştirin.
-
-   - UVR5'te bs_roformer veya mel_band_roformer modellerini kullanıyorsanız, modeli ve ilgili yapılandırma dosyasını manuel olarak indirip `tools/UVR5/UVR5_weights` klasörüne yerleştirebilirsiniz. **Model dosyası ve yapılandırma dosyasının adı, uzantı dışında aynı olmalıdır**. Ayrıca, model ve yapılandırma dosyasının adlarında **"roformer"** kelimesi yer almalıdır, böylece roformer sınıfındaki bir model olarak tanınır.
-
-   - Model adı ve yapılandırma dosyası adı içinde **doğrudan model tipini belirtmek önerilir**. Örneğin: mel_mand_roformer, bs_roformer. Belirtilmezse, yapılandırma dosyasından özellikler karşılaştırılarak model tipi belirlenir. Örneğin, `bs_roformer_ep_368_sdr_12.9628.ckpt` modeli ve karşılık gelen yapılandırma dosyası `bs_roformer_ep_368_sdr_12.9628.yaml` bir çifttir. Aynı şekilde, `kim_mel_band_roformer.ckpt` ve `kim_mel_band_roformer.yaml` da bir çifttir.
-
-4. Çince ASR için, [Damo ASR Model](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files), [Damo VAD Model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/files) ve [Damo Punc Model](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/files) üzerinden modelleri indirip `tools/asr/models` dizinine yerleştirin.
-
-5. İngilizce veya Japonca ASR için, [Faster Whisper Large V3](https://huggingface.co/Systran/faster-whisper-large-v3) üzerinden modeli indirip `tools/asr/models` dizinine yerleştirin. Ayrıca, [diğer modeller](https://huggingface.co/Systran) benzer bir etki yaratabilir ve daha az disk alanı kaplayabilir.
+<div align="center">

 ## Veri Seti Formatı

+</div>
+
 TTS açıklama .list dosya formatı:

-```
+```text
+
 vocal_path|speaker_name|language|text
+
 ```

 Dil sözlüğü:
@ -213,18 +207,23 @@ Dil sözlüğü:

 Örnek:

-```
+```text
+
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
+
 ```

+<div align="center">
+
 ## İnce Ayar ve Çıkarım

+</div>
+
 ### WebUI'yi Açın

 #### Entegre Paket Kullanıcıları

 `go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın.
-V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `go-webui-v1.ps1` kullanın.

 #### Diğerleri

@ -232,14 +231,6 @@ V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `
 python webui.py <dil(isteğe bağlı)>
 ```

-V1'e geçmek istiyorsanız,
-
-```bash
-python webui.py v1 <dil(isteğe bağlı)>
-```
-
-veya WebUI'de manuel olarak sürüm değiştirin.
-
 ### İnce Ayar

 #### Yol Otomatik Doldurma artık destekleniyor
@ -255,7 +246,7 @@ veya WebUI'de manuel olarak sürüm değiştirin.

 #### Entegre Paket Kullanıcıları

-`go-webui-v2.bat` dosyasına çift tıklayın veya `go-webui-v2.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
+`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.

 #### Diğerleri

@ -271,8 +262,12 @@ python webui.py

 ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.

+<div align="center">
+
 ## V2 Sürüm Notları

+</div>
+
 Yeni Özellikler:

 1. Korece ve Kantonca destekler
@ -285,18 +280,12 @@ Yeni Özellikler:

   [detaylar burada](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v2%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V1 ortamından V2'yi kullanmak için:
-
-1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin
-
-2. github'dan en son kodları klonlayın.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin.
-
-   Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
+<div align="center">

 ## V3 Sürüm Notları

+</div>
+
 Yeni Özellikler:

 1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir).
@ -305,33 +294,23 @@ Yeni Özellikler:

   [daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V2 ortamında V3 kullanımı:
-
-1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin.
-
-2. GitHub'dan en son kodları klonlayın.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden v3 önceden eğitilmiş modellerini (s1v3.ckpt, s2Gv3.pth ve models--nvidia--bigvgan_v2_24khz_100band_256x klasörünü) indirin ve `GPT_SoVITS/pretrained_models` dizinine yerleştirin.
-
-   ek: Ses Süper Çözünürlük modeli için [nasıl indirileceği](../../tools/AP_BWE_main/24kto48k/readme.txt) hakkında bilgi alabilirsiniz.
+<div align="center">

 ## V4 Sürüm Notları

+</div>
+
 Yeni Özellikler:

 1. **V4, V3'te görülen non-integer upsample işleminden kaynaklanan metalik ses sorununu düzeltti ve sesin boğuklaşmasını önlemek için doğrudan 48kHz ses çıktısı sunar (V3 sadece 24kHz destekler)**. Yazar, V4'ün V3'ün yerine geçebileceğini belirtmiştir ancak daha fazla test yapılması gerekmektedir.
   [Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3v4%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)

-V1/V2/V3 ortamından V4'e geçiş:
-
-1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
-
-2. GitHub'dan en son kodları klonlayın.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V4 ön eğitilmiş modelleri indirin (`gsv-v4-pretrained/s2v4.ckpt` ve `gsv-v4-pretrained/vocoder.pth`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
+<div align="center">

 ## V2Pro Sürüm Notları

+</div>
+
 Yeni Özellikler:

 1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
@ -339,18 +318,13 @@ Yeni Özellikler:

 2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.

-V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
-
-1. Bazı bağımlılıkları güncellemek için `pip install -r requirements.txt` komutunu çalıştırın.
-
-2. GitHub'dan en son kodları klonlayın.
-
-3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main) üzerinden V2Pro ön eğitilmiş modelleri indirin (`v2Pro/s2Dv2Pro.pth`, `v2Pro/s2Gv2Pro.pth`, `v2Pro/s2Dv2ProPlus.pth`, `v2Pro/s2Gv2ProPlus.pth`, ve `sv/pretrained_eres2netv2w24s4ep4.ckpt`) ve bunları `GPT_SoVITS/pretrained_models` dizinine koyun.
+<div align="center">

 ## Yapılacaklar Listesi

- [x] **Yüksek Öncelikli:**
+</div>

+- [x] **Yüksek Öncelikli:**
  - [x] Japonca ve İngilizceye yerelleştirme.
  - [x] Kullanıcı kılavuzu.
  - [x] Japonca ve İngilizce veri seti ince ayar eğitimi.
@ -363,53 +337,16 @@ V1/V2/V3/V4 ortamından V2Pro'ya geçiş:
  - [x] İngilizce ve Japonca metin ön ucunu iyileştirme.
  - [ ] Küçük ve büyük boyutlu metinden konuşmaya modelleri geliştirme.
  - [x] Colab betikleri.
-  - [ ] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat).
+  - [x] Eğitim veri setini genişletmeyi dene (2k saat -> 10k saat).
  - [x] daha iyi sovits temel modeli (geliştirilmiş ses kalitesi)
  - [ ] model karışımı

-## (Ekstra) Komut satırından çalıştırma yöntemi
-
-UVR5 için Web Arayüzünü açmak için komut satırını kullanın
-
-```bash
-python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
-```
-
-<!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
-```
-python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
-``` -->
-
-Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
-
-```bash
-python audio_slicer.py \
-    --input_path "<orijinal_ses_dosyası_veya_dizininin_yolu>" \
-    --output_root "<alt_bölümlere_ayrılmış_ses_kliplerinin_kaydedileceği_dizin>" \
-    --threshold <ses_eşiği> \
-    --min_length <her_bir_alt_klibin_minimum_süresi> \
-    --min_interval <bitişik_alt_klipler_arasındaki_en_kısa_zaman_aralığı>
-    --hop_size <ses_eğrisini_hesaplamak_için_adım_boyutu>
-```
-
-Veri seti ASR işleme komut satırı kullanılarak bu şekilde yapılır (Yalnızca Çince)
-
-```bash
-python tools/asr/funasr_asr.py -i <girdi> -o <çıktı>
-```
-
-ASR işleme Faster_Whisper aracılığıyla gerçekleştirilir (Çince dışındaki ASR işaretleme)
-
-(İlerleme çubukları yok, GPU performansı zaman gecikmelerine neden olabilir)
-
-```bash
-python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
-```
-
-Özel bir liste kaydetme yolu etkinleştirildi
+<div align="center">

 ## Katkı Verenler

+</div>
+
 Özellikle aşağıdaki projelere ve katkıda bulunanlara teşekkür ederiz:

 ### Teorik Araştırma
@ -420,7 +357,7 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 - [TransferTTS](https://github.com/hcy71o/TransferTTS/blob/master/models.py#L556)
 - [contentvec](https://github.com/auspicious3000/contentvec/)
 - [hifi-gan](https://github.com/jik876/hifi-gan)
- [fish-speech](https://github.com/fishaudio/fish-speech/blob/main/tools/llama/generate.py#L41)
+- [fish-speech](https://github.com/fishaudio/fish-speech/blob/d3df50503b36314a964f66cac1af1e19e95bcfa3/fish_speech/models/text2semantic/inference.py#L81)
 - [f5-TTS](https://github.com/SWivid/F5-TTS/blob/main/src/f5_tts/model/backbones/dit.py)
 - [shortcut flow matching](https://github.com/kvfrans/shortcut-models/blob/main/targets_shortcut.py)

@ -452,8 +389,12 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>

@Naozumi520'ye Kantonca eğitim setini sağladığı ve Kantonca ile ilgili bilgiler konusunda rehberlik ettiği için minnettarım.

+<div align="center">
+
 ## Tüm katkıda bulunanlara çabaları için teşekkürler

+</div>
+
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt=""/>
 </a>
--- a/extra-req.txt
+++ b/extra-req.txt
@ -1 +0,0 @@
-faster-whisper
--- a/gsv_tools/AP_BWE_main/24kto48k/readme.txt
+++ b/gsv_tools/AP_BWE_main/24kto48k/readme.txt
--- a/gsv_tools/AP_BWE_main/LICENSE
+++ b/gsv_tools/AP_BWE_main/LICENSE
--- a/gsv_tools/AP_BWE_main/README.md
+++ b/gsv_tools/AP_BWE_main/README.md
--- a/gsv_tools/AP_BWE_main/datasets1/init.py
+++ b/gsv_tools/AP_BWE_main/datasets1/init.py
--- a/gsv_tools/AP_BWE_main/datasets1/dataset.py
+++ b/gsv_tools/AP_BWE_main/datasets1/dataset.py
@ -1,8 +1,9 @@
 import os
 import random
+
 import torch
-import torchaudio
 import torch.utils.data
+import torchaudio
 import torchaudio.functional as aF


@ -37,10 +38,10 @@ def amp_pha_istft(log_amp, pha, n_fft, hop_size, win_size, center=True):


 def get_dataset_filelist(a):
-    with open(a.input_training_file, "r", encoding="utf-8") as fi:
+    with open(a.input_training_file, encoding="utf-8") as fi:
        training_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]

-    with open(a.input_validation_file, "r", encoding="utf-8") as fi:
+    with open(a.input_validation_file, encoding="utf-8") as fi:
        validation_indexes = [x.split("|")[0] for x in fi.read().split("\n") if len(x) > 0]

    return training_indexes, validation_indexes
--- a/gsv_tools/AP_BWE_main/models/init.py
+++ b/gsv_tools/AP_BWE_main/models/init.py
--- a/gsv_tools/AP_BWE_main/models/model.py
+++ b/gsv_tools/AP_BWE_main/models/model.py
@ -1,7 +1,7 @@
 import torch
-import torch.nn.functional as F
 import torch.nn as nn
-from torch.nn.utils import weight_norm, spectral_norm
+import torch.nn.functional as F
+from torch.nn.utils import spectral_norm, weight_norm


 # from utils import init_weights, get_padding
@ -16,7 +16,7 @@ def init_weights(m, mean=0.0, std=0.01):


 import numpy as np
-from typing import Tuple, List
+

 LRELU_SLOPE = 0.1

@ -75,7 +75,7 @@ class ConvNeXtBlock(nn.Module):

 class APNet_BWE_Model(torch.nn.Module):
    def __init__(self, h):
-        super(APNet_BWE_Model, self).__init__()
+        super().__init__()
        self.h = h
        self.adanorm_num_embeddings = None
        layer_scale_init_value = 1 / h.ConvNeXt_layers
@ -125,7 +125,7 @@ class APNet_BWE_Model(torch.nn.Module):
        x_mag = self.norm_pre_mag(x_mag.transpose(1, 2)).transpose(1, 2)
        x_pha = self.norm_pre_pha(x_pha.transpose(1, 2)).transpose(1, 2)

-        for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha):
+        for conv_block_mag, conv_block_pha in zip(self.convnext_mag, self.convnext_pha, strict=False):
            x_mag = x_mag + x_pha
            x_pha = x_pha + x_mag
            x_mag = conv_block_mag(x_mag, cond_embedding_id=None)
@ -146,9 +146,9 @@ class APNet_BWE_Model(torch.nn.Module):

 class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-        super(DiscriminatorP, self).__init__()
+        super().__init__()
        self.period = period
-        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
+        norm_f = weight_norm if not use_spectral_norm else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
@ -185,7 +185,7 @@ class DiscriminatorP(torch.nn.Module):

 class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self):
-        super(MultiPeriodDiscriminator, self).__init__()
+        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorP(2),
@ -201,7 +201,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
-        for i, d in enumerate(self.discriminators):
+        for _i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
@ -215,7 +215,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
 class MultiResolutionAmplitudeDiscriminator(nn.Module):
    def __init__(
        self,
-        resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
+        resolutions: tuple[tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
        num_embeddings: int = None,
    ):
        super().__init__()
@ -225,7 +225,7 @@ class MultiResolutionAmplitudeDiscriminator(nn.Module):

    def forward(
        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
-    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[list[torch.Tensor]], list[list[torch.Tensor]]]:
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
@ -245,7 +245,7 @@ class MultiResolutionAmplitudeDiscriminator(nn.Module):
 class DiscriminatorAR(nn.Module):
    def __init__(
        self,
-        resolution: Tuple[int, int, int],
+        resolution: tuple[int, int, int],
        channels: int = 64,
        in_channels: int = 1,
        num_embeddings: int = None,
@ -269,7 +269,7 @@ class DiscriminatorAR(nn.Module):

    def forward(
        self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
-    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        fmap = []
        x = x.squeeze(1)

@ -309,7 +309,7 @@ class DiscriminatorAR(nn.Module):
 class MultiResolutionPhaseDiscriminator(nn.Module):
    def __init__(
        self,
-        resolutions: Tuple[Tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
+        resolutions: tuple[tuple[int, int, int]] = ((512, 128, 512), (1024, 256, 1024), (2048, 512, 2048)),
        num_embeddings: int = None,
    ):
        super().__init__()
@ -319,7 +319,7 @@ class MultiResolutionPhaseDiscriminator(nn.Module):

    def forward(
        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
-    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[list[torch.Tensor]], list[list[torch.Tensor]]]:
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
@ -339,7 +339,7 @@ class MultiResolutionPhaseDiscriminator(nn.Module):
 class DiscriminatorPR(nn.Module):
    def __init__(
        self,
-        resolution: Tuple[int, int, int],
+        resolution: tuple[int, int, int],
        channels: int = 64,
        in_channels: int = 1,
        num_embeddings: int = None,
@ -363,7 +363,7 @@ class DiscriminatorPR(nn.Module):

    def forward(
        self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
-    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
        fmap = []
        x = x.squeeze(1)

@ -402,8 +402,8 @@ class DiscriminatorPR(nn.Module):

 def feature_loss(fmap_r, fmap_g):
    loss = 0
-    for dr, dg in zip(fmap_r, fmap_g):
-        for rl, gl in zip(dr, dg):
+    for dr, dg in zip(fmap_r, fmap_g, strict=False):
+        for rl, gl in zip(dr, dg, strict=False):
            loss += torch.mean(torch.abs(rl - gl))

    return loss
@ -413,7 +413,7 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
-    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
+    for dr, dg in zip(disc_real_outputs, disc_generated_outputs, strict=False):
        r_loss = torch.mean(torch.clamp(1 - dr, min=0))
        g_loss = torch.mean(torch.clamp(1 + dg, min=0))
        loss += r_loss + g_loss
--- a/gsv_tools/init.py
+++ b/gsv_tools/init.py
--- a/gsv_tools/asr/config.py
+++ b/gsv_tools/asr/config.py
@ -5,9 +5,9 @@ def get_models():
        "large-v2",
        "large-v3",
        "large-v3-turbo",
-        #"distil-large-v2",
-        #"distil-large-v3",
-        #"distil-large-v3.5",
+        # "distil-large-v2",
+        # "distil-large-v3",
+        # "distil-large-v3.5",
    ]
    return model_size_list

--- a/gsv_tools/asr/fasterwhisper_asr.py
+++ b/gsv_tools/asr/fasterwhisper_asr.py
@ -9,33 +9,34 @@ from huggingface_hub import snapshot_download as snapshot_download_hf
 from modelscope import snapshot_download as snapshot_download_ms
 from tqdm import tqdm

-from tools.asr.config import get_models
-from tools.asr.funasr_asr import only_asr
-from tools.my_utils import load_cudnn
+from gsv_tools.asr.config import get_models
+from gsv_tools.asr.funasr_asr import only_asr
+from gsv_tools.my_utils import load_cudnn
+

 # fmt: off
 language_code_list = [
-    "af", "am", "ar", "as", "az", 
-    "ba", "be", "bg", "bn", "bo", 
-    "br", "bs", "ca", "cs", "cy", 
-    "da", "de", "el", "en", "es", 
-    "et", "eu", "fa", "fi", "fo", 
-    "fr", "gl", "gu", "ha", "haw", 
-    "he", "hi", "hr", "ht", "hu", 
-    "hy", "id", "is", "it", "ja", 
-    "jw", "ka", "kk", "km", "kn", 
-    "ko", "la", "lb", "ln", "lo", 
-    "lt", "lv", "mg", "mi", "mk", 
-    "ml", "mn", "mr", "ms", "mt", 
-    "my", "ne", "nl", "nn", "no", 
-    "oc", "pa", "pl", "ps", "pt", 
-    "ro", "ru", "sa", "sd", "si", 
-    "sk", "sl", "sn", "so", "sq", 
-    "sr", "su", "sv", "sw", "ta", 
-    "te", "tg", "th", "tk", "tl", 
-    "tr", "tt", "uk", "ur", "uz", 
+    "af", "am", "ar", "as", "az",
+    "ba", "be", "bg", "bn", "bo",
+    "br", "bs", "ca", "cs", "cy",
+    "da", "de", "el", "en", "es",
+    "et", "eu", "fa", "fi", "fo",
+    "fr", "gl", "gu", "ha", "haw",
+    "he", "hi", "hr", "ht", "hu",
+    "hy", "id", "is", "it", "ja",
+    "jw", "ka", "kk", "km", "kn",
+    "ko", "la", "lb", "ln", "lo",
+    "lt", "lv", "mg", "mi", "mk",
+    "ml", "mn", "mr", "ms", "mt",
+    "my", "ne", "nl", "nn", "no",
+    "oc", "pa", "pl", "ps", "pt",
+    "ro", "ru", "sa", "sd", "si",
+    "sk", "sl", "sn", "so", "sq",
+    "sr", "su", "sv", "sw", "ta",
+    "te", "tg", "th", "tk", "tl",
+    "tr", "tt", "uk", "ur", "uz",
    "vi", "yi", "yo", "zh", "yue",
-    "auto"] 
+    "auto"]
 # fmt: on


@ -52,20 +53,20 @@ def download_model(model_size: str):
        if "distil" in model_size:
            if "3.5" in model_size:
                repo_id = "distil-whisper/distil-large-v3.5-ct2"
-                model_path = "tools/asr/models/faster-distil-whisper-large-v3.5"
+                model_path = "gsv_tools/asr/models/faster-distil-whisper-large-v3.5"
            else:
                repo_id = "Systran/faster-{}-whisper-{}".format(*model_size.split("-", maxsplit=1))
        elif model_size == "large-v3-turbo":
            repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
-            model_path = "tools/asr/models/faster-whisper-large-v3-turbo"
+            model_path = "gsv_tools/asr/models/faster-whisper-large-v3-turbo"
        else:
            repo_id = f"Systran/faster-whisper-{model_size}"
        model_path = (
-            model_path or f"tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}"
+            model_path or f"gsv_tools/asr/models/{repo_id.replace('Systran/', '').replace('distil-whisper/', '', 1)}"
        )
    else:
        repo_id = "XXXXRT/faster-whisper"
-        model_path = "tools/asr/models"
+        model_path = "gsv_tools/asr/models"

    files: list[str] = [
        "config.json",
--- a/gsv_tools/asr/funasr_asr.py
+++ b/gsv_tools/asr/funasr_asr.py
@ -1,5 +1,3 @@
-# -*- coding:utf-8 -*-
-
 import argparse
 import os
 import traceback
@ -8,6 +6,7 @@ from funasr import AutoModel
 from modelscope import snapshot_download
 from tqdm import tqdm

+
 funasr_models = {}  # 存储模型避免重复加载


@ -23,27 +22,27 @@ def only_asr(input_file, language):

 def create_model(language="zh"):
    if language == "zh":
-        path_vad = "tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-        path_punc = "tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-        path_asr = "tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+        path_vad = "gsv_tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+        path_punc = "gsv_tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+        path_asr = "gsv_tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
        snapshot_download(
            "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-            local_dir="tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+            local_dir="gsv_tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        )
        snapshot_download(
            "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
-            local_dir="tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+            local_dir="gsv_tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
        )
        snapshot_download(
            "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
-            local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+            local_dir="gsv_tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        )
        model_revision = "v2.0.4"
    elif language == "yue":
-        path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
+        path_asr = "gsv_tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
        snapshot_download(
            "iic/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
-            local_dir="tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
+            local_dir="gsv_tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
        )
        path_vad = path_punc = None
        vad_model_revision = punc_model_revision = ""
--- a/gsv_tools/asr/models/.gitignore
+++ b/gsv_tools/asr/models/.gitignore
--- a/gsv_tools/assets.py
+++ b/gsv_tools/assets.py
--- a/gsv_tools/audio_sr.py
+++ b/gsv_tools/audio_sr.py
@ -1,23 +1,24 @@
-from __future__ import absolute_import, division, print_function, unicode_literals
-import sys
 import os
+import sys
+

 AP_BWE_main_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "AP_BWE_main")
 sys.path.append(AP_BWE_main_dir_path)
 import json
+
 import torch
 import torchaudio.functional as aF
-# from attrdict import AttrDict####will be bug in py3.10

-from datasets1.dataset import amp_pha_stft, amp_pha_istft
+# from attrdict import AttrDict####will be bug in py3.10
+from datasets1.dataset import amp_pha_istft, amp_pha_stft
 from models.model import APNet_BWE_Model


 class AP_BWE:
    def __init__(self, device, DictToAttrRecursive, checkpoint_file=None):
-        if checkpoint_file == None:
-            checkpoint_file = "%s/24kto48k/g_24kto48k.zip" % (AP_BWE_main_dir_path)
-            if os.path.exists(checkpoint_file) == False:
+        if checkpoint_file is None:
+            checkpoint_file = f"{AP_BWE_main_dir_path}/24kto48k/g_24kto48k.zip"
+            if not os.path.exists(checkpoint_file):
                raise FileNotFoundError
        config_file = os.path.join(os.path.split(checkpoint_file)[0], "config.json")
        with open(config_file) as f:
--- a/gsv_tools/cmd-denoise.py
+++ b/gsv_tools/cmd-denoise.py
@ -1,12 +1,13 @@
-import os
 import argparse
+import os
 import traceback

 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from tqdm import tqdm

-path_denoise = "tools/denoise-model/speech_frcrn_ans_cirm_16k"
+
+path_denoise = "gsv_tools/denoise-model/speech_frcrn_ans_cirm_16k"
 path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
 ans = pipeline(Tasks.acoustic_noise_suppression, model=path_denoise)

@ -17,7 +18,7 @@ def execute_denoise(input_folder, output_folder):
    # print(list(os.listdir(input_folder).sort()))
    for name in tqdm(os.listdir(input_folder)):
        try:
-            ans("%s/%s" % (input_folder, name), output_path="%s/%s" % (output_folder, name))
+            ans(f"{input_folder}/{name}", output_path=f"{output_folder}/{name}")
        except:
            traceback.print_exc()

--- a/gsv_tools/denoise-model/.gitignore
+++ b/gsv_tools/denoise-model/.gitignore
--- a/gsv_tools/i18n/i18n.py
+++ b/gsv_tools/i18n/i18n.py
@ -2,11 +2,12 @@ import json
 import locale
 import os

+
 I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")


 def load_language_list(language):
-    with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), "r", encoding="utf-8") as f:
+    with open(os.path.join(I18N_JSON_DIR, f"{language}.json"), encoding="utf-8") as f:
        language_list = json.load(f)
    return language_list

--- a/gsv_tools/i18n/locale/en_US.json
+++ b/gsv_tools/i18n/locale/en_US.json
--- a/gsv_tools/i18n/locale/es_ES.json
+++ b/gsv_tools/i18n/locale/es_ES.json
--- a/gsv_tools/i18n/locale/fr_FR.json
+++ b/gsv_tools/i18n/locale/fr_FR.json
--- a/gsv_tools/i18n/locale/it_IT.json
+++ b/gsv_tools/i18n/locale/it_IT.json
--- a/gsv_tools/i18n/locale/ja_JP.json
+++ b/gsv_tools/i18n/locale/ja_JP.json
--- a/gsv_tools/i18n/locale/ko_KR.json
+++ b/gsv_tools/i18n/locale/ko_KR.json
--- a/gsv_tools/i18n/locale/pt_BR.json
+++ b/gsv_tools/i18n/locale/pt_BR.json
--- a/gsv_tools/i18n/locale/ru_RU.json
+++ b/gsv_tools/i18n/locale/ru_RU.json
--- a/gsv_tools/i18n/locale/tr_TR.json
+++ b/gsv_tools/i18n/locale/tr_TR.json
--- a/gsv_tools/i18n/locale/zh_CN.json
+++ b/gsv_tools/i18n/locale/zh_CN.json
--- a/gsv_tools/i18n/locale/zh_HK.json
+++ b/gsv_tools/i18n/locale/zh_HK.json
--- a/gsv_tools/i18n/locale/zh_SG.json
+++ b/gsv_tools/i18n/locale/zh_SG.json
--- a/gsv_tools/i18n/locale/zh_TW.json
+++ b/gsv_tools/i18n/locale/zh_TW.json
--- a/gsv_tools/i18n/scan_i18n.py
+++ b/gsv_tools/i18n/scan_i18n.py
@ -4,6 +4,7 @@ import json
 import os
 from collections import OrderedDict

+
 I18N_JSON_DIR: os.PathLike = os.path.join(os.path.dirname(os.path.relpath(__file__)), "locale")
 DEFAULT_LANGUAGE: str = "zh_CN"  # 默认语言
 TITLE_LEN: int = 60  # 标题显示长度
@ -36,7 +37,7 @@ def scan_i18n_strings():
    print(" Scanning Files and Extracting i18n Strings ".center(TITLE_LEN, "="))
    for filename in glob.iglob("**/*.py", recursive=True):
        try:
-            with open(filename, "r", encoding="utf-8") as f:
+            with open(filename, encoding="utf-8") as f:
                code = f.read()
                if "I18nAuto" in code:
                    tree = ast.parse(code)
@ -57,7 +58,7 @@ def update_i18n_json(json_file, standard_keys):
    standard_keys = sorted(standard_keys)
    print(f" Process {json_file} ".center(TITLE_LEN, "="))
    # 读取 JSON 文件
-    with open(json_file, "r", encoding="utf-8") as f:
+    with open(json_file, encoding="utf-8") as f:
        json_data = json.load(f, object_pairs_hook=OrderedDict)
    # 打印处理前的 JSON 条目数
    len_before = len(json_data)
--- a/gsv_tools/my_utils.py
+++ b/gsv_tools/my_utils.py
@ -8,7 +8,8 @@ import gradio as gr
 import numpy as np
 import pandas as pd

-from tools.i18n.i18n import I18nAuto
+from gsv_tools.i18n.i18n import I18nAuto
+

 i18n = I18nAuto(language=os.environ.get("language", "Auto"))

@ -48,7 +49,7 @@ def clean_path(path_str: str):

 def check_for_existance(file_list: list = None, is_train=False, is_dataset_processing=False):
    files_status = []
-    if is_train == True and file_list:
+    if is_train and file_list:
        file_list.append(os.path.join(file_list[0], "2-name2text.txt"))
        file_list.append(os.path.join(file_list[0], "3-bert"))
        file_list.append(os.path.join(file_list[0], "4-cnhubert"))
@ -61,7 +62,7 @@ def check_for_existance(file_list: list = None, is_train=False, is_dataset_proce
            files_status.append(False)
    if sum(files_status) != len(files_status):
        if is_train:
-            for file, status in zip(file_list, files_status):
+            for file, status in zip(file_list, files_status, strict=False):
                if status:
                    pass
                else:
@ -97,13 +98,13 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
            if not os.path.isdir(audio_path):
                gr.Warning(i18n("请填入正确的音频文件夹路径"))
                return
-        with open(list_path, "r", encoding="utf8") as f:
+        with open(list_path, encoding="utf8") as f:
            line = f.readline().strip("\n").split("\n")
        wav_name, _, __, ___ = line[0].split("|")
        wav_name = clean_path(wav_name)
-        if audio_path != "" and audio_path != None:
+        if audio_path != "" and audio_path is not None:
            wav_name = os.path.basename(wav_name)
-            wav_path = "%s/%s" % (audio_path, wav_name)
+            wav_path = f"{audio_path}/{wav_name}"
        else:
            wav_path = wav_name
        if os.path.exists(wav_path):
@ -117,7 +118,7 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
        path_list.append(os.path.join(path_list[0], "5-wav32k"))
        path_list.append(os.path.join(path_list[0], "6-name2semantic.tsv"))
        phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
-        with open(phone_path, "r", encoding="utf-8") as f:
+        with open(phone_path, encoding="utf-8") as f:
            if f.read(1):
                ...
            else:
--- a/gsv_tools/slice_audio.py
+++ b/gsv_tools/slice_audio.py
@ -1,13 +1,14 @@
 import os
 import sys
-import numpy as np
 import traceback
+
+import numpy as np
 from scipy.io import wavfile
+from slicer2 import Slicer

 # parent_directory = os.path.dirname(os.path.abspath(__file__))
 # sys.path.append(parent_directory)
-from tools.my_utils import load_audio
-from slicer2 import Slicer
+from gsv_tools.my_utils import load_audio


 def slice(inp, opt_root, threshold, min_length, min_interval, hop_size, max_sil_kept, _max, alpha, i_part, all_part):
--- a/gsv_tools/slicer2.py
+++ b/gsv_tools/slicer2.py
--- a/gsv_tools/subfix_webui.py
+++ b/gsv_tools/subfix_webui.py
@ -1,5 +1,7 @@
 import sys
-from tools.i18n.i18n import I18nAuto, scan_language_list
+
+from gsv_tools.i18n.i18n import I18nAuto, scan_language_list
+

 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
 i18n = I18nAuto(language=language)
@ -9,6 +11,7 @@ import json
 import os
 import uuid

+
 try:
    import gradio.analytics as analytics

@ -21,6 +24,7 @@ import librosa
 import numpy as np
 import soundfile

+
 g_json_key_text = ""
 g_json_key_path = ""
 g_load_file = ""
@ -113,7 +117,7 @@ def b_delete_audio(*checkbox_list):
    change = False
    for i, checkbox in reversed(list(enumerate(checkbox_list))):
        if g_index + i < len(g_data_json):
-            if checkbox == True:
+            if checkbox:
                g_data_json.pop(g_index + i)
                change = True

@ -150,7 +154,7 @@ def b_audio_split(audio_breakpoint, *checkbox_list):
    global g_data_json, g_max_json_index
    checked_index = []
    for i, checkbox in enumerate(checkbox_list):
-        if checkbox == True and g_index + i < len(g_data_json):
+        if checkbox and g_index + i < len(g_data_json):
            checked_index.append(g_index + i)
    if len(checked_index) == 1:
        index = checked_index[0]
@ -182,7 +186,7 @@ def b_merge_audio(interval_r, *checkbox_list):
    audios_path = []
    audios_text = []
    for i, checkbox in enumerate(checkbox_list):
-        if checkbox == True and g_index + i < len(g_data_json):
+        if checkbox and g_index + i < len(g_data_json):
            checked_index.append(g_index + i)

    if len(checked_index) > 1:
@ -237,7 +241,7 @@ def b_save_list():

 def b_load_json():
    global g_data_json, g_max_json_index
-    with open(g_load_file, "r", encoding="utf-8") as file:
+    with open(g_load_file, encoding="utf-8") as file:
        g_data_json = file.readlines()
        g_data_json = [json.loads(line) for line in g_data_json]
        g_max_json_index = len(g_data_json) - 1
@ -245,7 +249,7 @@ def b_load_json():

 def b_load_list():
    global g_data_json, g_max_json_index
-    with open(g_load_file, "r", encoding="utf-8") as source:
+    with open(g_load_file, encoding="utf-8") as source:
        data_list = source.readlines()
        for _ in data_list:
            data = _.split("|")
--- a/gsv_tools/uvr5/bs_roformer/init.py
+++ b/gsv_tools/uvr5/bs_roformer/init.py
--- a/gsv_tools/uvr5/bs_roformer/attend.py
+++ b/gsv_tools/uvr5/bs_roformer/attend.py
@ -1,7 +1,7 @@
-from packaging import version
 import torch
-from torch import nn, einsum
 import torch.nn.functional as F
+from packaging import version
+from torch import einsum, nn


 def exists(val):
--- a/gsv_tools/uvr5/bs_roformer/bs_roformer.py
+++ b/gsv_tools/uvr5/bs_roformer/bs_roformer.py
@ -1,21 +1,20 @@
+from collections.abc import Callable
 from functools import partial

 import torch
-from torch import nn
-from torch.nn import Module, ModuleList
 import torch.nn.functional as F
+from einops import pack, rearrange, unpack
+from einops.layers.torch import Rearrange

-from bs_roformer.attend import Attend
-from torch.utils.checkpoint import checkpoint
-
-from typing import Tuple, Optional, Callable
 # from beartype.typing import Tuple, Optional, List, Callable
 # from beartype import beartype
-
 from rotary_embedding_torch import RotaryEmbedding
+from torch import nn
+from torch.nn import Module, ModuleList
+from torch.utils.checkpoint import checkpoint
+
+from bs_roformer.attend import Attend

-from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange

 # helper functions

@ -192,7 +191,7 @@ class Transformer(Module):

 class BandSplit(Module):
    # @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
+    def __init__(self, dim, dim_inputs: tuple[int, ...]):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_features = ModuleList([])
@ -206,7 +205,7 @@ class BandSplit(Module):
        x = x.split(self.dim_inputs, dim=-1)

        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
+        for split_input, to_feature in zip(x, self.to_features, strict=False):
            split_output = to_feature(split_input)
            outs.append(split_output)

@ -219,7 +218,7 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
    net = []
    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)

-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
+    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:], strict=False)):
        is_last = ind == (len(dims) - 2)

        net.append(nn.Linear(layer_dim_in, layer_dim_out))
@ -234,15 +233,13 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):

 class MaskEstimator(Module):
    # @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
+    def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_freqs = ModuleList([])
        dim_hidden = dim * mlp_expansion_factor

        for dim_in in dim_inputs:
-            net = []
-
            mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))

            self.to_freqs.append(mlp)
@ -252,7 +249,7 @@ class MaskEstimator(Module):

        outs = []

-        for band_features, mlp in zip(x, self.to_freqs):
+        for band_features, mlp in zip(x, self.to_freqs, strict=False):
            freq_out = mlp(band_features)
            outs.append(freq_out)

@ -339,7 +336,7 @@ class BSRoformer(Module):
        time_transformer_depth=2,
        freq_transformer_depth=2,
        linear_transformer_depth=0,
-        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+        freqs_per_bands: tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
        # in the paper, they divide into ~60 bands, test with 1 for starters
        dim_head=64,
        heads=8,
@ -352,10 +349,10 @@ class BSRoformer(Module):
        # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
        stft_win_length=2048,
        stft_normalized=False,
-        stft_window_fn: Optional[Callable] = None,
+        stft_window_fn: Callable | None = None,
        mask_estimator_depth=2,
        multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+        multi_stft_resolutions_window_sizes: tuple[int, ...] = (4096, 2048, 1024, 512, 256),
        multi_stft_hop_size=147,
        multi_stft_normalized=False,
        multi_stft_window_fn: Callable = torch.hann_window,
--- a/gsv_tools/uvr5/bs_roformer/mel_band_roformer.py
+++ b/gsv_tools/uvr5/bs_roformer/mel_band_roformer.py
@ -1,23 +1,20 @@
+from collections.abc import Callable
 from functools import partial

 import torch
-from torch import nn
-from torch.nn import Module, ModuleList
 import torch.nn.functional as F
+from einops import pack, rearrange, reduce, repeat, unpack
+from einops.layers.torch import Rearrange
+from librosa import filters

-from bs_roformer.attend import Attend
-from torch.utils.checkpoint import checkpoint
-
-from typing import Tuple, Optional, Callable
 # from beartype.typing import Tuple, Optional, List, Callable
 # from beartype import beartype
-
 from rotary_embedding_torch import RotaryEmbedding
+from torch import nn
+from torch.nn import Module, ModuleList
+from torch.utils.checkpoint import checkpoint

-from einops import rearrange, pack, unpack, reduce, repeat
-from einops.layers.torch import Rearrange
-
-from librosa import filters
+from bs_roformer.attend import Attend


 # helper functions
@ -201,7 +198,7 @@ class Transformer(Module):

 class BandSplit(Module):
    # @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
+    def __init__(self, dim, dim_inputs: tuple[int, ...]):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_features = ModuleList([])
@ -215,7 +212,7 @@ class BandSplit(Module):
        x = x.split(self.dim_inputs, dim=-1)

        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
+        for split_input, to_feature in zip(x, self.to_features, strict=False):
            split_output = to_feature(split_input)
            outs.append(split_output)

@ -228,7 +225,7 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
    net = []
    dims = (dim_in, *((dim_hidden,) * depth), dim_out)

-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
+    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:], strict=False)):
        is_last = ind == (len(dims) - 2)

        net.append(nn.Linear(layer_dim_in, layer_dim_out))
@ -243,15 +240,13 @@ def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):

 class MaskEstimator(Module):
    # @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
+    def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
        super().__init__()
        self.dim_inputs = dim_inputs
        self.to_freqs = ModuleList([])
        dim_hidden = dim * mlp_expansion_factor

        for dim_in in dim_inputs:
-            net = []
-
            mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))

            self.to_freqs.append(mlp)
@ -261,7 +256,7 @@ class MaskEstimator(Module):

        outs = []

-        for band_features, mlp in zip(x, self.to_freqs):
+        for band_features, mlp in zip(x, self.to_freqs, strict=False):
            freq_out = mlp(band_features)
            outs.append(freq_out)

@ -296,10 +291,10 @@ class MelBandRoformer(Module):
        # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
        stft_win_length=2048,
        stft_normalized=False,
-        stft_window_fn: Optional[Callable] = None,
+        stft_window_fn: Callable | None = None,
        mask_estimator_depth=1,
        multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+        multi_stft_resolutions_window_sizes: tuple[int, ...] = (4096, 2048, 1024, 512, 256),
        multi_stft_hop_size=147,
        multi_stft_normalized=False,
        multi_stft_window_fn: Callable = torch.hann_window,
--- a/gsv_tools/uvr5/bsroformer.py
+++ b/gsv_tools/uvr5/bsroformer.py
@ -10,12 +10,13 @@ import torch.nn as nn
 import yaml
 from tqdm import tqdm

+
 warnings.filterwarnings("ignore")


 class Roformer_Loader:
    def get_config(self, config_path):
-        with open(config_path, "r", encoding="utf-8") as f:
+        with open(config_path, encoding="utf-8") as f:
            # use fullloader to load tag !!python/tuple, code can be improved
            config = yaml.load(f, Loader=yaml.FullLoader)
        return config
@ -104,7 +105,7 @@ class Roformer_Loader:

            model = MelBandRoformer(**dict(self.config["model"]))
        else:
-            print("Error: Unknown model: {}".format(self.model_type))
+            print(f"Error: Unknown model: {self.model_type}")
            model = None
        return model

@ -192,9 +193,11 @@ class Roformer_Loader:
        progress_bar.close()

        if self.config["training"]["target_instrument"] is None:
-            return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources)}
+            return {k: v for k, v in zip(self.config["training"]["instruments"], estimated_sources, strict=False)}
        else:
-            return {k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources)}
+            return {
+                k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources, strict=False)
+            }

    def run_folder(self, input, vocal_root, others_root, format):
        self.model.eval()
@ -210,8 +213,8 @@ class Roformer_Loader:
        try:
            mix, sr = librosa.load(path, sr=sample_rate, mono=False)
        except Exception as e:
-            print("Can read track: {}".format(path))
-            print("Error message: {}".format(str(e)))
+            print(f"Can read track: {path}")
+            print(f"Error message: {str(e)}")
            return

        # in case if model only supports mono tracks
@ -232,17 +235,17 @@ class Roformer_Loader:
            other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument]
            other = mix_orig - res[target_instrument]  # caculate other instruments

-            path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, target_instrument)
-            path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other_instruments[0])
+            path_vocal = f"{vocal_root}/{file_base_name}_{target_instrument}.wav"
+            path_other = f"{others_root}/{file_base_name}_{other_instruments[0]}.wav"
            self.save_audio(path_vocal, res[target_instrument].T, sr, format)
            self.save_audio(path_other, other.T, sr, format)
        else:
            # if target instrument is not specified, save the first instrument as vocal and the rest as others
            vocal_inst = self.config["training"]["instruments"][0]
-            path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, vocal_inst)
+            path_vocal = f"{vocal_root}/{file_base_name}_{vocal_inst}.wav"
            self.save_audio(path_vocal, res[vocal_inst].T, sr, format)
            for other in self.config["training"]["instruments"][1:]:  # save other instruments
-                path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other)
+                path_other = f"{others_root}/{file_base_name}_{other}.wav"
                self.save_audio(path_other, res[other].T, sr, format)

    def save_audio(self, path, data, sr, format):
@ -253,7 +256,7 @@ class Roformer_Loader:
            sf.write(path, data, sr)
        else:
            sf.write(path, data, sr)
-            os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format))
+            os.system(f'ffmpeg -i "{path}" -vn "{path[:-3] + format}" -q:a 2 -y')
            try:
                os.remove(path)
            except:
@ -275,7 +278,7 @@ class Roformer_Loader:
            if self.model_type is None:
                # if model_type is still None, raise an error
                raise ValueError(
-                    "Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
+                    "Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'gsv_tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
                )
            self.config = self.get_default_config()
        else:
@ -290,12 +293,12 @@ class Roformer_Loader:
                    # else it's a mel_band_roformer model
                    self.model_type = "mel_band_roformer"

-        print("Detected model type: {}".format(self.model_type))
+        print(f"Detected model type: {self.model_type}")
        model = self.get_model_from_config()
        state_dict = torch.load(model_path, map_location="cpu")
        model.load_state_dict(state_dict)

-        if is_half == False:
+        if not is_half:
            self.model = model.to(device)
        else:
            self.model = model.half().to(device)
--- a/gsv_tools/uvr5/lib/lib_v5/dataset.py
+++ b/gsv_tools/uvr5/lib/lib_v5/dataset.py
@ -38,7 +38,7 @@ def make_pair(mix_dir, inst_dir):
        [os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts]
    )

-    filelist = list(zip(X_list, y_list))
+    filelist = list(zip(X_list, y_list, strict=False))

    return filelist

@ -138,10 +138,10 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset

 def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
    patch_list = []
-    patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset)
+    patch_dir = f"cs{cropsize}_sr{sr}_hl{hop_length}_nf{n_fft}_of{offset}"
    os.makedirs(patch_dir, exist_ok=True)

-    for i, (X_path, y_path) in enumerate(tqdm(filelist)):
+    for _i, (X_path, y_path) in enumerate(tqdm(filelist)):
        basename = os.path.splitext(os.path.basename(X_path))[0]

        X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
@ -154,7 +154,7 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):

        len_dataset = int(np.ceil(X.shape[2] / roi_size))
        for j in range(len_dataset):
-            outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
+            outpath = os.path.join(patch_dir, f"{basename}_p{j}.npz")
            start = j * roi_size
            if not os.path.exists(outpath):
                np.savez(
--- a/gsv_tools/uvr5/lib/lib_v5/layers.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

@ -64,7 +64,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -83,7 +83,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
--- a/gsv_tools/uvr5/lib/lib_v5/layers_123812KB.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers_123812KB.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

@ -64,7 +64,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -83,7 +83,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
--- a/gsv_tools/uvr5/lib/lib_v5/layers_123821KB.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers_123821KB.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

@ -64,7 +64,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -83,7 +83,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
--- a/gsv_tools/uvr5/lib/lib_v5/layers_33966KB.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers_33966KB.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

@ -64,7 +64,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -83,7 +83,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
--- a/gsv_tools/uvr5/lib/lib_v5/layers_537227KB.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers_537227KB.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

@ -64,7 +64,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -83,7 +83,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
--- a/gsv_tools/uvr5/lib/lib_v5/layers_537238KB.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers_537238KB.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class SeperableConv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(SeperableConv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -51,7 +51,7 @@ class SeperableConv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

@ -64,7 +64,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

@ -83,7 +83,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
--- a/gsv_tools/uvr5/lib/lib_v5/layers_new.py
+++ b/gsv_tools/uvr5/lib/lib_v5/layers_new.py
@ -7,7 +7,7 @@ from . import spec_utils

 class Conv2DBNActiv(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
-        super(Conv2DBNActiv, self).__init__()
+        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin,
@ -28,7 +28,7 @@ class Conv2DBNActiv(nn.Module):

 class Encoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
-        super(Encoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

@ -41,7 +41,7 @@ class Encoder(nn.Module):

 class Decoder(nn.Module):
    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
-        super(Decoder, self).__init__()
+        super().__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None
@ -64,7 +64,7 @@ class Decoder(nn.Module):

 class ASPPModule(nn.Module):
    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
-        super(ASPPModule, self).__init__()
+        super().__init__()
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
@ -94,7 +94,7 @@ class ASPPModule(nn.Module):

 class LSTMModule(nn.Module):
    def __init__(self, nin_conv, nin_lstm, nout_lstm):
-        super(LSTMModule, self).__init__()
+        super().__init__()
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
        self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
        self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())
--- a/gsv_tools/uvr5/lib/lib_v5/model_param_init.py
+++ b/gsv_tools/uvr5/lib/lib_v5/model_param_init.py
@ -1,6 +1,7 @@
 import json
 import pathlib

+
 default_param = {}
 default_param["bins"] = 768
 default_param["unstable_bins"] = 9  # training only
@ -41,7 +42,7 @@ def int_keys(d):
    return r


-class ModelParameters(object):
+class ModelParameters:
    def __init__(self, config_path=""):
        if ".pth" == pathlib.Path(config_path).suffix:
            import zipfile
@ -49,7 +50,7 @@ class ModelParameters(object):
            with zipfile.ZipFile(config_path, "r") as zip:
                self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
        elif ".json" == pathlib.Path(config_path).suffix:
-            with open(config_path, "r") as f:
+            with open(config_path) as f:
                self.param = json.loads(f.read(), object_pairs_hook=int_keys)
        else:
            self.param = default_param
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr16000_hl512.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr32000_hl512.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr33075_hl384.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl1024.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl256.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/1band_sr44100_hl512_cut.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/2band_32000.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/2band_32000.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/2band_44100_lofi.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/2band_48000.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/2band_48000.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/3band_44100.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/3band_44100.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/3band_44100_mid.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/3band_44100_msb2.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_mid.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_msb2.json
--- a/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json
+++ b/gsv_tools/uvr5/lib/lib_v5/modelparams/4band_44100_reverse.json
--- a/Show More
+++ b/Show More