Merge 1e59f757a29e7e9eff73fb65b5740af098fe064e into ed89a023378dabba9d4b6580235bb9742245816d

2025-10-11 02:12:11 +08:00 · 2025-06-11 15:19:08 +00:00 · 2025-06-11 15:19:08 +00:00 · bd8022916e
commit bd8022916e
parent ed89a02337 1e59f757a2
47 changed files with 1591 additions and 1254 deletions
--- a/.github/build_windows_packages.ps1
+++ b/.github/build_windows_packages.ps1
@ -162,7 +162,7 @@ Copy-Item -Path $curr -Destination $pkgName -Recurse
 $7zPath = "$pkgName.7z"
 $start = Get-Date
 Write-Host "Compress Starting at $start"
-& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -md=1g -ms=1g -mmc=500 -mfb=273 -mlc=0 -mlp=4 -mpb=4 -mc=8g -mmt=on -bsp1
+& "C:\Program Files\7-Zip\7z.exe" a -t7z "$7zPath" "$pkgName" -m0=lzma2 -mx=9 -mmt=on -bsp1
 $end = Get-Date
 Write-Host "Elapsed time: $($end - $start)"
 Get-ChildItem .
--- a/GPT_SoVITS/AR/models/t2s_lightning_module.py
+++ b/GPT_SoVITS/AR/models/t2s_lightning_module.py
@ -28,7 +28,8 @@ class Text2SemanticLightningModule(LightningModule):
                self.load_state_dict(
                    torch.load(
                        pretrained_s1,
-                        map_location="cpu", weights_only=False,
+                        map_location="cpu",
                        weights_only=False,
                    )["weight"],
                )
            )
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -32,19 +32,21 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
 from tools.audio_sr import AP_BWE
 from tools.i18n.i18n import I18nAuto, scan_language_list
 from tools.my_utils import load_audio
 from TTS_infer_pack.text_segmentation_method import splits
 from TTS_infer_pack.TextPreprocessor import TextPreprocessor
 from sv import SV
 resample_transform_dict = {}
 def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in resample_transform_dict:
-        resample_transform_dict[key] = torchaudio.transforms.Resample(
+        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
            sr0, sr1
        ).to(device)
    return resample_transform_dict[key](audio_tensor)
 language = os.environ.get("language", "Auto")
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
 i18n = I18nAuto(language=language)
@ -111,6 +113,7 @@ def speed_change(input_audio: np.ndarray, speed: float, sr: int):
    return processed_audio
 class DictToAttrRecursive(dict):
    def __init__(self, input_dict):
        super().__init__(input_dict)
@ -632,7 +635,9 @@ class TTS:
            )
            self.vocoder.remove_weight_norm()
            state_dict_g = torch.load(
-                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
+                "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
                map_location="cpu",
                weights_only=False,
            )
            print("loading vocoder", self.vocoder.load_state_dict(state_dict_g))
@ -752,11 +757,13 @@ class TTS:
        if raw_sr != self.configs.sampling_rate:
            audio = raw_audio.to(self.configs.device)
-            if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
+            if audio.shape[0] == 2:
                audio = audio.mean(0).unsqueeze(0)
            audio = resample(audio, raw_sr, self.configs.sampling_rate, self.configs.device)
        else:
            audio = raw_audio.to(self.configs.device)
-            if (audio.shape[0] == 2): audio = audio.mean(0).unsqueeze(0)
+            if audio.shape[0] == 2:
                audio = audio.mean(0).unsqueeze(0)
        maxx = audio.abs().max()
        if maxx > 1:
@ -775,7 +782,8 @@ class TTS:
            audio = resample(audio, self.configs.sampling_rate, 16000, self.configs.device)
            if self.configs.is_half:
                audio = audio.half()
-        else:audio=None
+        else:
            audio = None
        return spec, audio
    def _set_prompt_semantic(self, ref_wav_path: str):
@ -1073,7 +1081,10 @@ class TTS:
        ###### setting reference audio and prompt text preprocessing ########
        t0 = time.perf_counter()
-        if (ref_audio_path is not None) and (ref_audio_path != self.prompt_cache["ref_audio_path"] or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)):
+        if (ref_audio_path is not None) and (
            ref_audio_path != self.prompt_cache["ref_audio_path"]
            or (self.is_v2pro and self.prompt_cache["refer_spec"][0][1] is None)
        ):
            if not os.path.exists(ref_audio_path):
                raise ValueError(f"{ref_audio_path} not exists")
            self.set_ref_audio(ref_audio_path)
@ -1212,7 +1223,8 @@ class TTS:
                t_34 += t4 - t3
                refer_audio_spec = []
-                if self.is_v2pro:sv_emb=[]
+                if self.is_v2pro:
                    sv_emb = []
                for spec, audio_tensor in self.prompt_cache["refer_spec"]:
                    spec = spec.to(dtype=self.precision, device=self.configs.device)
                    refer_audio_spec.append(spec)
@ -1250,9 +1262,13 @@ class TTS:
                        )
                        _batch_phones = torch.cat(batch_phones).unsqueeze(0).to(self.configs.device)
                        if self.is_v2pro != True:
-                            _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
+                            _batch_audio_fragment = self.vits_model.decode(
                                all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor
                            ).detach()[0, 0, :]
                        else:
-                            _batch_audio_fragment = self.vits_model.decode(all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
+                            _batch_audio_fragment = self.vits_model.decode(
                                all_pred_semantic, _batch_phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
                            ).detach()[0, 0, :]
                        audio_frag_end_idx.insert(0, 0)
                        batch_audio_fragment = [
                            _batch_audio_fragment[audio_frag_end_idx[i - 1] : audio_frag_end_idx[i]]
@ -1266,9 +1282,13 @@ class TTS:
                                pred_semantic_list[i][-idx:].unsqueeze(0).unsqueeze(0)
                            )  # .unsqueeze(0)#mq要多unsqueeze一次
                            if self.is_v2pro != True:
-                                audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor).detach()[0, 0, :]
+                                audio_fragment = self.vits_model.decode(
                                    _pred_semantic, phones, refer_audio_spec, speed=speed_factor
                                ).detach()[0, 0, :]
                            else:
-                                audio_fragment = self.vits_model.decode(_pred_semantic, phones, refer_audio_spec, speed=speed_factor,sv_emb=sv_emb).detach()[0, 0, :]
+                                audio_fragment = self.vits_model.decode(
                                    _pred_semantic, phones, refer_audio_spec, speed=speed_factor, sv_emb=sv_emb
                                ).detach()[0, 0, :]
                            batch_audio_fragment.append(audio_fragment)  ###试试重建不带上prompt部分
                else:
                    if parallel_infer:
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@ -160,7 +160,9 @@ class TextPreprocessor:
                else:
                    for tmp in LangSegmenter.getTexts(text):
                        if langlist:
-                            if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
+                            if (tmp["lang"] == "en" and langlist[-1] == "en") or (
                                tmp["lang"] != "en" and langlist[-1] != "en"
                            ):
                                textlist[-1] += tmp["text"]
                                continue
                        if tmp["lang"] == "en":
--- a/GPT_SoVITS/eres2net/ERes2Net.py
+++ b/GPT_SoVITS/eres2net/ERes2Net.py
@ -8,7 +8,6 @@
 The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
 """
 import torch
 import math
 import torch.nn as nn
@ -16,15 +15,14 @@ import torch.nn.functional as F
 import pooling_layers as pooling_layers
 from fusion import AFF
 class ReLU(nn.Hardtanh):
 class ReLU(nn.Hardtanh):
    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)
    def __repr__(self):
-        inplace_str = 'inplace' if self.inplace else ''
+        inplace_str = "inplace" if self.inplace else ""
-        return self.__class__.__name__ + ' (' \
+        return self.__class__.__name__ + " (" + inplace_str + ")"
            + inplace_str + ')'
 class BasicBlockERes2Net(nn.Module):
@ -51,9 +49,9 @@ class BasicBlockERes2Net(nn.Module):
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
+                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                          stride=stride, bias=False),
+                nn.BatchNorm2d(self.expansion * planes),
-                nn.BatchNorm2d(self.expansion * planes))
+            )
        self.stride = stride
        self.width = width
        self.scale = scale
@ -86,6 +84,7 @@ class BasicBlockERes2Net(nn.Module):
        return out
 class BasicBlockERes2Net_diff_AFF(nn.Module):
    expansion = 2
@ -115,9 +114,9 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1,
+                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                          stride=stride, bias=False),
+                nn.BatchNorm2d(self.expansion * planes),
-                nn.BatchNorm2d(self.expansion * planes))
+            )
        self.stride = stride
        self.width = width
        self.scale = scale
@ -151,16 +150,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
        return out
 class ERes2Net(nn.Module):
-    def __init__(self,
+    def __init__(
        self,
        block=BasicBlockERes2Net,
        block_fuse=BasicBlockERes2Net_diff_AFF,
        num_blocks=[3, 4, 6, 3],
        m_channels=32,
        feat_dim=80,
        embedding_size=192,
-                 pooling_func='TSTP',
+        pooling_func="TSTP",
-                 two_emb_layer=False):
+        two_emb_layer=False,
    ):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
@ -176,20 +178,24 @@ class ERes2Net(nn.Module):
        self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
        # Downsampling module for each layer
-        self.layer1_downsample = nn.Conv2d(m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False)
+        self.layer1_downsample = nn.Conv2d(
-        self.layer2_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
+            m_channels * 2, m_channels * 4, kernel_size=3, stride=2, padding=1, bias=False
-        self.layer3_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
+        )
        self.layer2_downsample = nn.Conv2d(
            m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
        )
        self.layer3_downsample = nn.Conv2d(
            m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
        )
        # Bottom-up fusion module
        self.fuse_mode12 = AFF(channels=m_channels * 4)
        self.fuse_mode123 = AFF(channels=m_channels * 8)
        self.fuse_mode1234 = AFF(channels=m_channels * 16)
-        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
+        self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
-        self.pool = getattr(pooling_layers, pooling_func)(
+        self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
-            in_dim=self.stats_dim * block.expansion)
+        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
@ -247,14 +253,12 @@ class ERes2Net(nn.Module):
        return fuse_out1234
-if __name__ == '__main__':
+if __name__ == "__main__":
    x = torch.zeros(10, 300, 80)
-    model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func='TSTP')
+    model = ERes2Net(feat_dim=80, embedding_size=192, pooling_func="TSTP")
    model.eval()
    out = model(x)
    print(out.shape)  # torch.Size([10, 192])
    num_params = sum(param.numel() for param in model.parameters())
    print("{} M".format(num_params / 1e6))  # 6.61M
--- a/GPT_SoVITS/eres2net/ERes2NetV2.py
+++ b/GPT_SoVITS/eres2net/ERes2NetV2.py
@ -8,8 +8,6 @@
 both the model parameters and its computational cost.
 """
 import torch
 import math
 import torch.nn as nn
@ -17,19 +15,17 @@ import torch.nn.functional as F
 import pooling_layers as pooling_layers
 from fusion import AFF
 class ReLU(nn.Hardtanh):
 class ReLU(nn.Hardtanh):
    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)
    def __repr__(self):
-        inplace_str = 'inplace' if self.inplace else ''
+        inplace_str = "inplace" if self.inplace else ""
-        return self.__class__.__name__ + ' (' \
+        return self.__class__.__name__ + " (" + inplace_str + ")"
            + inplace_str + ')'
 class BasicBlockERes2NetV2(nn.Module):
    def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
        super(BasicBlockERes2NetV2, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
@ -52,12 +48,9 @@ class BasicBlockERes2NetV2(nn.Module):
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes,
+                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                          self.expansion * planes,
+                nn.BatchNorm2d(self.expansion * planes),
-                          kernel_size=1,
+            )
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale
@ -90,8 +83,8 @@ class BasicBlockERes2NetV2(nn.Module):
        return out
 class BasicBlockERes2NetV2AFF(nn.Module):
 class BasicBlockERes2NetV2AFF(nn.Module):
    def __init__(self, in_planes, planes, stride=1, baseWidth=26, scale=2, expansion=2):
        super(BasicBlockERes2NetV2AFF, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
@ -119,12 +112,9 @@ class BasicBlockERes2NetV2AFF(nn.Module):
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes,
+                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                          self.expansion * planes,
+                nn.BatchNorm2d(self.expansion * planes),
-                          kernel_size=1,
+            )
                          stride=stride,
                          bias=False),
                nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale
@ -158,8 +148,10 @@ class BasicBlockERes2NetV2AFF(nn.Module):
        return out
 class ERes2NetV2(nn.Module):
-    def __init__(self,
+    def __init__(
        self,
        block=BasicBlockERes2NetV2,
        block_fuse=BasicBlockERes2NetV2AFF,
        num_blocks=[3, 4, 6, 3],
@ -169,8 +161,9 @@ class ERes2NetV2(nn.Module):
        baseWidth=26,
        scale=2,
        expansion=2,
-                 pooling_func='TSTP',
+        pooling_func="TSTP",
-                 two_emb_layer=False):
+        two_emb_layer=False,
    ):
        super(ERes2NetV2, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
@ -181,42 +174,29 @@ class ERes2NetV2(nn.Module):
        self.scale = scale
        self.expansion = expansion
-        self.conv1 = nn.Conv2d(1,
+        self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
                               m_channels,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
-        self.layer1 = self._make_layer(block,
+        self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
-                                       m_channels,
+        self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
-                                       num_blocks[0],
+        self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
-                                       stride=1)
+        self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
        self.layer2 = self._make_layer(block,
                                       m_channels * 2,
                                       num_blocks[1],
                                       stride=2)
        self.layer3 = self._make_layer(block_fuse,
                                       m_channels * 4,
                                       num_blocks[2],
                                       stride=2)
        self.layer4 = self._make_layer(block_fuse,
                                       m_channels * 8,
                                       num_blocks[3],
                                       stride=2)
        # Downsampling module
-        self.layer3_ds = nn.Conv2d(m_channels * 4 * self.expansion, m_channels * 8 * self.expansion, kernel_size=3, \
+        self.layer3_ds = nn.Conv2d(
-                                   padding=1, stride=2, bias=False)
+            m_channels * 4 * self.expansion,
            m_channels * 8 * self.expansion,
            kernel_size=3,
            padding=1,
            stride=2,
            bias=False,
        )
        # Bottom-up fusion module
        self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4)
-        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
+        self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
-        self.pool = getattr(pooling_layers, pooling_func)(
+        self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * self.expansion)
-            in_dim=self.stats_dim * self.expansion)
+        self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, embedding_size)
        self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats,
                               embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
            self.seg_2 = nn.Linear(embedding_size, embedding_size)
@ -228,7 +208,11 @@ class ERes2NetV2(nn.Module):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
-            layers.append(block(self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion))
+            layers.append(
                block(
                    self.in_planes, planes, stride, baseWidth=self.baseWidth, scale=self.scale, expansion=self.expansion
                )
            )
            self.in_planes = planes * self.expansion
        return nn.Sequential(*layers)
@ -276,8 +260,8 @@ class ERes2NetV2(nn.Module):
        # else:
        #     return embed_a
 if __name__ == '__main__':
 if __name__ == "__main__":
    x = torch.randn(1, 300, 80)
    model = ERes2NetV2(feat_dim=80, embedding_size=192, m_channels=64, baseWidth=26, scale=2, expansion=2)
    model.eval()
@ -286,7 +270,3 @@ if __name__ == '__main__':
    macs, num_params = profile(model, inputs=(x,))
    print("Params: {} M".format(num_params / 1e6))  # 17.86 M
    print("MACs: {} G".format(macs / 1e9))  # 12.69 G
--- a/GPT_SoVITS/eres2net/ERes2Net_huge.py
+++ b/GPT_SoVITS/eres2net/ERes2Net_huge.py
@ -8,7 +8,6 @@
 ERes2Net-huge is an upgraded version of ERes2Net that uses a larger number of parameters to achieve better
 recognition performance. Parameters expansion, baseWidth, and scale can be modified to obtain optimal performance.
 """
 import pdb
 import torch
 import math
@ -17,15 +16,14 @@ import torch.nn.functional as F
 import pooling_layers as pooling_layers
 from fusion import AFF
 class ReLU(nn.Hardtanh):
 class ReLU(nn.Hardtanh):
    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)
    def __repr__(self):
-        inplace_str = 'inplace' if self.inplace else ''
+        inplace_str = "inplace" if self.inplace else ""
-        return self.__class__.__name__ + ' (' \
+        return self.__class__.__name__ + " (" + inplace_str + ")"
            + inplace_str + ')'
 class BasicBlockERes2Net(nn.Module):
@ -53,7 +51,8 @@ class BasicBlockERes2Net(nn.Module):
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(self.expansion * planes))
+                nn.BatchNorm2d(self.expansion * planes),
            )
        self.stride = stride
        self.width = width
        self.scale = scale
@ -86,6 +85,7 @@ class BasicBlockERes2Net(nn.Module):
        return out
 class BasicBlockERes2Net_diff_AFF(nn.Module):
    expansion = 4
@ -116,7 +116,8 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(self.expansion * planes))
+                nn.BatchNorm2d(self.expansion * planes),
            )
        self.stride = stride
        self.width = width
        self.scale = scale
@ -141,7 +142,6 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
            else:
                out = torch.cat((out, sp), 1)
        out = self.conv3(out)
        out = self.bn3(out)
@ -151,16 +151,19 @@ class BasicBlockERes2Net_diff_AFF(nn.Module):
        return out
 class ERes2Net(nn.Module):
-    def __init__(self,
+    def __init__(
        self,
        block=BasicBlockERes2Net,
        block_fuse=BasicBlockERes2Net_diff_AFF,
        num_blocks=[3, 4, 6, 3],
        m_channels=64,
        feat_dim=80,
        embedding_size=192,
-                 pooling_func='TSTP',
+        pooling_func="TSTP",
-                 two_emb_layer=False):
+        two_emb_layer=False,
    ):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
@ -176,17 +179,22 @@ class ERes2Net(nn.Module):
        self.layer3 = self._make_layer(block_fuse, m_channels * 4, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block_fuse, m_channels * 8, num_blocks[3], stride=2)
-        self.layer1_downsample = nn.Conv2d(m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False)
+        self.layer1_downsample = nn.Conv2d(
-        self.layer2_downsample = nn.Conv2d(m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False)
+            m_channels * 4, m_channels * 8, kernel_size=3, padding=1, stride=2, bias=False
-        self.layer3_downsample = nn.Conv2d(m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False)
+        )
        self.layer2_downsample = nn.Conv2d(
            m_channels * 8, m_channels * 16, kernel_size=3, padding=1, stride=2, bias=False
        )
        self.layer3_downsample = nn.Conv2d(
            m_channels * 16, m_channels * 32, kernel_size=3, padding=1, stride=2, bias=False
        )
        self.fuse_mode12 = AFF(channels=m_channels * 8)
        self.fuse_mode123 = AFF(channels=m_channels * 16)
        self.fuse_mode1234 = AFF(channels=m_channels * 32)
-        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == "TSDP" else 2
+        self.n_stats = 1 if pooling_func == "TAP" or pooling_func == "TSDP" else 2
-        self.pool = getattr(pooling_layers, pooling_func)(
+        self.pool = getattr(pooling_layers, pooling_func)(in_dim=self.stats_dim * block.expansion)
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, embedding_size)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False)
@ -244,14 +252,13 @@ class ERes2Net(nn.Module):
        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample).flatten(start_dim=1, end_dim=2)  # bs,20480,T
-        if(if_mean==False):
+        if if_mean == False:
            mean = fuse_out1234[0].transpose(1, 0)  # (T,20480),bs=T
        else:
            mean = fuse_out1234.mean(2)  # bs,20480
        mean_std = torch.cat([mean, torch.zeros_like(mean)], 1)
        return self.seg_1(mean_std)  # (T,192)
        # stats = self.pool(fuse_out1234)
        # if self.two_emb_layer:
        #     out = F.relu(embed_a)
@ -280,7 +287,3 @@ class ERes2Net(nn.Module):
        # print(fuse_out1234.shape)
        # print(fuse_out1234.flatten(start_dim=1,end_dim=2).shape)
        # pdb.set_trace()
--- a/GPT_SoVITS/eres2net/fusion.py
+++ b/GPT_SoVITS/eres2net/fusion.py
@ -6,7 +6,6 @@ import torch.nn as nn
 class AFF(nn.Module):
    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)
@ -26,4 +25,3 @@ class AFF(nn.Module):
        xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
        return xo
--- a/GPT_SoVITS/eres2net/kaldi.py
+++ b/GPT_SoVITS/eres2net/kaldi.py
@ -144,7 +144,7 @@ def _get_waveform_and_window_properties(
    )
    assert 0 < window_shift, "`window_shift` must be greater than 0"
    assert padded_window_size % 2 == 0, (
-        "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`"
+        "the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`"
    )
    assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
    assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
@ -441,7 +441,9 @@ def get_mel_banks(
    high_freq: float,
    vtln_low: float,
    vtln_high: float,
-    vtln_warp_factor: float,device=None,dtype=None
+    vtln_warp_factor: float,
    device=None,
    dtype=None,
 ) -> Tuple[Tensor, Tensor]:
    """
    Returns:
@ -457,9 +459,9 @@ def get_mel_banks(
    if high_freq <= 0.0:
        high_freq += nyquist
-    assert (
+    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), (
-        (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq)
+        "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
-    ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)
+    )
    # fft-bin width [think of it as Nyquist-freq / half-window-length]
    fft_bin_width = sample_freq / window_length_padded
@ -475,7 +477,7 @@ def get_mel_banks(
    assert vtln_warp_factor == 1.0 or (
        (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
-    ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format(
+    ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format(
        vtln_low, vtln_high, low_freq, high_freq
    )
@ -510,7 +512,10 @@ def get_mel_banks(
    return bins.to(device=device, dtype=dtype)  # , center_freqs
 cache = {}
 def fbank(
    waveform: Tensor,
    blackman_coeff: float = 0.42,
@ -620,10 +625,30 @@ def fbank(
    # size (num_mel_bins, padded_window_size // 2)
    # print(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp)
-    cache_key="%s-%s-%s-%s-%s-%s-%s-%s-%s-%s"%(num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype)
+    cache_key = "%s-%s-%s-%s-%s-%s-%s-%s-%s-%s" % (
        num_mel_bins,
        padded_window_size,
        sample_frequency,
        low_freq,
        high_freq,
        vtln_low,
        vtln_high,
        vtln_warp,
        device,
        dtype,
    )
    if cache_key not in cache:
        mel_energies = get_mel_banks(
-            num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp,device,dtype
+            num_mel_bins,
            padded_window_size,
            sample_frequency,
            low_freq,
            high_freq,
            vtln_low,
            vtln_high,
            vtln_warp,
            device,
            dtype,
        )
        cache[cache_key] = mel_energies
    else:
--- a/GPT_SoVITS/eres2net/pooling_layers.py
+++ b/GPT_SoVITS/eres2net/pooling_layers.py
@ -11,6 +11,7 @@ class TAP(nn.Module):
    """
    Temporal average pooling, only first-order mean is considered
    """
    def __init__(self, **kwargs):
        super(TAP, self).__init__()
@ -25,6 +26,7 @@ class TSDP(nn.Module):
    """
    Temporal standard deviation pooling, only second-order std is considered
    """
    def __init__(self, **kwargs):
        super(TSDP, self).__init__()
@ -41,6 +43,7 @@ class TSTP(nn.Module):
    x-vector
    Comment: simple concatenation can not make full use of both statistics
    """
    def __init__(self, **kwargs):
        super(TSTP, self).__init__()
@ -59,6 +62,7 @@ class ASTP(nn.Module):
    """Attentive statistics pooling: Channel- and context-dependent
    statistics pooling, first used in ECAPA_TDNN.
    """
    def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
        super(ASTP, self).__init__()
        self.global_context_att = global_context_att
@ -66,15 +70,10 @@ class ASTP(nn.Module):
        # Use Conv1d with stride == 1 rather than Linear, then we don't
        # need to transpose inputs.
        if global_context_att:
-            self.linear1 = nn.Conv1d(
+            self.linear1 = nn.Conv1d(in_dim * 3, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
                in_dim * 3, bottleneck_dim,
                kernel_size=1)  # equals W and b in the paper
        else:
-            self.linear1 = nn.Conv1d(
+            self.linear1 = nn.Conv1d(in_dim, bottleneck_dim, kernel_size=1)  # equals W and b in the paper
-                in_dim, bottleneck_dim,
+        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim, kernel_size=1)  # equals V and k in the paper
                kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
                                 kernel_size=1)  # equals V and k in the paper
    def forward(self, x):
        """
@ -88,15 +87,13 @@ class ASTP(nn.Module):
        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
-            context_std = torch.sqrt(
+            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
                torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            x_in = torch.cat((x, context_mean, context_std), dim=1)
        else:
            x_in = x
        # DON'T use ReLU here! ReLU may be hard to converge.
-        alpha = torch.tanh(
+        alpha = torch.tanh(self.linear1(x_in))  # alpha = F.relu(self.linear1(x_in))
            self.linear1(x_in))  # alpha = F.relu(self.linear1(x_in))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        var = torch.sum(alpha * (x**2), dim=2) - mean**2
--- a/GPT_SoVITS/export_torch_script_v3v4.py
+++ b/GPT_SoVITS/export_torch_script_v3v4.py
@ -435,6 +435,7 @@ class GPTSoVITSV3(torch.nn.Module):
        wav_gen = torch.cat(wav_gen_list, 2)
        return wav_gen[0][0][:wav_gen_length]
 class GPTSoVITSV4(torch.nn.Module):
    def __init__(self, gpt_sovits_half, cfm, hifigan):
        super().__init__()
@ -577,6 +578,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
 v3v4set = {"v3", "v4"}
 def get_sovits_weights(sovits_path):
    path_sovits_v3 = "GPT_SoVITS/pretrained_models/s2Gv3.pth"
    is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@ -707,7 +709,6 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"):
        sovits = get_sovits_weights("GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth")
        init_hifigan()
    dict_s1 = torch.load("GPT_SoVITS/pretrained_models/s1v3.ckpt")
    raw_t2s = get_raw_t2s_model(dict_s1).to(device)
    print("#### get_raw_t2s_model ####")
@ -751,9 +752,7 @@ def export_1(ref_wav_path,ref_wav_text,version="v3"):
    # phones1, bert1, norm_text1 = get_phones_and_bert(
    #     "你这老坏蛋，我找了你这么久，真没想到在这里找到你。他说。", "all_zh", "v3"
    # )
-    phones1, bert1, norm_text1 = get_phones_and_bert(
+    phones1, bert1, norm_text1 = get_phones_and_bert(ref_wav_text, "auto", "v3")
        ref_wav_text, "auto", "v3"
    )
    phones2, bert2, norm_text2 = get_phones_and_bert(
        "这是一个简单的示例，真没想到这么简单就完成了。The King and His Stories.Once there was a king. He likes to write stories, but his stories were not good. As people were afraid of him, they all said his stories were good.After reading them, the writer at once turned to the soldiers and said: Take me back to prison, please.",
        "auto",
@ -1201,7 +1200,6 @@ def export_2(version="v3"):
    gpt_sovits_v3v4 = gpt_sovits_v3 if version == "v3" else gpt_sovits_v4
    sr = 24000 if version == "v3" else 48000
    time.sleep(5)
    # print("thread:", torch.get_num_threads())
    # print("thread:", torch.get_num_interop_threads())
@ -1212,14 +1210,14 @@ def export_2(version="v3"):
        "汗流浃背了呀!老弟~ My uncle has two dogs. One is big and the other is small. He likes them very much. He often plays with them. He takes them for a walk every day. He says they are his good friends. He is very happy with them. 最后还是我得了 MVP....",
        gpt_sovits_v3v4,
        "out.wav",
-        sr
+        sr,
    )
    test_export(
        "你小子是什么来路.汗流浃背了呀!老弟~ My uncle has two dogs. He is very happy with them. 最后还是我得了 MVP!",
        gpt_sovits_v3v4,
        "out2.wav",
-        sr
+        sr,
    )
    # test_export(
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -1337,5 +1337,6 @@ if __name__ == "__main__":
        inbrowser=True,
        share=is_share,
        server_port=infer_ttswebui,
        show_api=False,
        # quiet=True,
    )
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@ -505,5 +505,6 @@ if __name__ == "__main__":
        inbrowser=True,
        share=is_share,
        server_port=infer_ttswebui,
        show_api=False,
        # quiet=True,
    )
--- a/GPT_SoVITS/module/data_utils.py
+++ b/GPT_SoVITS/module/data_utils.py
@ -252,9 +252,28 @@ class TextAudioSpeakerCollate:
            if self.is_v2Pro:
                sv_embs[i] = row[4]
        if self.is_v2Pro:
-            return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths,sv_embs
+            return (
                ssl_padded,
                ssl_lengths,
                spec_padded,
                spec_lengths,
                wav_padded,
                wav_lengths,
                text_padded,
                text_lengths,
                sv_embs,
            )
        else:
-            return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
+            return (
                ssl_padded,
                ssl_lengths,
                spec_padded,
                spec_lengths,
                wav_padded,
                wav_lengths,
                text_padded,
                text_lengths,
            )
 class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@ -586,12 +586,17 @@ class DiscriminatorS(torch.nn.Module):
        return x, fmap
 v2pro_set = {"v2Pro", "v2ProPlus"}
 class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False, version=None):
        super(MultiPeriodDiscriminator, self).__init__()
-        if version in v2pro_set:periods = [2, 3, 5, 7, 11,17,23]
+        if version in v2pro_set:
-        else:periods = [2, 3, 5, 7, 11]
+            periods = [2, 3, 5, 7, 11, 17, 23]
        else:
            periods = [2, 3, 5, 7, 11]
        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
@ -787,6 +792,7 @@ class CodePredictor(nn.Module):
            return pred_codes.transpose(0, 1)
 class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
@ -983,7 +989,14 @@ class SynthesizerTrn(nn.Module):
        quantized = self.quantizer.decode(codes)
        if self.semantic_frame_rate == "25hz":
            quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
-        x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, self.ge_to512(ge.transpose(2,1)).transpose(2,1)if self.is_v2pro else ge, speed)
+        x, m_p, logs_p, y_mask = self.enc_p(
            quantized,
            y_lengths,
            text,
            text_lengths,
            self.ge_to512(ge.transpose(2, 1)).transpose(2, 1) if self.is_v2pro else ge,
            speed,
        )
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=ge, reverse=True)
@ -996,6 +1009,7 @@ class SynthesizerTrn(nn.Module):
        quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
        return codes.transpose(0, 1)
 class CFM(torch.nn.Module):
    def __init__(self, in_channels, dit):
        super().__init__()
@ -1029,7 +1043,18 @@ class CFM(torch.nn.Module):
            t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
            # v_pred = model(x, t_tensor, d_tensor, **extra_args)
            v_pred, text_emb, dt = self.estimator(
-                x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False, infer=True, text_cache=text_cache, dt_cache=dt_cache
+                x,
                prompt_x,
                x_lens,
                t_tensor,
                d_tensor,
                mu,
                use_grad_ckpt=False,
                drop_audio_cond=False,
                drop_text=False,
                infer=True,
                text_cache=text_cache,
                dt_cache=dt_cache,
            )
            v_pred = v_pred.transpose(2, 1)
            if self.use_conditioner_cache:
@ -1048,7 +1073,7 @@ class CFM(torch.nn.Module):
                    drop_text=True,
                    infer=True,
                    text_cache=text_cfg_cache,
-                                    dt_cache=dt_cache
+                    dt_cache=dt_cache,
                )
                neg = neg.transpose(2, 1)
                if self.use_conditioner_cache:
--- a/GPT_SoVITS/module/modules.py
+++ b/GPT_SoVITS/module/modules.py
@ -1,5 +1,4 @@
 import math
 import pdb
 import numpy as np
 import torch
--- a/GPT_SoVITS/prepare_datasets/2-get-sv.py
+++ b/GPT_SoVITS/prepare_datasets/2-get-sv.py
@ -10,7 +10,6 @@ i_part = os.environ.get("i_part")
 all_parts = os.environ.get("all_parts")
 if "_CUDA_VISIBLE_DEVICES" in os.environ:
    os.environ["CUDA_VISIBLE_DEVICES"] = os.environ["_CUDA_VISIBLE_DEVICES"]
 from feature_extractor import cnhubert
 opt_dir = os.environ.get("opt_dir")
 sv_path = os.environ.get("sv_path")
@ -19,19 +18,18 @@ import torch
 is_half = eval(os.environ.get("is_half", "True")) and torch.cuda.is_available()
 import traceback
 import numpy as np
 from scipy.io import wavfile
 import torchaudio
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 sys.path.append(f"{now_dir}/GPT_SoVITS/eres2net")
-from tools.my_utils import load_audio, clean_path
+from tools.my_utils import clean_path
 from time import time as ttime
 import shutil
 from ERes2NetV2 import ERes2NetV2
 import kaldi as Kaldi
 def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    dir = os.path.dirname(path)
    name = os.path.basename(path)
@ -56,9 +54,10 @@ if torch.cuda.is_available():
 else:
    device = "cpu"
 class SV:
    def __init__(self, device, is_half):
-        pretrained_state = torch.load(sv_path, map_location='cpu')
+        pretrained_state = torch.load(sv_path, map_location="cpu")
        embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
        embedding_model.load_state_dict(pretrained_state)
        embedding_model.eval()
@ -73,15 +72,22 @@ class SV:
    def compute_embedding3(self, wav):  # (1,x)#-1~1
        with torch.no_grad():
            wav = self.res(wav)
-            if self.is_half==True:wav=wav.half()
+            if self.is_half == True:
-            feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav])
+                wav = wav.half()
            feat = torch.stack(
                [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
            )
            sv_emb = self.embedding_model.forward3(feat)
        return sv_emb
 sv = SV(device, is_half)
 def name2go(wav_name, wav_path):
    sv_cn_path = "%s/%s.pt" % (sv_cn_dir, wav_name)
-    if os.path.exists(sv_cn_path):return
+    if os.path.exists(sv_cn_path):
        return
    wav_path = "%s/%s" % (wav32dir, wav_name)
    wav32k, sr0 = torchaudio.load(wav_path)
    assert sr0 == 32000
--- a/GPT_SoVITS/process_ckpt.py
+++ b/GPT_SoVITS/process_ckpt.py
@ -17,7 +17,6 @@ def my_save(fea, path):  #####fix issue: torch.save doesn't support chinese path
    shutil.move(tmp_path, "%s/%s" % (dir, name))
 from io import BytesIO
 model_version2byte = {
@ -26,6 +25,8 @@ model_version2byte={
    "v2Pro": b"05",
    "v2ProPlus": b"06",
 }
 def my_save2(fea, path, model_version):
    bio = BytesIO()
    torch.save(fea, bio)
@ -50,7 +51,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
        if lora_rank:
            opt["lora_rank"] = lora_rank
            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
-        elif (model_version!=None and "Pro"in model_version):
+        elif model_version != None and "Pro" in model_version:
            my_save2(opt, "%s/%s.pth" % (hps.save_weight_dir, name), model_version)
        else:
            my_save(opt, "%s/%s.pth" % (hps.save_weight_dir, name))
@ -58,6 +59,7 @@ def savee(ckpt, name, epoch, steps, hps, model_version=None, lora_rank=None):
    except:
        return traceback.format_exc()
 """
 00:v1
 01:v2
--- a/GPT_SoVITS/s2_train.py
+++ b/GPT_SoVITS/s2_train.py
@ -36,7 +36,7 @@ from module.models import (
    MultiPeriodDiscriminator,
    SynthesizerTrn,
 )
-from process_ckpt import savee,my_save2
+from process_ckpt import savee
 torch.backends.cudnn.benchmark = False
 torch.backends.cudnn.deterministic = False
@ -91,7 +91,26 @@ def run(rank, n_gpus, hps):
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size,
-        [32,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,],
+        [
            32,
            300,
            400,
            500,
            600,
            700,
            800,
            900,
            1000,
            1100,
            1200,
            1300,
            1400,
            1500,
            1600,
            1700,
            1800,
            1900,
        ],
        num_replicas=n_gpus,
        rank=rank,
        shuffle=True,
@ -315,12 +334,39 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
        else:
            ssl, ssl_lengths, spec, spec_lengths, y, y_lengths, text, text_lengths = data
        if torch.cuda.is_available():
-            spec, spec_lengths = (spec.cuda(rank,non_blocking=True,),spec_lengths.cuda(rank,non_blocking=True,),)
+            spec, spec_lengths = (
-            y, y_lengths = (y.cuda(rank,non_blocking=True,),y_lengths.cuda(rank,non_blocking=True,),)
+                spec.cuda(
                    rank,
                    non_blocking=True,
                ),
                spec_lengths.cuda(
                    rank,
                    non_blocking=True,
                ),
            )
            y, y_lengths = (
                y.cuda(
                    rank,
                    non_blocking=True,
                ),
                y_lengths.cuda(
                    rank,
                    non_blocking=True,
                ),
            )
            ssl = ssl.cuda(rank, non_blocking=True)
            ssl.requires_grad = False
            # ssl_lengths = ssl_lengths.cuda(rank, non_blocking=True)
-            text, text_lengths = (text.cuda(rank,non_blocking=True,),text_lengths.cuda(rank,non_blocking=True,),)
+            text, text_lengths = (
                text.cuda(
                    rank,
                    non_blocking=True,
                ),
                text_lengths.cuda(
                    rank,
                    non_blocking=True,
                ),
            )
            if hps.model.version in {"v2Pro", "v2ProPlus"}:
                sv_emb = sv_emb.cuda(rank, non_blocking=True)
        else:
@ -334,9 +380,19 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
                sv_emb = sv_emb.to(device)
        with autocast(enabled=hps.train.fp16_run):
            if hps.model.version in {"v2Pro", "v2ProPlus"}:
-                (y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl) = net_g(ssl, spec, spec_lengths, text, text_lengths,sv_emb)
+                (y_hat, kl_ssl, ids_slice, x_mask, z_mask, (z, z_p, m_p, logs_p, m_q, logs_q), stats_ssl) = net_g(
                    ssl, spec, spec_lengths, text, text_lengths, sv_emb
                )
            else:
-                (y_hat,kl_ssl,ids_slice,x_mask,z_mask,(z, z_p, m_p, logs_p, m_q, logs_q),stats_ssl,) = net_g(ssl, spec, spec_lengths, text, text_lengths)
+                (
                    y_hat,
                    kl_ssl,
                    ids_slice,
                    x_mask,
                    z_mask,
                    (z, z_p, m_p, logs_p, m_q, logs_q),
                    stats_ssl,
                ) = net_g(ssl, spec, spec_lengths, text, text_lengths)
            mel = spec_to_mel_torch(
                spec,
@ -508,7 +564,14 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
                % (
                    hps.name,
                    epoch,
-                    savee(ckpt,hps.name + "_e%s_s%s" % (epoch, global_step),epoch,global_step,hps,model_version=None if hps.model.version not in {"v2Pro","v2ProPlus"}else hps.model.version),
+                    savee(
                        ckpt,
                        hps.name + "_e%s_s%s" % (epoch, global_step),
                        epoch,
                        global_step,
                        hps,
                        model_version=None if hps.model.version not in {"v2Pro", "v2ProPlus"} else hps.model.version,
                    ),
                )
            )
--- a/GPT_SoVITS/sv.py
+++ b/GPT_SoVITS/sv.py
@ -1,11 +1,16 @@
-import sys,os,torch
+import sys
 import os
 import torch
 sys.path.append(f"{os.getcwd()}/GPT_SoVITS/eres2net")
 sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
 from ERes2NetV2 import ERes2NetV2
 import kaldi as Kaldi
 class SV:
    def __init__(self, device, is_half):
-        pretrained_state = torch.load(sv_path, map_location='cpu', weights_only=False)
+        pretrained_state = torch.load(sv_path, map_location="cpu", weights_only=False)
        embedding_model = ERes2NetV2(baseWidth=24, scale=4, expansion=4)
        embedding_model.load_state_dict(pretrained_state)
        embedding_model.eval()
@ -18,7 +23,10 @@ class SV:
    def compute_embedding3(self, wav):
        with torch.no_grad():
-            if self.is_half==True:wav=wav.half()
+            if self.is_half == True:
-            feat = torch.stack([Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav])
+                wav = wav.half()
            feat = torch.stack(
                [Kaldi.fbank(wav0.unsqueeze(0), num_mel_bins=80, sample_frequency=16000, dither=0) for wav0 in wav]
            )
            sv_emb = self.embedding_model.forward3(feat)
        return sv_emb
--- a/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
+++ b/GPT_SoVITS/text/LangSegmenter/langsegmenter.py
@ -3,19 +3,25 @@ import re
 # jieba静音
 import jieba
 jieba.setLogLevel(logging.CRITICAL)
 # 更改fast_langdetect大模型位置
 from pathlib import Path
 import fast_langdetect
-fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
+
 fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
    fast_langdetect.infer.LangDetectConfig(
        cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
    )
 )
 from split_lang import LangSplitter
 def full_en(text):
-    pattern = r'^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
+    pattern = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
    return bool(re.match(pattern, text))
@ -34,7 +40,7 @@ def full_cjk(text):
        (0x2EBF0, 0x2EE5D),  # CJK Extension H
    ]
-    pattern = r'[0-9、-〜。！？.!?… /]+$'
+    pattern = r"[0-9、-〜。！？.!?… /]+$"
    cjk_text = ""
    for char in text:
@ -53,28 +59,28 @@ def split_jako(tag_lang,item):
    lang_list: list[dict] = []
    tag = 0
-    for match in re.finditer(pattern, item['text']):
+    for match in re.finditer(pattern, item["text"]):
        if match.start() > tag:
-            lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
+            lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
        tag = match.end()
-        lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
+        lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
-    if tag < len(item['text']):
+    if tag < len(item["text"]):
-        lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
+        lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
    return lang_list
 def merge_lang(lang_list, item):
-    if lang_list and item['lang'] == lang_list[-1]['lang']:
+    if lang_list and item["lang"] == lang_list[-1]["lang"]:
-        lang_list[-1]['text'] += item['text']
+        lang_list[-1]["text"] += item["text"]
    else:
        lang_list.append(item)
    return lang_list
-class LangSegmenter():
+class LangSegmenter:
    # 默认过滤器, 基于gsv目前四种语言
    DEFAULT_LANG_MAP = {
        "zh": "zh",
@ -87,7 +93,6 @@ class LangSegmenter():
        "en": "en",
    }
    def getTexts(text):
        lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
        substr = lang_splitter.split_by_lang(text=text)
@ -95,18 +100,18 @@ class LangSegmenter():
        lang_list: list[dict] = []
        for _, item in enumerate(substr):
-            dict_item = {'lang':item.lang,'text':item.text}
+            dict_item = {"lang": item.lang, "text": item.text}
            # 处理短英文被识别为其他语言的问题
-            if full_en(dict_item['text']):  
+            if full_en(dict_item["text"]):
-                dict_item['lang'] = 'en'
+                dict_item["lang"] = "en"
                lang_list = merge_lang(lang_list, dict_item)
                continue
            # 处理非日语夹日文的问题(不包含CJK)
            ja_list: list[dict] = []
-            if dict_item['lang'] != 'ja':
+            if dict_item["lang"] != "ja":
-                ja_list = split_jako('ja',dict_item)
+                ja_list = split_jako("ja", dict_item)
            if not ja_list:
                ja_list.append(dict_item)
@ -115,8 +120,8 @@ class LangSegmenter():
            ko_list: list[dict] = []
            temp_list: list[dict] = []
            for _, ko_item in enumerate(ja_list):
-                if ko_item["lang"] != 'ko':
+                if ko_item["lang"] != "ko":
-                    ko_list = split_jako('ko',ko_item)
+                    ko_list = split_jako("ko", ko_item)
                if ko_list:
                    temp_list.extend(ko_list)
@ -126,10 +131,10 @@ class LangSegmenter():
            # 未存在非日韩文夹日韩文
            if len(temp_list) == 1:
                # 未知语言检查是否为CJK
-                if dict_item['lang'] == 'x':
+                if dict_item["lang"] == "x":
-                    cjk_text = full_cjk(dict_item['text'])
+                    cjk_text = full_cjk(dict_item["text"])
                    if cjk_text:
-                        dict_item = {'lang':'zh','text':cjk_text}
+                        dict_item = {"lang": "zh", "text": cjk_text}
                        lang_list = merge_lang(lang_list, dict_item)
                    else:
                        lang_list = merge_lang(lang_list, dict_item)
@ -141,10 +146,10 @@ class LangSegmenter():
            # 存在非日韩文夹日韩文
            for _, temp_item in enumerate(temp_list):
                # 未知语言检查是否为CJK
-                if temp_item['lang'] == 'x':
+                if temp_item["lang"] == "x":
-                    cjk_text = full_cjk(dict_item['text'])
+                    cjk_text = full_cjk(dict_item["text"])
                    if cjk_text:
-                        dict_item = {'lang':'zh','text':cjk_text}
+                        dict_item = {"lang": "zh", "text": cjk_text}
                        lang_list = merge_lang(lang_list, dict_item)
                    else:
                        lang_list = merge_lang(lang_list, dict_item)
@ -154,13 +159,13 @@ class LangSegmenter():
        temp_list = lang_list
        lang_list = []
        for _, temp_item in enumerate(temp_list):
-            if temp_item['lang'] == 'x':
+            if temp_item["lang"] == "x":
                if lang_list:
-                    temp_item['lang'] = lang_list[-1]['lang']
+                    temp_item["lang"] = lang_list[-1]["lang"]
                elif len(temp_list) > 1:
-                    temp_item['lang'] = temp_list[1]['lang']
+                    temp_item["lang"] = temp_list[1]["lang"]
                else:
-                    temp_item['lang'] = 'zh'
+                    temp_item["lang"] = "zh"
            lang_list = merge_lang(lang_list, temp_item)
--- a/GPT_SoVITS/text/g2pw/onnx_api.py
+++ b/GPT_SoVITS/text/g2pw/onnx_api.py
@ -3,7 +3,6 @@
 import json
 import os
 import traceback
 import warnings
 import zipfile
 from typing import Any, Dict, List, Tuple
@ -23,7 +22,8 @@ from .utils import load_config
 onnxruntime.set_default_logger_severity(3)
 try:
    onnxruntime.preload_dlls()
-except:pass
+except:
    pass
    # traceback.print_exc()
 warnings.filterwarnings("ignore")
--- a/GPT_SoVITS/text/tone_sandhi.py
+++ b/GPT_SoVITS/text/tone_sandhi.py
@ -655,11 +655,7 @@ class ToneSandhi:
        while i < len(seg):
            word, pos = seg[i]
            merged = False
-            if (
+            if i - 1 >= 0 and word == "一" and i + 1 < len(seg):
                i - 1 >= 0
                and word == "一"
                and i + 1 < len(seg)
            ):
                last = new_seg[-1] if new_seg else seg[i - 1]
                if last[0] == seg[i + 1][0] and last[1] == "v" and seg[i + 1][1] == "v":
                    combined = last[0] + "一" + seg[i + 1][0]
--- a/README.md
+++ b/README.md
@ -1,3 +1,5 @@
 #
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
@ -7,12 +9,17 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
 <a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
+[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
 [![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
 [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
-[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
-[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
 [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
 [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 **English** | [**中文简体**](./docs/cn/README.md) | [**日本語**](./docs/ja/README.md) | [**한국어**](./docs/ko/README.md) | [**Türkçe**](./docs/tr/README.md)
@ -20,7 +27,7 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
 ---
-## Features:
+## Features
 1. **Zero-shot TTS:** Input a 5-second vocal sample and experience instant text-to-speech conversion.
@ -34,13 +41,13 @@ A Powerful Few-shot Voice Conversion and Text-to-Speech WebUI.<br><br>
 Unseen speakers few-shot fine-tuning demo:
-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
-**User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+<!-- **User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** -->
 ## Installation
-For users in China, you can [click here](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to use AutoDL Cloud Docker to experience the full functionality online.
+For users in China, you can use [AutoDL Cloud Docker](https://www.codewithgpu.com/i/RVC-Boss/GPT-SoVITS/GPT-SoVITS-Official) to experience the full functionality online.
 ### Tested Environments
@ -193,10 +200,8 @@ docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GP
 The TTS annotation .list file format:
-```
+```text
 vocal_path|speaker_name|language|text
 ```
 Language dictionary:
@ -209,10 +214,8 @@ Language dictionary:
 Example:
-```
+```text
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
 ## Finetune and inference
@ -222,7 +225,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 #### Integrated Package Users
 Double-click `go-webui.bat` or use `go-webui.ps1`
 If you want to switch to V1, then double-click `go-webui-v1.bat` or use `go-webui-v1.ps1`
 #### Others
@ -230,14 +232,6 @@ if you want to switch to V1,then double-click`go-webui-v1.bat` or use `go-webui-
 python webui.py <language(optional)>
 ```
 if you want to switch to V1,then
 ```bash
 python webui.py v1 <language(optional)>
 ```
 Or manually switch version in WebUI
 ### Finetune
 #### Path Auto-filling is now supported
@ -253,7 +247,7 @@ Or maunally switch version in WebUI
 #### Integrated Package Users
-Double-click `go-webui-v2.bat` or use `go-webui-v2.ps1` ,then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
+Double-click `go-webui.bat` or use `go-webui.ps1` , then open the inference webui at `1-GPT-SoVITS-TTS/1C-inference`
 #### Others
@ -333,7 +327,7 @@ Use v4 from v1/v2/v3 environment:
 New Features:
 1. Slightly higher VRAM usage than v2, surpassing v4's performance, with v2's hardware cost and speed.
-   [more details](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
+   [more details](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
 2. v1/v2 and the v2Pro series share the same characteristics, while v3/v4 have similar features. For training sets with average audio quality, v1/v2/v2Pro can deliver decent results, but v3/v4 cannot. Additionally, the synthesized tone and timbre of v3/v4 lean more toward the reference audio rather than the overall training set.
@ -373,11 +367,6 @@ Use the command line to open the WebUI for UVR5
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- If you can't open a browser, follow the format below for UVR processing,This is using mdxnet for audio processing
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 This is how the audio segmentation of the dataset is done using the command line
 ```bash
@ -453,5 +442,5 @@ Thankful to @Naozumi520 for providing the Cantonese training set and for the gui
 ## Thanks to all contributors for their efforts
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt="Contributors"/>
 </a>
--- a/api.py
+++ b/api.py
@ -199,6 +199,8 @@ def is_full(*items):  # 任意一项为空返回False
 bigvgan_model = hifigan_model = sv_cn_model = None
 def clean_hifigan_model():
    global hifigan_model
    if hifigan_model:
@ -208,6 +210,8 @@ def clean_hifigan_model():
            torch.cuda.empty_cache()
        except:
            pass
 def clean_bigvgan_model():
    global bigvgan_model
    if bigvgan_model:
@ -217,6 +221,8 @@ def clean_bigvgan_model():
            torch.cuda.empty_cache()
        except:
            pass
 def clean_sv_cn_model():
    global sv_cn_model
    if sv_cn_model:
@ -262,7 +268,9 @@ def init_hifigan():
    hifigan_model.eval()
    hifigan_model.remove_weight_norm()
    state_dict_g = torch.load(
-        "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
+        "%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
        map_location="cpu",
        weights_only=False,
    )
    print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
    if is_half == True:
@ -272,19 +280,21 @@ def init_hifigan():
 from sv import SV
 def init_sv_cn():
    global hifigan_model, bigvgan_model, sv_cn_model
    sv_cn_model = SV(device, is_half)
 resample_transform_dict = {}
 def resample(audio_tensor, sr0, sr1, device):
    global resample_transform_dict
    key = "%s-%s-%s" % (sr0, sr1, str(device))
    if key not in resample_transform_dict:
-        resample_transform_dict[key] = torchaudio.transforms.Resample(
+        resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
            sr0, sr1
        ).to(device)
    return resample_transform_dict[key](audio_tensor)
@ -370,6 +380,7 @@ from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
 def get_sovits_weights(sovits_path):
    from config import pretrained_sovits_name
    path_sovits_v3 = pretrained_sovits_name["v3"]
    path_sovits_v4 = pretrained_sovits_name["v4"]
    is_exist_s2gv3 = os.path.exists(path_sovits_v3)
@ -632,11 +643,13 @@ def get_spepc(hps, filename, dtype, device, is_v2pro=False):
    audio, sr0 = torchaudio.load(filename)
    if sr0 != sr1:
        audio = audio.to(device)
-        if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
+        if audio.shape[0] == 2:
            audio = audio.mean(0).unsqueeze(0)
        audio = resample(audio, sr0, sr1, device)
    else:
        audio = audio.to(device)
-        if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
+        if audio.shape[0] == 2:
            audio = audio.mean(0).unsqueeze(0)
    maxx = audio.abs().max()
    if maxx > 1:
@ -937,14 +950,22 @@ def get_tts_wav(
        if version not in {"v3", "v4"}:
            if is_v2pro:
                audio = (
-                    vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed,sv_emb=sv_emb)
+                    vq_model.decode(
                        pred_semantic,
                        torch.LongTensor(phones2).to(device).unsqueeze(0),
                        refers,
                        speed=speed,
                        sv_emb=sv_emb,
                    )
                    .detach()
                    .cpu()
                    .numpy()[0, 0]
                )
            else:
                audio = (
-                    vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)
+                    vq_model.decode(
                        pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
                    )
                    .detach()
                    .cpu()
                    .numpy()[0, 0]
@ -1108,7 +1129,6 @@ def handle(
        if not default_refer.is_ready():
            return JSONResponse({"code": 400, "message": "未指定参考音频且接口无预设"}, status_code=400)
    if cut_punc == None:
        text = cut_text(text, default_cut_punc)
    else:
--- a/config.py
+++ b/config.py
@ -144,6 +144,7 @@ webui_port_subfix = 9871
 api_port = 9880
 # Thanks to the contribution of @Karasukaigan and @XXXXRT666
 def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, float]:
    cpu = torch.device("cpu")
@ -158,9 +159,12 @@ def get_device_dtype_sm(idx: int) -> tuple[torch.device, torch.dtype, float, flo
    major, minor = capability
    sm_version = major + minor / 10.0
    is_16_series = bool(re.search(r"16\d{2}", name)) and sm_version == 7.5
-    if mem_gb < 4 or sm_version < 5.3:return cpu, torch.float32, 0.0, 0.0
+    if mem_gb < 4 or sm_version < 5.3:
-    if sm_version == 6.1 or is_16_series==True:return cuda, torch.float32, sm_version, mem_gb
+        return cpu, torch.float32, 0.0, 0.0
-    if sm_version > 6.1:return cuda, torch.float16, sm_version, mem_gb
+    if sm_version == 6.1 or is_16_series == True:
        return cuda, torch.float32, sm_version, mem_gb
    if sm_version > 6.1:
        return cuda, torch.float16, sm_version, mem_gb
    return cpu, torch.float32, 0.0, 0.0
--- a/docker_build.ps1
+++ b/docker_build.ps1
@ -0,0 +1,73 @@
 # docker_build.ps1 - builds the GPT-SoVITS Docker image locally with `docker build`,
 # tagging it "<USERNAME>/gpt-sovits:local". Options: --cuda 12.6|12.8, --lite, -h/--help.
 # Treat every error as terminating so a failed step stops the script.
 $ErrorActionPreference = "Stop"
 # Switch to the directory containing this script; the build context passed to
 # `docker build` at the bottom is ".".
 $ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
 Set-Location $ScriptDir
 # Bail out early when no `docker` command can be resolved.
 if (-not (Get-Command "docker" -ErrorAction SilentlyContinue)) {
    Write-Host "Docker Not Found"
    exit 1
 }
 # Defaults used when the corresponding option is not passed.
 # NOTE(review): the help text labels --cuda as REQUIRED, yet 12.6 is used when
 # only other options (e.g. --lite) are given - confirm which is intended.
 $Lite = $false
 $CudaVersion = "12.6"
 # Prints the usage text to the host.
 function Write-Help {
    Write-Host @"
 Usage: powershell -File docker_build.ps1 [OPTIONS]
 Options:
  --cuda 12.6|12.8    Specify the CUDA VERSION (REQUIRED)
  --lite              Build a Lite Image
  -h, --help          Show this help message and exit
 Examples:
  powershell -File docker_build.ps1 --cuda 12.6 --lite
 "@
 }
 # With no arguments at all, show the help and exit successfully.
 if ($args.Count -eq 0) {
    Write-Help
    exit 0
 }
 # Manual argument parsing: walk $args by index so that --cuda can consume the
 # argument that follows it.
 for ($i = 0; $i -lt $args.Count; $i++) {
    switch ($args[$i]) {
        '--cuda' {
            # Advance to the option's value and validate it against the two accepted versions.
            # NOTE(review): when --cuda is the last argument, $val is presumably $null
            # and is rejected by the check below with an empty value in the message.
            $i++
            $val = $args[$i]
            if ($val -ne "12.6" -and $val -ne "12.8") {
                Write-Host "Error: Invalid CUDA_VERSION: $val"
                Write-Host "Choose From: [12.6, 12.8]"
                exit 1
            }
            $CudaVersion = $val
        }
        '--lite' {
            $Lite = $true
        }
        '-h' { Write-Help; exit 0 }
        '--help' { Write-Help; exit 0 }
        default {
            # Any unrecognised argument aborts the build with a non-zero exit code.
            Write-Host "Unknown Argument: $($args[$i])"
            Write-Host "Use -h or --help to see available options."
            exit 1
        }
    }
 }
 # Choose the target platform from the host CPU architecture code reported by
 # Win32_Processor: code 9 selects linux/amd64, every other value selects linux/arm64.
 # NOTE(review): 9 is x64 per the Win32_Processor documentation; other non-ARM
 # values (e.g. 32-bit x86) would also be mapped to linux/arm64 - confirm acceptable.
 $arch = (Get-CimInstance Win32_Processor).Architecture
 $TargetPlatform = if ($arch -eq 9) { "linux/amd64" } else { "linux/arm64" }
 # TORCH_BASE build argument mirrors the --lite switch.
 if ($Lite) {
    $TorchBase = "lite"
 } else {
    $TorchBase = "full"
 }
 # Run the build with the collected settings passed as build arguments.
 # NOTE(review): $Lite is a boolean and presumably expands to "True"/"False" here;
 # confirm the Dockerfile's handling of LITE accepts that casing.
 docker build `
    --build-arg CUDA_VERSION=$CudaVersion `
    --build-arg LITE=$Lite `
    --build-arg TARGETPLATFORM=$TargetPlatform `
    --build-arg TORCH_BASE=$TorchBase `
    -t "$env:USERNAME/gpt-sovits:local" `
    .
--- a/docker_build.sh
+++ b/docker_build.sh
@ -25,7 +25,7 @@ print_help() {
    echo "  -h, --help          Show this help message and exit"
    echo ""
    echo "Examples:"
-    echo "  bash docker_build.sh --cuda 12.6 --funasr --faster-whisper"
+    echo "  bash docker_build.sh --cuda 12.6"
 }
 # Show help if no arguments provided
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@ -230,6 +230,7 @@
 ## 202403
 - 2024.03.06 [PR#675](https://github.com/RVC-Boss/GPT-SoVITS/pull/675)
  - 内容: Faster Whisper 在没有 CUDA 可用时自动使用 CPU 推理.
  - 类型: 优化
  - 提交: ShiroDoMain
@ -409,7 +410,7 @@
 - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
  - 内容: **新增 GPT-SoVITS V3 模型, 需要 14G 显存进行微调.**
-  - 类型: 新功能 (特性参阅 [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
+  - 类型: 新功能 (特性参阅 [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
  - 提交: RVC-Boss
 - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
  - 内容: 更新项目多语言文档.
@ -475,6 +476,7 @@
    - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
    - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
 - 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
  - 内容: **为 SoVITS v3 适配并行推理**.
  - 类型: 新功能
  - 提交: ChasonJiang
--- a/docs/cn/README.md
+++ b/docs/cn/README.md
@ -1,3 +1,5 @@
 #
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
@ -7,12 +9,17 @@
 <a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
+[![Train In Colab](https://img.shields.io/badge/Colab-训练-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
 [![Infer In Colab](https://img.shields.io/badge/Colab-推理-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
 [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
-[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
-[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
 [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
 [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
 [![Change Log](https://img.shields.io/badge/更新日志-查看更新-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/cn/Changelog_CN.md)
 [**English**](../../README.md) | **中文简体** | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
@ -36,7 +43,7 @@
 <https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
-**用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+<!-- **用户手册: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** -->
 ## 安装
@ -193,7 +200,7 @@ docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GP
 文本到语音 (TTS) 注释 .list 文件格式:
-```
+```text
 vocal_path|speaker_name|language|text
 ```
@ -207,7 +214,7 @@ vocal_path|speaker_name|language|text
 示例:
-```
+```text
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
 ```
@ -218,7 +225,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
 #### 整合包用户
 双击`go-webui.bat`或者使用`go-webui.ps1`
 若想使用 V1,则双击`go-webui-v1.bat`或者使用`go-webui-v1.ps1`
 #### 其他
@ -226,14 +232,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|zh|我爱玩原神.
 python webui.py <language(optional)>
 ```
 若想使用 V1,则
 ```bash
 python webui.py v1 <language(optional)>
 ```
 或者在 webUI 内动态切换
 ### 微调
 #### 现已支持自动填充路径
@ -449,5 +447,5 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 ## 感谢所有贡献者的努力
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt="Contributors"/>
 </a>
--- a/docs/en/Changelog_EN.md
+++ b/docs/en/Changelog_EN.md
@ -409,7 +409,7 @@
 - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
  - Content: **Added GPT-SoVITS V3 model, which requires 14GB VRAM for fine-tuning.**
-  - Type: Feature (Refer to [Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
+  - Type: Feature (Refer to [Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
  - Contributor: RVC-Boss
 - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
  - Content: Updated multilingual project documentation.
@ -478,9 +478,6 @@
  - Content: **Enabled parallel inference for SoVITS v3.**
  - Type: Feature
  - Contributor: ChasonJiang
 - Fixed other minor bugs.
 - Integrated package fixes for ONNX runtime GPU inference support:
  - Type: Fix
  - Details:
--- a/docs/ja/Changelog_JA.md
+++ b/docs/ja/Changelog_JA.md
@ -409,7 +409,7 @@
 - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
  - 内容: **GPT-SoVITS V3 モデルを追加。ファインチューニングには 14GB の VRAM が必要。**
-  - タイプ: 新機能（[Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7))参照）
+  - タイプ: 新機能（[Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)参照）
  - 貢献者: RVC-Boss
 - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
  - 内容: 多言語プロジェクトドキュメントを更新。
@ -475,6 +475,7 @@
    - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
    - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
 - 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
  - 内容: **SoVITS v3 の並列推論を有効化。**
  - タイプ: 新機能
  - 貢献者: ChasonJiang
--- a/docs/ja/README.md
+++ b/docs/ja/README.md
@ -1,3 +1,5 @@
 #
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
@ -5,12 +7,17 @@
 [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
-<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
+[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
 [![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
 [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
-[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
-[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
 [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
 [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 [**English**](../../README.md) | [**中文简体**](../cn/README.md) | **日本語** | [**한국어**](../ko/README.md) | [**Türkçe**](../tr/README.md)
@ -18,7 +25,7 @@
 ---
-## 機能:
+## 機能
 1. **Zero-Shot TTS:** たった 5 秒間の音声サンプルで、即座にテキストからその音声に変換できます.
@ -32,9 +39,9 @@
 声の事前学習無しかつ Few-Shot でトレーニングされたモデルのデモ:
-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
-**ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+<!-- **ユーザーマニュアル: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** -->
 ## インストール
@ -185,7 +192,7 @@ docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GP
 TTS アノテーション .list ファイル形式:
-```
+```text
 vocal_path|speaker_name|language|text
 ```
@ -197,7 +204,7 @@ vocal_path|speaker_name|language|text
 例:
-```
+```text
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
@ -208,7 +215,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 #### 統合パッケージ利用者
 `go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用します.
 V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックするか、`go-webui-v1.ps1`を使用してください.
 #### その他
@ -216,14 +222,6 @@ V1 に切り替えたい場合は、`go-webui-v1.bat`をダブルクリックす
 python webui.py <言語(オプション)>
 ```
 V1 に切り替えたい場合は
 ```bash
 python webui.py v1 <言語(オプション)>
 ```
 または WebUI で手動でバージョンを切り替えてください.
 ### 微調整
 #### パス自動補完のサポート
@ -239,7 +237,7 @@ python webui.py v1 <言語(オプション)>
 #### 統合パッケージ利用者
-`go-webui-v2.bat`をダブルクリックするか、`go-webui-v2.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
+`go-webui.bat`をダブルクリックするか、`go-webui.ps1`を使用して、`1-GPT-SoVITS-TTS/1C-inference`で推論 webui を開きます.
 #### その他
@ -359,11 +357,6 @@ V1/V2/V3/V4 環境から V2Pro への移行方法:
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- ブラウザを開けない場合は、以下の形式に従って UVR 処理を行ってください.これはオーディオ処理に mdxnet を使用しています.
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 コマンド ラインを使用してデータセットのオーディオ セグメンテーションを行う方法は次のとおりです.
 ```bash
@ -439,5 +432,5 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
 ## すべてのコントリビューターに感謝します
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt="Contributors"/>
 </a>
--- a/docs/ko/Changelog_KO.md
+++ b/docs/ko/Changelog_KO.md
@ -242,7 +242,7 @@
  - 유형: 최적화
  - 기여자: GoHomeToMacDonal
 - 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
-  - 내용: 빠른 추론 브랜치 'fast_inference_' 추가
+  - 내용: 빠른 추론 브랜치 `fast_inference_` 추가
  - 유형: 기능
  - 기여자: ChasonJiang
 - 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
@ -409,7 +409,7 @@
 - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
  - 내용: **GPT-SoVITS V3 모델 추가, 파인튜닝 시 14GB VRAM 필요.**
-  - 유형: 신규 기능 ([위키 참조](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)))
+  - 유형: 신규 기능 ([위키 참조](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>))
  - 기여자: RVC-Boss
 - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
  - 내용: 다국어 프로젝트 문서 업데이트.
@ -475,6 +475,7 @@
    - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
    - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
 - 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
  - 내용: **SoVITS v3 병렬 추론 지원 활성화.**
  - 유형: 신규 기능
  - 기여자: ChasonJiang
--- a/docs/ko/README.md
+++ b/docs/ko/README.md
@ -1,3 +1,5 @@
 #
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
@ -5,12 +7,17 @@
 [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange)](https://github.com/RVC-Boss/GPT-SoVITS)
-<img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br>
+[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
 [![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
 [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
-[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
-[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
 [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
 [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | **한국어** | [**Türkçe**](../tr/README.md)
@ -18,7 +25,7 @@
 ---
-## 기능:
+## 기능
 1. **제로샷 텍스트 음성 변환 (TTS):** 5초의 음성 샘플을 입력하면 즉시 텍스트를 음성으로 변환할 수 있습니다.
@ -32,9 +39,9 @@
 보지 못한 발화자의 퓨샷(few-shot) 파인튜닝 데모:
-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
-**사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+<!-- **사용자 설명서: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** -->
 ## 설치
@ -185,7 +192,7 @@ docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GP
 텍스트 음성 합성(TTS) 주석 .list 파일 형식:
-```
+```text
 vocal_path|speaker_name|language|text
 ```
@ -197,7 +204,7 @@ vocal_path|speaker_name|language|text
 예시:
-```
+```text
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
@ -208,7 +215,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 #### 통합 패키지 사용자
 `go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용하십시오.
 V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-v1.ps1`를 사용하십시오.
 #### 기타
@ -216,14 +222,6 @@ V1으로 전환하려면, `go-webui-v1.bat`을 더블 클릭하거나 `go-webui-
 python webui.py <언어(옵션)>
 ```
 V1으로 전환하려면,
 ```bash
 python webui.py v1 <언어(옵션)>
 ```
 또는 WebUI에서 수동으로 버전을 전환하십시오.
 ### 미세 조정
 #### 경로 자동 채우기가 지원됩니다
@ -239,7 +237,7 @@ python webui.py v1 <언어(옵션)>
 #### 통합 패키지 사용자
-`go-webui-v2.bat`을 더블 클릭하거나 `go-webui-v2.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
+`go-webui.bat`을 더블 클릭하거나 `go-webui.ps1`를 사용한 다음 `1-GPT-SoVITS-TTS/1C-inference`에서 추론 webui를 엽니다.
 #### 기타
@ -277,13 +275,13 @@ V1 환경에서 V2를 사용하려면:
 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained)에서 V2 사전 학습 모델을 다운로드하여 `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained`에 넣으십시오.
-   중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다.)
+   중국어 V2 추가: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW 모델을 다운로드하여 압축을 풀고 `G2PWModel`로 이름을 변경한 다음 `GPT_SoVITS/text`에 배치합니다)
 ## V3 릴리스 노트
 새로운 기능:
-1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다.)
+1. 음색 유사성이 더 높아져 목표 음성에 대한 학습 데이터가 적게 필요합니다. (기본 모델을 직접 사용하여 미세 조정 없이 음색 유사성이 크게 향상됩니다)
 2. GPT 모델이 더 안정적이며 반복 및 생략이 적고, 더 풍부한 감정 표현을 가진 음성을 생성하기가 더 쉽습니다.
@ -437,8 +435,8 @@ python ./tools/asr/fasterwhisper_asr.py -i <input> -o <output> -l <language> -p
@Naozumi520 님께 감사드립니다. 광둥어 학습 자료를 제공해 주시고, 광둥어 관련 지식을 지도해 주셔서 감사합니다.
-## 모든 기여자들에게 감사드립니다 ;)
+## 모든 기여자들에게 감사드립니다
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt="Contributors"/>
 </a>
--- a/docs/tr/Changelog_TR.md
+++ b/docs/tr/Changelog_TR.md
@ -244,7 +244,7 @@
  - Tür: Optimizasyon
  - Katkıda Bulunan: GoHomeToMacDonal
 - 2024.03.10 [PR#721](https://github.com/RVC-Boss/GPT-SoVITS/pull/721)
-  - İçerik: Hızlı çıkarım dalı 'fast_inference_' eklendi
+  - İçerik: Hızlı çıkarım dalı `fast_inference_` eklendi
  - Tür: Özellik
  - Katkıda Bulunan: ChasonJiang
 - 2024.03.13 [PR#761](https://github.com/RVC-Boss/GPT-SoVITS/pull/761)
@ -409,7 +409,7 @@
 - 2025.02.11 [Commit#ed207c4b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed207c4b879d5296e9be3ae5f7b876729a2c43b8)~[Commit#6e2b4918](https://github.com/RVC-Boss/GPT-SoVITS/commit/6e2b49186c5b961f0de41ea485d398dffa9787b4)
  - İçerik: **İnce ayar için 14GB VRAM gerektiren GPT-SoVITS V3 modeli eklendi.**
-  - Tür: Yeni Özellik ([Wiki](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)) referans)
+  - Tür: Yeni Özellik ([Wiki](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>) referans)
  - Katkıda Bulunan: RVC-Boss
 - 2025.02.12 [PR#2032](https://github.com/RVC-Boss/GPT-SoVITS/pull/2032)
  - İçerik: Çok dilli proje dokümantasyonu güncellendi.
@ -475,6 +475,7 @@
    - Pydantic: [Issue#2230](https://github.com/RVC-Boss/GPT-SoVITS/issues/2230), [Issue#2239](https://github.com/RVC-Boss/GPT-SoVITS/issues/2239).
    - PyTorch-Lightning: [Issue#2174](https://github.com/RVC-Boss/GPT-SoVITS/issues/2174).
 - 2025.03.31 [PR#2241](https://github.com/RVC-Boss/GPT-SoVITS/pull/2241)
  - İçerik: **SoVITS v3 için paralel çıkarım etkinleştirildi.**
  - Tür: Yeni Özellik
  - Katkıda Bulunan: ChasonJiang
--- a/docs/tr/README.md
+++ b/docs/tr/README.md
@ -1,3 +1,5 @@
 #
 <div align="center">
 <h1>GPT-SoVITS-WebUI</h1>
@ -7,12 +9,17 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
 <a href="https://trendshift.io/repositories/7033" target="_blank"><img src="https://trendshift.io/api/badge/repositories/7033" alt="RVC-Boss%2FGPT-SoVITS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-<!-- img src="https://counter.seku.su/cmoe?name=gptsovits&theme=r34" /><br> -->
+[![Train In Colab](https://img.shields.io/badge/Colab-Training-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-WebUI.ipynb)
 [![Infer In Colab](https://img.shields.io/badge/Colab-Inference-F9AB00?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/Colab-Inference.ipynb)
 [![Huggingface](https://img.shields.io/badge/HuggingFace-online%20demo-blue.svg?style=for-the-badge&logo=huggingface)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
-[![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/RVC-Boss/GPT-SoVITS/blob/main/colab_webui.ipynb)
+[![GitHub release](https://img.shields.io/github/v/release/RVC-Boss/gpt-sovits?style=for-the-badge&logo=github)](https://github.com/RVC-Boss/gpt-sovits/releases)
-[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
+[![Image Size](https://img.shields.io/docker/image-size/xxxxrt666/gpt-sovits/latest?style=for-the-badge&logo=docker)](https://hub.docker.com/r/xxxxrt666/gpt-sovits)
-[![Huggingface](https://img.shields.io/badge/🤗%20-online%20demo-yellow.svg?style=for-the-badge)](https://huggingface.co/spaces/lj1995/GPT-SoVITS-v2)
+[![License](https://img.shields.io/badge/LICENSE-MIT-green.svg?style=for-the-badge&logo=opensourceinitiative)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/LICENSE)
-[![Discord](https://img.shields.io/discord/1198701940511617164?color=%23738ADB&label=Discord&style=for-the-badge)](https://discord.gg/dnrgs5GHfG)
+
 [![简体中文](https://img.shields.io/badge/简体中文-阅读文档-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e)
 [![English](https://img.shields.io/badge/English-Read%20Docs-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://rentry.co/GPT-SoVITS-guide#/)
 [![Change Log](https://img.shields.io/badge/Change%20Log-View%20Updates-blue?style=for-the-badge&logo=googledocs&logoColor=white)](https://github.com/RVC-Boss/GPT-SoVITS/blob/main/docs/en/Changelog_EN.md)
 [**English**](../../README.md) | [**中文简体**](../cn/README.md) | [**日本語**](../ja/README.md) | [**한국어**](../ko/README.md) | **Türkçe**
@ -20,7 +27,7 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
 ---
-## Özellikler:
+## Özellikler
 1. **Sıfır Örnekli Metinden Konuşmaya:** 5 saniyelik bir vokal örneği girin ve anında metinden konuşmaya dönüşümünü deneyimleyin.
@ -34,9 +41,9 @@ Güçlü Birkaç Örnekli Ses Dönüştürme ve Metinden Konuşmaya Web Arayüz
 Görünmeyen konuşmacılar birkaç örnekli ince ayar demosu:
-https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb
+<https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-80c060ab47fb>
-**Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
+<!-- **Kullanıcı Kılavuzu: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)** -->
 ## Kurulum
@ -187,7 +194,7 @@ docker exec -it <GPT-SoVITS-CU126-Lite|GPT-SoVITS-CU128-Lite|GPT-SoVITS-CU126|GP
 TTS açıklama .list dosya formatı:
-```
+```text
 vocal_path|speaker_name|language|text
 ```
@ -201,7 +208,7 @@ Dil sözlüğü:
 Örnek:
-```
+```text
 D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 ```
@ -212,7 +219,6 @@ D:\GPT-SoVITS\xxx/xxx.wav|xxx|en|I like playing Genshin.
 #### Entegre Paket Kullanıcıları
 `go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın.
 V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `go-webui-v1.ps1` kullanın.
 #### Diğerleri
@ -220,14 +226,6 @@ V1'e geçmek istiyorsanız, `go-webui-v1.bat` dosyasına çift tıklayın veya `
 python webui.py <dil(isteğe bağlı)>
 ```
 V1'e geçmek istiyorsanız,
 ```bash
 python webui.py v1 <dil(isteğe bağlı)>
 ```
 veya WebUI'de manuel olarak sürüm değiştirin.
 ### İnce Ayar
 #### Yol Otomatik Doldurma artık destekleniyor
@ -243,7 +241,7 @@ veya WebUI'de manuel olarak sürüm değiştirin.
 #### Entegre Paket Kullanıcıları
-`go-webui-v2.bat` dosyasına çift tıklayın veya `go-webui-v2.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
+`go-webui.bat` dosyasına çift tıklayın veya `go-webui.ps1` kullanın, ardından çıkarım webui'sini `1-GPT-SoVITS-TTS/1C-inference` adresinde açın.
 #### Diğerleri
@ -281,11 +279,11 @@ V1 ortamından V2'yi kullanmak için:
 3. [huggingface](https://huggingface.co/lj1995/GPT-SoVITS/tree/main/gsv-v2final-pretrained) adresinden v2 önceden eğitilmiş modelleri indirin ve bunları `GPT_SoVITS/pretrained_models/gsv-v2final-pretrained` dizinine yerleştirin.
-   Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin.)
+   Ek olarak Çince V2: [G2PWModel.zip(HF)](https://huggingface.co/XXXXRT/GPT-SoVITS-Pretrained/resolve/main/G2PWModel.zip)| [G2PWModel.zip(ModelScope)](https://www.modelscope.cn/models/XXXXRT/GPT-SoVITS-Pretrained/resolve/master/G2PWModel.zip) (G2PW modellerini indirip, zipten çıkarıp, `G2PWModel` olarak yeniden adlandırıp `GPT_SoVITS/text` dizinine yerleştirin)
 ## V3 Sürüm Notları
-Yeni Özellikler:
+### Yeni Özellikler
 1. **Tını benzerliği** daha yüksek olup, hedef konuşmacıyı yakınsamak için daha az eğitim verisi gerekmektedir (tını benzerliği, base model doğrudan kullanılacak şekilde fine-tuning yapılmadan önemli ölçüde iyileştirilmiştir).
@ -293,7 +291,7 @@ Yeni Özellikler:
   [daha fazla detay](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90v3%E2%80%90features-(%E6%96%B0%E7%89%B9%E6%80%A7)>)
-V2 ortamında V3 kullanımı:
+### v2 ortamında v3 kullanımı
 1. `pip install -r requirements.txt` ile bazı paketleri güncelleyin.
@ -323,7 +321,7 @@ V1/V2/V3 ortamından V4'e geçiş:
 Yeni Özellikler:
 1. **V2 ile karşılaştırıldığında biraz daha yüksek VRAM kullanımı sağlar ancak V4'ten daha iyi performans gösterir; aynı donanım maliyeti ve hız avantajını korur**.
-   [Daha fazla bilgi](https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7))
+   [Daha fazla bilgi](<https://github.com/RVC-Boss/GPT-SoVITS/wiki/GPT%E2%80%90SoVITS%E2%80%90features-(%E5%90%84%E7%89%88%E6%9C%AC%E7%89%B9%E6%80%A7)>)
 2. V1/V2 ve V2Pro serisi benzer özelliklere sahipken, V3/V4 de yakın işlevleri paylaşır. Ortalama kalite düşük olan eğitim setleriyle V1/V2/V2Pro iyi sonuçlar verebilir ama V3/V4 veremez. Ayrıca, V3/V4’ün ürettiği ses tonu genel eğitim setine değil, referans ses örneğine daha çok benzemektedir.
@ -363,11 +361,6 @@ UVR5 için Web Arayüzünü açmak için komut satırını kullanın
 python tools/uvr5/webui.py "<infer_device>" <is_half> <webui_port_uvr5>
 ```
 <!-- Bir tarayıcı açamıyorsanız, UVR işleme için aşağıdaki formatı izleyin,Bu ses işleme için mdxnet kullanıyor
 ```
 python mdxnet.py --model --input_root --output_vocal --output_ins --agg_level --format --device --is_half_precision
 ``` -->
 Veri setinin ses segmentasyonu komut satırı kullanılarak bu şekilde yapılır
 ```bash
@ -443,5 +436,5 @@ python ./tools/asr/fasterwhisper_asr.py -i <girdi> -o <çıktı> -l <dil>
 ## Tüm katkıda bulunanlara çabaları için teşekkürler
 <a href="https://github.com/RVC-Boss/GPT-SoVITS/graphs/contributors" target="_blank">
-  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" />
+  <img src="https://contrib.rocks/image?repo=RVC-Boss/GPT-SoVITS" alt="Contributors"/>
 </a>
--- a/install.sh
+++ b/install.sh
@ -34,8 +34,8 @@ print_help() {
    echo "  -h, --help                             Show this help message and exit"
    echo ""
    echo "Examples:"
-    echo "  bash install.sh --source HF --download-uvr5"
+    echo "  bash install.sh --device CU128 --source HF --download-uvr5"
-    echo "  bash install.sh --source ModelScope"
+    echo "  bash install.sh --device MPS --source ModelScope"
 }
 # Show help if no arguments provided
@ -149,7 +149,6 @@ else
            echo "Installing，Please Wait..."
        fi
    done
    conda install -c conda-forge -q -y
 fi
 echo "Installing ffmpeg and cmake..."
--- a/requirements.txt
+++ b/requirements.txt
@ -8,7 +8,7 @@ pytorch-lightning>=2.4
 gradio<5
 ffmpeg-python
 onnxruntime; platform_machine == "aarch64" or platform_machine == "arm64"
-onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64"
+onnxruntime-gpu; platform_machine == "x86_64" or platform_machine == "AMD64" or platform_machine == "amd64"
 tqdm
 funasr==1.0.27
 cn2an
--- a/tools/subfix.py
+++ b/tools/subfix.py
@ -0,0 +1,544 @@
 import datetime
 import os
 import threading
 import traceback
 from dataclasses import dataclass
 from functools import partial
 from typing import List
 import click
 import gradio as gr
 import librosa
 import numpy as np
 import soundfile
 from gradio.components.audio import WaveformOptions
 from tools.i18n.i18n import I18nAuto
 PARTIAL_EXIT = partial(os._exit, 0)
 LANGUAGE_MAP: dict = {
    "ZH": "ZH",
    "zh": "ZH",
    "JP": "JA",
    "jp": "JA",
    "JA": "JA",
    "ja": "JA",
    "EN": "EN",
    "en": "EN",
    "KO": "KO",
    "ko": "KO",
    "yue": "YUE",
    "YUE": "YUE",
 }
 LOCK = threading.Lock()
 IS_CLI = True
@dataclass
 class SubfixErr:
    """Error value returned (not raised) by ``Subfix.save_list`` when writing fails.

    Callers check for it with ``isinstance`` and surface it to the user.
    """

    # The exception that was caught while saving.
    error: Exception
    # Formatted traceback text, as produced by ``traceback.format_exc()``.
    tracebacks: str
 class Subfix:
    batch_size: int = 2
    cur_idx: int = 0
    list_path: str
    textboxes: List[gr.Textbox] = []
    audios: List[gr.Audio] = []
    languages: List[gr.Dropdown] = []
    selections: List[gr.Checkbox] = []
    transcriptions_list: List[List[str]] = []
    merge_audio_button: gr.Button
    delete_audio_button: gr.Button
    previous_index_button1: gr.Button
    next_index_button1: gr.Button
    previous_index_button2: gr.Button
    next_index_button2: gr.Button
    index_slider: gr.Slider
    batch_size_slider: gr.Slider
    close_button: gr.Button
    def __init__(self, i18n: I18nAuto):
        """Create the static controls and register the dynamic row renderer.

        Components are instantiated in place, so this is expected to run
        inside an open Gradio layout context (``main`` calls it inside
        ``with gr.Blocks(...)``). The creation order below defines the layout.

        Args:
            i18n: Translator callable used for every user-visible label.
        """
        self.i18n = i18n
        # Top row: index slider plus the top previous/next buttons. Their
        # elem_ids are clicked from JavaScript by the bottom buttons created
        # in _render_text_area.
        with gr.Row(equal_height=True):
            with gr.Column(scale=2, min_width=160):
                self.index_slider = gr.Slider(minimum=0, maximum=1, step=1, label=i18n("音频索引"))
            with gr.Column(scale=1, min_width=160):
                self.previous_index_button1 = gr.Button(value=i18n("上一页"), elem_id="btn_previous")
            with gr.Column(scale=1, min_width=160):
                self.next_index_button1 = gr.Button(value=i18n("下一页"), elem_id="btn_next")
        # Second row: page-size slider plus the merge/delete actions.
        with gr.Row(equal_height=True):
            with gr.Column(scale=2, min_width=160):
                # NOTE(review): self.batch_size is not assigned in __init__, so
                # this reads the class default (2) unless set beforehand, which
                # is below minimum=4 - confirm this is intended.
                self.batch_size_slider = gr.Slider(
                    minimum=4, maximum=20, step=2, value=self.batch_size, label=i18n("每页音频条数")
                )
            with gr.Column(scale=1, min_width=160):
                self.merge_audio_button = gr.Button(value=i18n("合并选中音频"))
            with gr.Column(scale=1, min_width=160):
                self.delete_audio_button = gr.Button(value=i18n("删除选中音频"))
        # Rebuild the per-entry rows (and rebind their events) when the page
        # size slider changes; both slider values are passed to the renderer.
        gr.render(
            inputs=[self.index_slider, self.batch_size_slider],
            triggers=[self.batch_size_slider.change],
        )(self._render_text_area)
    @property
    def max_index(self):
        """Index of the last loaded transcription entry (``-1`` when none are loaded)."""
        entry_count = len(self.transcriptions_list)
        return entry_count - 1
    def load_list(self, list_path: str):
        """Append the entries of a ``.list`` transcription file to ``transcriptions_list``.

        Each line must hold exactly four ``|``-separated fields (documented in
        the README as ``vocal_path|speaker_name|language|text``). Lines with
        any other field count are reported on stdout and skipped.

        Args:
            list_path: Path of the UTF-8 encoded list file. It is remembered in
                ``self.list_path`` as the target for ``save_list``.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(list_path, mode="r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split("|")
                if len(fields) != 4:
                    print(f"Error Line {line_no}: {line}")
                    continue
                audio_path, speaker_name, text_language, text = fields
                language_key = text_language.upper()
                self.transcriptions_list.append(
                    [
                        audio_path,
                        speaker_name,
                        LANGUAGE_MAP.get(language_key, language_key),
                        # strip() also removes the trailing newline of the line
                        text.strip(),
                    ]
                )
        # Record the path once the file has been read, independent of how many
        # lines parsed, so save_list() always has a target (previously it was
        # only set after at least one valid line).
        self.list_path = list_path
    def save_list(self):
        """Write all entries back to ``self.list_path``, one ``|``-joined line per entry.

        Returns:
            ``None`` on success, or a ``SubfixErr`` wrapping the exception and
            its traceback when opening or writing the file fails.
        """
        # Serialise before entering the try block: only I/O errors are captured.
        lines = ["|".join(entry) for entry in self.transcriptions_list]
        payload = "\n".join(lines)
        try:
            with open(self.list_path, mode="w", encoding="utf-8") as f:
                f.write(payload)
        except Exception as e:
            return SubfixErr(e, traceback.format_exc())
        return None
    def change_index(self, index: int):
        """Build component updates for the page that starts at ``index``.

        Slots past the last entry are returned as empty, non-interactive
        components. ``self.cur_idx`` is set to ``index`` afterwards.

        Returns:
            A flat tuple: ``batch_size`` audio updates, then textbox updates,
            then language dropdown updates, then unchecked checkboxes.
        """
        audio_updates = []
        text_updates = []
        language_updates = []
        with LOCK:
            for i in range(index, index + self.batch_size):
                if i > self.max_index:
                    # Placeholder row: nothing to show or edit.
                    audio_updates.append(gr.Audio(value=None, interactive=False))
                    text_updates.append(
                        gr.Textbox(value=None, label=self.i18n("Text") + f" {i}", interactive=False)
                    )
                    language_updates.append(gr.Dropdown(value=None, interactive=False))
                    continue
                entry = self.transcriptions_list[i]
                audio_updates.append(gr.Audio(value=entry[0]))
                text_updates.append(gr.Textbox(value=entry[3], label=self.i18n("Text") + f" {i}"))
                language_updates.append(gr.Dropdown(value=entry[2]))
            # Selections are always cleared when the page content changes.
            checkbox_updates = [gr.Checkbox(False) for _ in range(self.batch_size)]
        self.cur_idx = index
        return *audio_updates, *text_updates, *language_updates, *checkbox_updates
    def next_page(self, index: int):
        """Advance by one page, clamped so the final page still starts in range.

        Returns:
            The slider update followed by the outputs of ``change_index``.
        """
        page = self.batch_size
        # Largest start index that still shows a full final page (never negative).
        last_start = self.max_index - page + 1
        if last_start < 0:
            last_start = 0
        target = index + page
        if target > last_start:
            target = last_start
        return gr.Slider(value=target), *self.change_index(target)
    def previous_page(self, index: int):
        """Go back by one page, never below index 0.

        Returns:
            The slider update followed by the outputs of ``change_index``.
        """
        target = index - self.batch_size
        if target < 0:
            target = 0
        return gr.Slider(value=target), *self.change_index(target)
    def delete_audio(self, index, *selected):
        """Remove the checked entries of the current page and persist the list.

        Args:
            index: Index of the first entry shown on the page.
            *selected: Checkbox states, one per row of the page, in row order.

        Returns:
            The index-slider update followed by the outputs of ``change_index``.
        """
        last_valid = self.max_index
        # ``<=`` so the final entry can be deleted too (the previous ``<``
        # silently ignored it); placeholder rows past the end stay excluded.
        doomed = [
            index + offset
            for offset, checked in enumerate(selected)
            if checked and index + offset <= last_valid
        ]
        # Pop from the back so earlier positions stay valid while deleting.
        for idx in reversed(doomed):
            self.transcriptions_list.pop(idx)
        result = self.save_list()
        if isinstance(result, SubfixErr):
            # Surface save failures the same way submit() does.
            gr.Warning(str(result.error))
            print(result.tracebacks)
        # Keep the page start and the slider bound inside the shrunken list;
        # the bound is floored at 0 because the list may now be empty.
        index = max(min(index, self.max_index), 0)
        return gr.Slider(value=index, maximum=max(self.max_index, 0)), *self.change_index(index)
    def submit(self, *input):
        """Copy the edited texts and languages of the current page into the list and save it.

        Args:
            *input: The page's textbox values followed by the same number of
                language values (the first half is texts, the second half
                languages).

        A failed save is reported through ``gr.Warning`` and its traceback is
        printed; nothing is returned.
        """
        with LOCK:
            half = len(input) // 2
            texts, languages = input[:half], input[half:]
            # Tuple slices are never None, so this guard is purely defensive.
            if texts is None or languages is None:
                raise ValueError()
            start = self.cur_idx
            page = self.batch_size
            print(start, min(start + page, self.max_index))
            stop = min(start + page, self.max_index + 1)
            for slot, idx in enumerate(range(start, stop)):
                entry = self.transcriptions_list[idx]
                entry[3] = texts[slot].strip().strip("\n")
                entry[2] = languages[slot]
            outcome = self.save_list()
            if isinstance(outcome, SubfixErr):
                gr.Warning(str(outcome.error))
                print(outcome.tracebacks)
    def merge_audio(self, index, *selected):
        """Merge the checked entries of the current page into the first checked one.

        The audios are loaded as 32 kHz mono, concatenated with 0.3 s of
        silence between them and written to a new timestamped ``.wav`` next to
        the first audio. The texts are joined with ``","``. The other merged
        entries are removed from the list.

        Args:
            index: Index of the first entry shown on the page.
            *selected: Checkbox states, one per row of the page, in row order.

        Returns:
            When fewer than two valid entries are checked: ``gr.skip()`` for
            the slider and every audio/text/language output plus cleared
            checkboxes. Otherwise the index-slider update followed by the
            outputs of ``change_index``.
        """
        batch_size = self.batch_size
        sample_rate = 32000
        last_valid = self.max_index
        # ``<=`` so the final entry can take part in a merge (the previous
        # ``<`` silently dropped it); placeholder rows stay excluded.
        merge_index = [
            index + offset
            for offset, checked in enumerate(selected)
            if checked and index + offset <= last_valid
        ]
        if len(merge_index) < 2:
            return *(gr.skip() for _ in range(batch_size * 3 + 1)), *(gr.Checkbox(False) for _ in range(batch_size))

        first_itm_index = merge_index[0]
        timestamp = datetime.datetime.now().strftime(r"%Y%m%d_%H%M%S")
        first_itm_path = f"{os.path.splitext(self.transcriptions_list[first_itm_index][0])[0]}_{timestamp}.wav"
        merge_texts = [self.transcriptions_list[idx][3] for idx in merge_index]
        merge_audios = [self.transcriptions_list[idx][0] for idx in merge_index]
        # Drop every merged entry except the first, back to front so the
        # remaining positions stay valid.
        for idx in merge_index[:0:-1]:
            self.transcriptions_list.pop(idx)
        final_audio_list = []
        for audio_path in merge_audios:
            final_audio_list.append(librosa.load(audio_path, sr=sample_rate, mono=True)[0])
            final_audio_list.append(np.zeros(int(sample_rate * 0.3)))
        final_audio_list.pop()  # no silence after the last clip
        final_audio = np.concatenate(final_audio_list)
        soundfile.write(first_itm_path, final_audio, sample_rate)
        self.transcriptions_list[first_itm_index][0] = first_itm_path
        self.transcriptions_list[first_itm_index][3] = ",".join(merge_texts)
        return gr.Slider(maximum=self.max_index), *self.change_index(index)
    def _render_text_area(self, index, batch_size):
        """Render one page of editable rows and (re)bind every event handler.

        Registered through ``gr.render`` in ``__init__``, which passes the
        current values of the index and page-size sliders. For each position
        in ``[index, index + batch_size)`` a row with a textbox, an audio
        player, a language dropdown and a selection checkbox is created;
        positions past the last entry get empty, non-interactive components.
        The component lists on ``self`` are rebuilt from scratch and
        ``self.batch_size`` is updated to ``batch_size``.

        Args:
            index: Index of the first entry to show.
            batch_size: Number of rows to render.
        """
        i18n = self.i18n
        self.textboxes = []
        self.audios = []
        self.languages = []
        self.selections = []
        self.batch_size = batch_size
        for i in range(index, index + batch_size):
            with gr.Row(equal_height=True):
                if i <= self.max_index:
                    # Row backed by a real entry: everything is editable.
                    with gr.Column(scale=2, min_width=160):
                        textbox_tmp = gr.Textbox(
                            value=self.transcriptions_list[i][3],
                            label=i18n("Text") + f" {i}",
                            lines=2,
                            max_lines=3,
                            interactive=True,
                        )
                    with gr.Column(scale=1, min_width=160):
                        audio_tmp = gr.Audio(
                            value=self.transcriptions_list[i][0],
                            show_label=False,
                            show_download_button=False,
                            editable=False,
                            waveform_options={"show_recording_waveform": False, "show_controls": False},
                        )
                    with gr.Column(scale=1, min_width=160):
                        with gr.Group():
                            with gr.Row():
                                language_tmp = gr.Dropdown(
                                    choices=["ZH", "EN", "JA", "KO", "YUE"],
                                    value=self.transcriptions_list[i][2],
                                    allow_custom_value=True,
                                    label=i18n("文本语言"),
                                    interactive=True,
                                )
                            with gr.Row():
                                selection_tmp = gr.Checkbox(
                                    label=i18n("选择音频"),
                                )
                else:
                    # Placeholder row past the end of the list: same layout,
                    # but empty and non-interactive.
                    with gr.Column(scale=2, min_width=160):
                        textbox_tmp = gr.Textbox(
                            label=i18n("Text") + f" {i}",
                            lines=2,
                            max_lines=3,
                            elem_id="subfix_textbox",
                            interactive=False,
                        )
                    with gr.Column(scale=1, min_width=160):
                        audio_tmp = gr.Audio(
                            streaming=True,
                            show_label=False,
                            show_download_button=False,
                            interactive=False,
                            waveform_options=WaveformOptions(show_recording_waveform=False, show_controls=False),
                        )
                    with gr.Column(scale=1, min_width=160):
                        with gr.Group():
                            with gr.Row():
                                language_tmp = gr.Dropdown(
                                    choices=["ZH", "EN", "JA", "KO", "YUE"],
                                    value=None,
                                    allow_custom_value=True,
                                    label=i18n("文本语言"),
                                    interactive=False,
                                )
                            with gr.Row():
                                selection_tmp = gr.Checkbox(
                                    label=i18n("选择音频"),
                                    interactive=False,
                                )
            # Track the row's components so the event bindings below can use
            # them as inputs/outputs.
            self.textboxes.append(textbox_tmp)
            self.audios.append(audio_tmp)
            self.languages.append(language_tmp)
            self.selections.append(selection_tmp)
        # Bottom row: close button plus a second pair of paging buttons.
        with gr.Row(equal_height=True):
            with gr.Column(scale=2, min_width=160):
                self.close_button = gr.Button(value=i18n("保存并关闭打标WebUI"), variant="stop")
            with gr.Column(scale=1, min_width=160):
                self.previous_index_button2 = gr.Button(value=i18n("上一页"))
            with gr.Column(scale=1, min_width=160):
                self.next_index_button2 = gr.Button(value=i18n("下一页"))
        # Event Trigger Binding
        # Every action first runs submit() to store the page's edits, and only
        # continues (via .success) when that step did not raise.
        self.index_slider.release(  # Index slider released: save edits, then show the chosen index
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
        ).success(
            fn=self.change_index,
            inputs=[
                self.index_slider,
            ],
            outputs=[
                *self.audios,
                *self.textboxes,
                *self.languages,
                *self.selections,
            ],
            max_batch_size=1,
            trigger_mode="once",
        )
        self.next_index_button1.click(  # Next Page Button on the Top
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
        ).success(
            fn=self.next_page,
            inputs=[
                self.index_slider,
            ],
            outputs=[
                self.index_slider,
                *self.audios,
                *self.textboxes,
                *self.languages,
                *self.selections,
            ],
            scroll_to_output=True,
            trigger_mode="once",
        )
        self.next_index_button2.click(  # Next Page Button on the Bottom, Binding to Next Page Button on the Top
            lambda: None,
            [],
            [],
            js="""
            () => {
            document.getElementById("btn_next").click();
            }""",
            trigger_mode="once",
        )
        self.previous_index_button1.click(  # Previous Page Button on the Top
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
        ).success(
            fn=self.previous_page,
            inputs=[
                self.index_slider,
            ],
            outputs=[
                self.index_slider,
                *self.audios,
                *self.textboxes,
                *self.languages,
                *self.selections,
            ],
            scroll_to_output=True,
            trigger_mode="once",
        )
        self.previous_index_button2.click(  # Previous Page Button on the Bottom, Binding to Previous Page Button on the Top
            lambda: None,
            [],
            [],
            js="""
            () => {
            document.getElementById("btn_previous").click();
            }""",
            trigger_mode="once",
        )
        self.delete_audio_button.click(  # Delete the Audio in the Transcription File
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
        ).success(
            fn=self.delete_audio,
            inputs=[
                self.index_slider,
                *self.selections,
            ],
            outputs=[
                self.index_slider,
                *self.audios,
                *self.textboxes,
                *self.languages,
                *self.selections,
            ],
            scroll_to_output=True,
        ).success(
            # Second submit() runs on the refreshed components and saves again.
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
            show_progress="hidden",
        )
        self.merge_audio_button.click(  # Merge the Selected Audios into a Single Entry
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
        ).success(
            fn=self.merge_audio,
            inputs=[
                self.index_slider,
                *self.selections,
            ],
            outputs=[
                self.index_slider,
                *self.audios,
                *self.textboxes,
                *self.languages,
                *self.selections,
            ],
            scroll_to_output=True,
        ).success(
            # merge_audio() itself does not call save_list(); this submit() does.
            fn=self.submit,
            inputs=[
                *self.textboxes,
                *self.languages,
            ],
            outputs=[],
            show_progress="hidden",
        )
        if not IS_CLI:
            self.close_button.click(  # Close the Subfix Tab, Binding to Close Button on Audio Processing Tab
                fn=lambda: None,
                inputs=[],
                outputs=[],
                js="""
                () => {
                document.getElementById("btn_close").click();
                }""",
                trigger_mode="once",
            )
        else:
            self.close_button.click(  # Standalone (CLI) mode: save the edits, then exit the process
                fn=self.submit,
                inputs=[
                    *self.textboxes,
                    *self.languages,
                ],
                outputs=[],
                trigger_mode="once",
            ).then(
                # PARTIAL_EXIT is os._exit(0); .then() runs it even if submit() failed.
                fn=PARTIAL_EXIT,
                inputs=[],
                outputs=[],
            )
    def render(self, list_path: str, batch_size: int = 10):
        """Reset the editor state and load the entries of ``list_path``.

        Args:
            list_path: Transcription list file handed to ``load_list``.
            batch_size: Number of entries per page to start with.
        """
        # Use a fresh per-instance list so earlier content is discarded.
        self.transcriptions_list = []
        self.batch_size = batch_size
        self.load_list(list_path)
@click.command(name="subfix")
@click.argument(
    "list-path",
    metavar="<Path>",
    type=click.Path(exists=True, dir_okay=False, readable=True, writable=True),
    required=True,
)
@click.option(
    "--i18n-lang",
    type=str,
    default="Auto",
    help="Languages for internationalisation",
    show_default=True,
)
@click.option(
    "--port",
    type=int,
    default=9871,
    show_default=True,
)
@click.option(
    "--share",
    type=bool,
    default=False,
    show_default=True,
)
def main(list_path: str = "", i18n_lang="Auto", port=9871, share=False):
    """Web-Based audio subtitle editing and multilingual annotation Tool
    Accept a transcription list path to launch a Gradio WebUI for text editing
    """
    # NOTE: the docstring above is also the text click prints for --help, so
    # implementation notes are kept in comments instead.

    def _report_invalid_list():
        # Takes no parameters because the tick event is bound with inputs=[].
        # The previous one-argument lambda failed with TypeError before it
        # could raise the intended error.
        raise gr.Error("Invalid List")

    with gr.Blocks(analytics_enabled=False) as app:
        subfix = Subfix(I18nAuto(i18n_lang))
        subfix.render(list_path=list_path)
        if subfix.max_index >= 0:
            # One-shot timer: shortly after start-up, size the index slider to
            # the loaded list, set the page-size slider, then switch itself off.
            timer = gr.Timer(0.1)
            timer.tick(
                fn=lambda: (
                    gr.Slider(value=0, maximum=subfix.max_index, step=1),
                    gr.Slider(value=10),
                    gr.Timer(active=False),
                ),
                inputs=[],
                outputs=[
                    subfix.index_slider,
                    subfix.batch_size_slider,
                    timer,
                ],
            )
        else:
            # No valid entry was loaded: report it on every tick (every 2 s).
            timer = gr.Timer(2)
            timer.tick(
                fn=_report_invalid_list,
                inputs=[],
                outputs=[],
            )
    app.queue().launch(
        server_name="0.0.0.0",
        inbrowser=True,
        share=share,
        server_port=port,
        quiet=False,
        show_api=False,
    )
 if __name__ == "__main__":
    main()
--- a/tools/subfix_webui.py
+++ b/tools/subfix_webui.py
@ -1,422 +0,0 @@
 import sys
 from tools.i18n.i18n import I18nAuto, scan_language_list
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
 i18n = I18nAuto(language=language)
 import argparse
 import copy
 import json
 import os
 import uuid
 try:
    # Best effort: replace gradio.analytics.version_check with a no-op.
    import gradio.analytics as analytics
    analytics.version_check = lambda: None
 except Exception:
    # The patch is optional; a missing module or attribute must not stop
    # start-up. KeyboardInterrupt / SystemExit are no longer swallowed.
    pass
 import gradio as gr
 import librosa
 import numpy as np
 import soundfile
 # Key under which each record stores its transcription text (assigned in set_global).
 g_json_key_text = ""
 # Key under which each record stores its audio file path (assigned in set_global).
 g_json_key_path = ""
 # Path of the annotation file that is loaded and overwritten on save.
 g_load_file = ""
 # Format of g_load_file: "json" or "list"; selects the load/save routine.
 g_load_format = ""
 # Index of the last record, i.e. len(g_data_json) - 1.
 g_max_json_index = 0
 # Index of the first record shown on the current page.
 g_index = 0
 # Number of rows (text box, audio player, checkbox) per page.
 g_batch = 10
 # Gradio components of the page, one entry per row; filled when the UI is built.
 g_text_list = []
 g_audio_list = []
 g_checkbox_list = []
 # In-memory list of record dicts loaded from g_load_file.
 g_data_json = []
 def reload_data(index, batch):
    """Store the page position and return the records of that page.

    Each returned dict holds only the text and path entries of a record from
    ``g_data_json[index : index + batch]``.
    """
    global g_index, g_batch
    g_index, g_batch = index, batch
    return [
        {g_json_key_text: record[g_json_key_text], g_json_key_path: record[g_json_key_path]}
        for record in g_data_json[index : index + batch]
    ]
 def b_change_index(index, batch):
    """Build the component values for the page that starts at ``index``.

    The returned list is laid out as: one textbox update per row, one audio
    path per row, one ``False`` per checkbox. Rows beyond the available
    records get an empty textbox update and ``None`` as audio value.
    """
    global g_index, g_batch
    g_index, g_batch = index, batch
    records = reload_data(index, batch)
    missing = g_batch - len(records)

    # Textbox updates: real rows first, then blank placeholders.
    text_updates = [
        {"__type__": "update", "label": f"Text {offset + index}", "value": record[g_json_key_text]}
        for offset, record in enumerate(records)
    ]
    text_updates.extend({"__type__": "update", "label": "Text", "value": ""} for _ in range(missing))

    # Audio values: file paths for real rows, None for the placeholders.
    audio_values = [record[g_json_key_path] for record in records]
    audio_values.extend(None for _ in range(missing))

    # Every checkbox is cleared.
    checkbox_values = [False for _ in range(g_batch)]
    return text_updates + audio_values + checkbox_values
 def b_next_index(index, batch):
    """Save, then move one page forward unless that would pass the last record."""
    b_save_file()
    target = index + batch
    if target > g_max_json_index:
        # Already on the last page: stay where we are.
        target = index
    return target, *b_change_index(target, batch)
 def b_previous_index(index, batch):
    """Save, then move one page back, stopping at index 0."""
    b_save_file()
    target = index - batch
    if not target >= 0:
        # Would go before the first record: clamp to the start.
        target = 0
    return target, *b_change_index(target, batch)
 def b_submit_change(*text_list):
    """Copy the textbox contents of the current page into the records.

    Each text is stripped and given one trailing space before it is compared
    and stored. The file is saved only when at least one record changed.
    Returns the current index followed by the refreshed page values.
    """
    global g_data_json
    dirty = False
    for offset, raw_text in enumerate(text_list):
        position = g_index + offset
        if position > g_max_json_index:
            # Placeholder row past the end of the data.
            continue
        normalized = raw_text.strip() + " "
        record = g_data_json[position]
        if record[g_json_key_text] != normalized:
            record[g_json_key_text] = normalized
            dirty = True
    if dirty:
        b_save_file()
    return g_index, *b_change_index(g_index, g_batch)
 def b_delete_audio(*checkbox_list):
    """Remove the checked rows of the current page from the records.

    The file is saved before the removal and again afterwards if anything was
    removed. Returns a slider update (value and maximum) followed by the
    refreshed page values.
    """
    global g_data_json, g_index, g_max_json_index
    b_save_file()
    removed_any = False
    # Walk from the last row to the first so earlier positions stay valid.
    for offset in range(len(checkbox_list) - 1, -1, -1):
        position = g_index + offset
        if position < len(g_data_json) and checkbox_list[offset] == True:
            g_data_json.pop(position)
            removed_any = True
    g_max_json_index = len(g_data_json) - 1
    if g_index > g_max_json_index:
        # The page start fell off the end: move it to the last record, or 0.
        g_index = max(g_max_json_index, 0)
    if removed_any:
        b_save_file()
    slider_update = {
        "value": g_index,
        "__type__": "update",
        "maximum": max(g_max_json_index, 0),
    }
    return slider_update, *b_change_index(g_index, g_batch)
 def b_invert_selection(*checkbox_list):
    """Return the flipped checkbox states.

    Only the literal ``True`` turns into ``False``; every other value turns
    into ``True``.
    """
    return [state is not True for state in checkbox_list]
 def get_next_path(filename):
    """Return an unused ``.wav`` path next to ``filename``.

    Tries ``<stem>_00.wav`` through ``<stem>_99.wav`` in the directory of
    ``filename`` and returns the first that does not exist. If all are taken,
    a name built from a random UUID is returned instead.
    """
    folder = os.path.dirname(filename)
    stem, _extension = os.path.splitext(os.path.basename(filename))
    for counter in range(100):
        candidate = os.path.join(folder, f"{stem}_{counter:02d}.wav")
        if not os.path.exists(candidate):
            return candidate
    return os.path.join(folder, f"{uuid.uuid4()}.wav")
 def b_audio_split(audio_breakpoint, *checkbox_list):
    """Split the single checked audio file at ``audio_breakpoint``.

    Nothing is split unless exactly one row is checked and the break point
    (multiplied by the file's sample rate) lies strictly inside the audio.
    The first part overwrites the original file, the second part is written
    to a new path and added as a new record right after the original one.
    Returns a slider update followed by the refreshed page values.
    """
    global g_data_json, g_max_json_index
    selected = [
        g_index + offset
        for offset, checked in enumerate(checkbox_list)
        if checked == True and g_index + offset < len(g_data_json)
    ]
    if len(selected) == 1:
        (position,) = selected
        clone = copy.deepcopy(g_data_json[position])
        source_path = clone[g_json_key_path]
        samples, rate = librosa.load(source_path, sr=None, mono=True)
        cut = int(audio_breakpoint * rate)
        if 1 <= cut < len(samples):
            tail_path = get_next_path(source_path)
            # Write the tail first, then overwrite the source with the head.
            soundfile.write(tail_path, samples[cut:], rate)
            soundfile.write(source_path, samples[0:cut], rate)
            clone[g_json_key_path] = tail_path
            g_data_json.insert(position + 1, clone)
            b_save_file()
    g_max_json_index = len(g_data_json) - 1
    slider_update = {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}
    return slider_update, *b_change_index(g_index, g_batch)
 def b_merge_audio(interval_r, *checkbox_list):
    """Merge the checked rows of the current page into the first checked row.

    Nothing is merged unless more than one row is checked. The texts of the
    checked records are concatenated into the first one, the other checked
    records are removed, and the audio of all checked files is concatenated
    (with ``interval_r * sample_rate`` zero samples between clips) and written
    over the first file. Returns a slider update followed by the refreshed
    page values.
    """
    global g_data_json, g_max_json_index
    # Persist the in-memory records before changing anything.
    b_save_file()
    checked_index = []
    audios_path = []
    audios_text = []
    # Collect the absolute record indices of the checked rows that exist.
    for i, checkbox in enumerate(checkbox_list):
        if checkbox == True and g_index + i < len(g_data_json):
            checked_index.append(g_index + i)
    if len(checked_index) > 1:
        # Capture paths and texts before any record is removed.
        for i in checked_index:
            audios_path.append(g_data_json[i][g_json_key_path])
            audios_text.append(g_data_json[i][g_json_key_text])
        # Drop every checked record except the first, back to front so the
        # remaining indices stay valid.
        for i in reversed(checked_index[1:]):
            g_data_json.pop(i)
        base_index = checked_index[0]
        base_path = audios_path[0]
        g_data_json[base_index][g_json_key_text] = "".join(audios_text)
        audio_list = []
        # None for the first load; afterwards the rate returned by the
        # previous load is passed back in as ``sr``.
        l_sample_rate = None
        for i, path in enumerate(audios_path):
            data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True)
            l_sample_rate = sample_rate
            if i > 0:
                # Zero-filled gap placed before every clip but the first.
                silence = np.zeros(int(l_sample_rate * interval_r))
                audio_list.append(silence)
            audio_list.append(data)
        audio_concat = np.concatenate(audio_list)
        # The merged audio replaces the first checked file on disk.
        soundfile.write(base_path, audio_concat, l_sample_rate)
        b_save_file()
    g_max_json_index = len(g_data_json) - 1
    # return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch)
    return {"value": g_index, "maximum": g_max_json_index, "__type__": "update"}, *b_change_index(g_index, g_batch)
 def b_save_json():
    """Overwrite ``g_load_file`` with one JSON document per record, one per line."""
    with open(g_load_file, "w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in g_data_json)
 def b_save_list():
    """Overwrite ``g_load_file`` with ``wav_path|speaker_name|language|text`` lines."""
    column_keys = ("wav_path", "speaker_name", "language", "text")
    with open(g_load_file, "w", encoding="utf-8") as handle:
        for record in g_data_json:
            row = "|".join(f"{record[key]}" for key in column_keys)
            # Surrounding whitespace of the whole row is trimmed before the newline.
            handle.write(row.strip() + "\n")
 def b_load_json():
    """Load ``g_load_file`` as JSON Lines into ``g_data_json``.

    Every non-blank line is parsed with ``json.loads``. Blank lines are
    skipped so a stray empty line no longer aborts the load with a
    ``JSONDecodeError``. ``g_max_json_index`` is set to the last valid index
    (``-1`` when the file has no records).
    """
    global g_data_json, g_max_json_index
    with open(g_load_file, "r", encoding="utf-8") as file:
        g_data_json = [json.loads(line) for line in file if line.strip()]
    g_max_json_index = len(g_data_json) - 1
 def b_load_list():
    """Append the rows of ``g_load_file`` to ``g_data_json``.

    Each row is expected as ``wav_path|speaker_name|language|text``. The row
    is split at the first three ``|`` only, so a text that itself contains
    ``|`` is kept whole. Previously such rows were rejected and then lost when
    the file was saved. Rows with fewer than four fields are reported on
    stdout and skipped. ``g_max_json_index`` is updated afterwards.
    """
    global g_data_json, g_max_json_index
    with open(g_load_file, "r", encoding="utf-8") as source:
        for line in source:
            data = line.split("|", 3)
            if len(data) == 4:
                wav_path, speaker_name, language, text = data
                g_data_json.append(
                    {"wav_path": wav_path, "speaker_name": speaker_name, "language": language, "text": text.strip()}
                )
            else:
                print("error line:", data)
    g_max_json_index = len(g_data_json) - 1
 def b_save_file():
    """Save the records with the writer matching ``g_load_format``; other formats do nothing."""
    writers = {"json": b_save_json, "list": b_save_list}
    writer = writers.get(g_load_format)
    if writer is not None:
        writer()
 def b_load_file():
    """Load the records with the reader matching ``g_load_format``; other formats do nothing."""
    readers = {"json": b_load_json, "list": b_load_list}
    reader = readers.get(g_load_format)
    if reader is not None:
        reader()
 def set_global(load_json, load_list, json_key_text, json_key_path, batch):
    """Initialise the module state from the command-line values and load the file.

    A ``load_json`` other than the string ``"None"`` wins; otherwise
    ``load_list`` is used; if both are ``"None"`` the list file ``demo.list``
    is loaded.
    """
    global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch
    g_batch = int(batch)
    if load_json != "None":
        source = ("json", load_json)
    elif load_list != "None":
        source = ("list", load_list)
    else:
        source = ("list", "demo.list")
    g_load_format, g_load_file = source
    g_json_key_text, g_json_key_path = json_key_text, json_key_path
    b_load_file()
 if __name__ == "__main__":
    # Script entry point: parse the options, load the annotation file, build
    # the Gradio page and start the server.
    parser = argparse.ArgumentParser(description="Process some integers.")
    parser.add_argument("--load_json", default="None", help="source file, like demo.json")
    parser.add_argument("--is_share", default="False", help="whether webui is_share=True")
    parser.add_argument("--load_list", default="None", help="source file, like demo.list")
    parser.add_argument("--webui_port_subfix", default=9871, help="port the webui listens on, Default: 9871")
    parser.add_argument("--json_key_text", default="text", help="the text key name in json, Default: text")
    parser.add_argument("--json_key_path", default="wav_path", help="the path key name in json, Default: wav_path")
    parser.add_argument("--g_batch", default=10, help="max number g_batch wav to display, Default: 10")
    args = parser.parse_args()
    # Interpret the flag textually instead of eval()-ing command-line input,
    # which would execute arbitrary expressions. "True"/"False" behave as before.
    is_share = str(args.is_share).strip().lower() in ("true", "1", "yes")
    set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch)
    with gr.Blocks(analytics_enabled=False) as demo:
        gr.Markdown(
            value=i18n("Submit Text: 将当前页所有文本框内容手工保存到内存和文件(翻页前后或者退出标注页面前如果没点这个按钮，你再翻回来就回滚了，白忙活。)")
        )
        with gr.Row():
            btn_change_index = gr.Button("Change Index")
            btn_submit_change = gr.Button("Submit Text")
            btn_merge_audio = gr.Button("Merge Audio")
            btn_delete_audio = gr.Button("Delete Audio")
            btn_previous_index = gr.Button("Previous Index")
            btn_next_index = gr.Button("Next Index")
        with gr.Row():
            index_slider = gr.Slider(minimum=0, maximum=g_max_json_index, value=g_index, step=1, label="Index", scale=3)
            splitpoint_slider = gr.Slider(
                minimum=0, maximum=120.0, value=0, step=0.1, label="Audio Split Point(s)", scale=3
            )
            btn_audio_split = gr.Button("Split Audio", scale=1)
            btn_save_json = gr.Button("Save File", visible=True, scale=1)
            btn_invert_selection = gr.Button("Invert Selection", scale=1)
        with gr.Row():
            with gr.Column():
                # One row of (text box, audio player, checkbox) per page slot.
                for _ in range(0, g_batch):
                    with gr.Row():
                        text = gr.Textbox(label="Text", visible=True, scale=5)
                        audio_output = gr.Audio(label="Output Audio", visible=True, scale=5)
                        audio_check = gr.Checkbox(label="Yes", show_label=True, info="Choose Audio", scale=1)
                        g_text_list.append(text)
                        g_audio_list.append(audio_output)
                        g_checkbox_list.append(audio_check)
        with gr.Row():
            batchsize_slider = gr.Slider(
                minimum=1, maximum=g_batch, value=g_batch, step=1, label="Batch Size", scale=3, interactive=False
            )
            interval_slider = gr.Slider(minimum=0, maximum=2, value=0, step=0.01, label="Interval", scale=3)
            btn_theme_dark = gr.Button("Light Theme", link="?__theme=light", scale=1)
            btn_theme_light = gr.Button("Dark Theme", link="?__theme=dark", scale=1)
        # Wire the buttons to their handlers. Handlers that may move the page
        # also receive the index slider as their first output.
        btn_change_index.click(
            b_change_index,
            inputs=[
                index_slider,
                batchsize_slider,
            ],
            outputs=[*g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_submit_change.click(
            b_submit_change,
            inputs=[
                *g_text_list,
            ],
            outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_previous_index.click(
            b_previous_index,
            inputs=[
                index_slider,
                batchsize_slider,
            ],
            outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_next_index.click(
            b_next_index,
            inputs=[
                index_slider,
                batchsize_slider,
            ],
            outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_delete_audio.click(
            b_delete_audio,
            inputs=[*g_checkbox_list],
            outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_merge_audio.click(
            b_merge_audio,
            inputs=[interval_slider, *g_checkbox_list],
            outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_audio_split.click(
            b_audio_split,
            inputs=[splitpoint_slider, *g_checkbox_list],
            outputs=[index_slider, *g_text_list, *g_audio_list, *g_checkbox_list],
        )
        btn_invert_selection.click(b_invert_selection, inputs=[*g_checkbox_list], outputs=[*g_checkbox_list])
        btn_save_json.click(b_save_file)
        # Fill the first page when the UI is opened.
        demo.load(
            b_change_index,
            inputs=[
                index_slider,
                batchsize_slider,
            ],
            outputs=[*g_text_list, *g_audio_list, *g_checkbox_list],
        )
    demo.launch(
        server_name="0.0.0.0",
        inbrowser=True,
        # quiet=True,
        share=is_share,
        server_port=int(args.webui_port_subfix),
    )
--- a/tools/uvr5/mdxnet.py
+++ b/tools/uvr5/mdxnet.py
@ -190,14 +190,14 @@ class Predictor:
            opt_path_vocal = path_vocal[:-4] + ".%s" % format
            opt_path_other = path_other[:-4] + ".%s" % format
            if os.path.exists(path_vocal):
-                os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_vocal, opt_path_vocal))
+                os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_vocal, opt_path_vocal))
                if os.path.exists(opt_path_vocal):
                    try:
                        os.remove(path_vocal)
                    except:
                        pass
            if os.path.exists(path_other):
-                os.system("ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path_other, opt_path_other))
+                os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_other, opt_path_other))
                if os.path.exists(opt_path_other):
                    try:
                        os.remove(path_other)
--- a/tools/uvr5/vr.py
+++ b/tools/uvr5/vr.py
@ -140,7 +140,7 @@ class AudioPre:
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
-                    cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+                    cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
                    print(cmd)
                    os.system(cmd)
                    if os.path.exists(opt_format_path):
@ -177,7 +177,7 @@ class AudioPre:
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
-                    cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+                    cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
                    print(cmd)
                    os.system(cmd)
                    if os.path.exists(opt_format_path):
@ -307,7 +307,7 @@ class AudioPreDeEcho:
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
-                    cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+                    cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
                    print(cmd)
                    os.system(cmd)
                    if os.path.exists(opt_format_path):
@ -340,7 +340,7 @@ class AudioPreDeEcho:
                )
                if os.path.exists(path):
                    opt_format_path = path[:-4] + ".%s" % format
-                    cmd="ffmpeg -i \"%s\" -vn \"%s\" -q:a 2 -y" % (path, opt_format_path)
+                    cmd = 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)
                    print(cmd)
                    os.system(cmd)
                    if os.path.exists(opt_format_path):
--- a/tools/uvr5/webui.py
+++ b/tools/uvr5/webui.py
@ -1,23 +1,22 @@
 import logging
 import os
 import sys
 import traceback
 import gradio as gr
 from tools.i18n.i18n import I18nAuto
 from tools.my_utils import clean_path
 i18n = I18nAuto()
 logger = logging.getLogger(__name__)
 import sys
 import ffmpeg
 import gradio as gr
 import torch
 from bsroformer import Roformer_Loader
 from mdxnet import MDXNetDereverb
 from vr import AudioPre, AudioPreDeEcho
 from tools.i18n.i18n import I18nAuto
 from tools.my_utils import clean_path, load_cudnn
 i18n = I18nAuto()
 logger = logging.getLogger(__name__)
 weight_uvr5_root = "tools/uvr5/uvr5_weights"
 uvr5_names = []
 for name in os.listdir(weight_uvr5_root):
@ -44,6 +43,7 @@ def html_center(text, label="p"):
 def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
    infos = []
    load_cudnn()
    try:
        inp_root = clean_path(inp_root)
        save_root_vocal = clean_path(save_root_vocal)
@ -220,5 +220,6 @@ app.queue().launch(  # concurrency_count=511, max_size=1022
    inbrowser=True,
    share=is_share,
    server_port=webui_port_uvr5,
    show_api=False,
    # quiet=True,
 )
--- a/webui.py
+++ b/webui.py
@ -12,6 +12,7 @@ import platform
 import shutil
 import signal
 import gradio as gr
 import psutil
 import torch
 import yaml
@ -58,6 +59,7 @@ for site_packages_root in site_packages_roots:
            traceback.print_exc()
 import shutil
 import subprocess
 from multiprocessing import cpu_count
 from subprocess import Popen
 from tools.assets import css, js, top_html
@ -66,7 +68,6 @@ from tools.i18n.i18n import I18nAuto, scan_language_list
 language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
 os.environ["language"] = language
 i18n = I18nAuto(language=language)
 from multiprocessing import cpu_count
 from config import (
    GPU_INDEX,
@ -86,14 +87,9 @@ from config import (
 from tools import my_utils
 from tools.my_utils import check_details, check_for_existance
-# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 当遇到mps不支持的步骤时使用cpu
+language = sys.argv[-1] if sys.argv[-1] in scan_language_list() else "Auto"
-try:
+os.environ["language"] = language
-    import gradio.analytics as analytics
+i18n = I18nAuto(language=language)
    analytics.version_check = lambda: None
 except:
    ...
 import gradio as gr
 n_cpu = cpu_count()
@ -276,12 +272,7 @@ def change_label(path_list):
    if p_label is None:
        check_for_existance([path_list])
        path_list = my_utils.clean_path(path_list)
-        cmd = '"%s" -s tools/subfix_webui.py --load_list "%s" --webui_port %s --is_share %s' % (
+        cmd = f'"{python_exec}" -s tools/subfix.py --i18n-lang {language} --port {webui_port_subfix} --share {is_share} "{path_list}"'
            python_exec,
            path_list,
            webui_port_subfix,
            is_share,
        )
        yield (
            process_info(process_name_subfix, "opened"),
            {"__type__": "update", "visible": False},
@ -1981,5 +1972,6 @@ with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css
        inbrowser=True,
        share=is_share,
        server_port=webui_port_main,
        show_api=False,
        # quiet=True,
    )