Merge 50a88a596dea718c83e535136e9cb46b513cef6f into 08d627c3338173c3229286d8787060d6559fe0f8

增加cuda graph支持，普通推理模式推理速度原地翻倍，效果不变。2
2026-06-06 14:38:14 +08:00 · 2026-05-14 01:31:35 +08:00 · 2026-04-30 15:01:45 +08:00 · 2026-04-30 15:01:11 +08:00 · 2026-04-19 21:02:57 +08:00 · 2026-04-18 22:33:55 +08:00
56 changed files with 6326 additions and 1495 deletions
--- a/GPT_SoVITS/AR/data/dataset.py
+++ b/GPT_SoVITS/AR/data/dataset.py
@ -67,8 +67,10 @@ class Text2SemanticDataset(Dataset):
            )
        )  # "%s/3-bert"%exp_dir#bert_dir
        self.path6 = semantic_path  # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
-        assert os.path.exists(self.path2)
+        if not os.path.exists(self.path2):
-        assert os.path.exists(self.path6)
+            raise FileNotFoundError(f"Phoneme data file not found: {self.path2}")
        if not os.path.exists(self.path6):
            raise FileNotFoundError(f"Semantic data file not found: {self.path6}")
        self.phoneme_data = {}
        with open(self.path2, "r", encoding="utf8") as f:
            lines = f.read().strip("\n").split("\n")
@ -131,7 +133,7 @@ class Text2SemanticDataset(Dataset):
                phoneme, word2ph, text = self.phoneme_data[item_name]
            except Exception:
                traceback.print_exc()
-                # print(f"{item_name} not in self.phoneme_data !")
+                print(f"Warning: File \"{item_name}\" not in self.phoneme_data! Skipped. ")
                num_not_in += 1
                continue
@ -152,7 +154,7 @@ class Text2SemanticDataset(Dataset):
                phoneme_ids = cleaned_text_to_sequence(phoneme, version)
            except:
                traceback.print_exc()
-                # print(f"{item_name} not in self.phoneme_data !")
+                print(f"Warning: Failed to convert phonemes to sequence for file \"{item_name}\"! Skipped. ")
                num_not_in += 1
                continue
            # if len(phoneme_ids) >400:###########2：改为恒定限制为semantic/2.5就行
@ -228,7 +230,11 @@ class Text2SemanticDataset(Dataset):
            # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
            bert_feature = None
        else:
-            assert bert_feature.shape[-1] == len(phoneme_ids)
+            try:
                assert bert_feature.shape[-1] == len(phoneme_ids)
            except AssertionError:
                print(f"AssertionError: The BERT feature dimension ({bert_feature.shape[-1]}) of the file '{item_name}' does not match the length of the phoneme sequence ({len(phoneme_ids)}).")
                raise
        return {
            "idx": idx,
            "phoneme_ids": phoneme_ids,
--- a/GPT_SoVITS/AR/models/embedding_cudagraph.py
+++ b/GPT_SoVITS/AR/models/embedding_cudagraph.py
@ -0,0 +1,76 @@
 import math
 import torch
 from torch import nn
 class TokenEmbedding(nn.Module):
    """Lookup-table token embedding followed by dropout."""

    def __init__(self, embedding_dim: int, vocab_size: int, dropout: float = 0.0):
        super().__init__()
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.dropout = nn.Dropout(p=dropout)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

    @property
    def weight(self) -> torch.Tensor:
        """The full embedding matrix, one row per token id."""
        table = self.word_embeddings
        return table.weight

    def embedding(self, index: int) -> torch.Tensor:
        """Return row ``index`` of the embedding matrix as a one-row slice."""
        stop = index + 1
        return self.word_embeddings.weight[index:stop]

    def forward(self, x: torch.Tensor):
        """Embed the token ids in ``x`` and apply dropout to the result."""
        return self.dropout(self.word_embeddings(x))
 class SinePositionalEmbeddingNested(nn.Module):
    """Sinusoidal positional embedding backed by a precomputed table.

    The table ``pe`` is a non-persistent buffer of shape
    ``(max_batch_size, max_seq_len, embedding_dim)`` that is filled once
    when the module is constructed.
    """

    def __init__(
        self,
        embedding_dim: int,
        dropout: float = 0.0,
        scale: bool = False,
        alpha: bool = False,
        max_batch_size: int = 20,
        max_seq_len: int = 2500,
    ):
        super().__init__()
        self.embedding_dim = embedding_dim
        # Inputs are multiplied by sqrt(dim) only when ``scale`` is requested.
        self.x_scale = math.sqrt(embedding_dim) if scale else 1.0
        # ``alpha`` weights the positional term; the flag decides whether it is trainable.
        self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
        self.dropout = nn.Dropout(p=dropout)
        self.max_batch_size = max_batch_size
        self.max_seq_len = max_seq_len
        self.reverse = False
        table = torch.zeros(max_batch_size, max_seq_len, embedding_dim)
        self.register_buffer("pe", table, persistent=False)
        self.pe: torch.Tensor
        self.compute_pe()

    def compute_pe(self):
        """Fill ``self.pe`` in place: sine on even channels, cosine on odd channels."""
        steps = torch.arange(self.max_seq_len, dtype=torch.float32)
        if self.reverse:
            # Count positions downwards from max_seq_len - 1 to 0.
            steps = steps.flip(0)
        position = steps.unsqueeze(1)
        rate = -(math.log(10000.0) / self.embedding_dim)
        div_term = torch.exp(
            torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * rate
        )
        angles = position * div_term
        # The (seq, dim / 2) values broadcast over the batch dimension of the table.
        self.pe[:, :, 0::2] = torch.sin(angles)
        self.pe[:, :, 1::2] = torch.cos(angles)

    def forward(self, input_pos: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        """Add the table entry at ``input_pos - 1`` (per batch row) to ``x``."""
        rows = torch.arange(x.shape[0])
        pe_values = self.pe[rows, input_pos - 1]
        scaled = x * self.x_scale
        return scaled + self.alpha * pe_values.unsqueeze(1)

    def prefill(self, x: torch.Tensor) -> torch.Tensor:
        """Add positional values to every sequence of the nested tensor ``x``."""
        lengths = torch.tensor([seq.shape[0] for seq in x.unbind()])
        pe_values = torch.nested.nested_tensor(
            [self.pe[row, :length, :] for row, length in enumerate(lengths)]
        )
        weight = self.alpha.item()
        return x * self.x_scale + weight * pe_values
--- a/GPT_SoVITS/AR/models/structs_cudagraph.py
+++ b/GPT_SoVITS/AR/models/structs_cudagraph.py
@ -0,0 +1,78 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import List, Literal, Optional
 import torch
 Tensor = torch.Tensor
@dataclass
 class T2SResult:
    """Outcome of one text-to-semantic generation request."""
    # Generated token tensors; stays None when the request failed.
    result: List[Tensor] | None = None
    # Decoding throughput reported by the runner (printed there as token/s).
    infer_speed: float = 0.0
    # "Success" unless the runner caught an exception.
    status: Literal["Success", "Error"] = "Success"
    # The caught exception, when status is "Error".
    exception: Optional[Exception] = None
    # Formatted traceback text for that exception.
    traceback: Optional[str] = None
@dataclass
 class T2SRequest:
    """Inputs and sampling options for one text-to-semantic generation request."""
    # One id tensor per batch item; the decoder embeds these with its text embedding.
    x: List[torch.Tensor]
    # Length of each entry of ``x``.
    x_lens: Tensor
    # Prompt tokens; the decoder embeds these with its audio embedding.
    prompts: torch.Tensor
    # One feature tensor per batch item; transposed and projected by the decoder's ``bert_proj``.
    bert_feature: List[Tensor]
    # Number of leading results the runner returns.
    valid_length: int
    top_k: int = 5
    top_p: float = 1
    # Maximum number of generated tokens before stopping; -1 disables the limit.
    early_stop_num: int = -1
    temperature: float = 1.0
    repetition_penalty: float = 1.35
    # Ask the runner to capture and replay a CUDA graph for the decode steps.
    use_cuda_graph: bool = False
    # NOTE(review): not read anywhere in the visible code — confirm whether it is still needed.
    debug: bool = False
 class T2SSession:
    """Per-request decoding state: inputs, prompt tokens, masks and completion flags."""

    def __init__(self, decoder, request: T2SRequest, device: torch.device, dtype: torch.dtype):
        with device:
            self.decoder, self.request = decoder, request
            self.device, self.dtype = device, dtype

            self.bsz = bsz = len(request.x)
            self.y_len = y_len = request.prompts.size(-1)

            # Imported here rather than at module level: the sampler's module
            # imports this one, so a top-level import would be circular.
            from AR.models.t2s_model_cudagraph import Sampler

            self.sampler = Sampler(bsz, decoder.vocab_size)

            self.x = request.x
            self.x_lens = request.x_lens.to(torch.int32)
            self.y = request.prompts
            self.bert_feature = request.bert_feature

            # Tokens already in the cache after prefill: text length plus prompt length.
            self.prefill_len = self.x_lens + self.y.size(1)
            self.input_pos = self.prefill_len.clone()

            self.completed = torch.zeros(len(self.x), dtype=torch.bool).to(device)
            self.y_results: List[Tensor] = [None] * len(self.x)  # type: ignore

            self.xy_pos = decoder.embed(self.x, self.y, self.bert_feature)

            # One square mask per item: every position sees the whole text part,
            # and the prompt part is causal with respect to itself.
            text_lens = self.x_lens.tolist()
            masks = []
            for bs in range(bsz):
                pos = int(text_lens[bs])
                total = pos + y_len
                mask = torch.zeros(total, total, dtype=torch.bool)
                mask[:, :pos] = True
                if y_len > 0:
                    mask[-y_len:, -y_len:] = torch.tril(
                        torch.ones(y_len, y_len, dtype=torch.bool)
                    )
                masks.append(mask)
            self.attn_mask_nested = torch.nested.nested_tensor(masks)
--- a/GPT_SoVITS/AR/models/t2s_model_cudagraph.py
+++ b/GPT_SoVITS/AR/models/t2s_model_cudagraph.py
@ -0,0 +1,602 @@
 """
 CUDA Graph accelerated T2S decoder.
 Uses PyTorch native scaled_dot_product_attention (no flash_attn dependency).
 Adapted from gsvpp/AR/models/t2s_model_abc.py and t2s_model_flash_attn.py.
 """
 from __future__ import annotations
 import os
 import time
 import traceback
 from typing import Dict, List, MutableSequence, Optional, Tuple
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.cuda.graphs import CUDAGraph
 from tqdm import tqdm
 from AR.models.embedding_cudagraph import (
    SinePositionalEmbeddingNested as SinePositionalEmbedding,
 )
 from AR.models.embedding_cudagraph import TokenEmbedding
 from AR.models.structs_cudagraph import T2SRequest, T2SResult, T2SSession
 Tensor = torch.Tensor
 class Sampler(nn.Module):
    """Token sampler applying repetition penalty, top-p, temperature and top-k.

    ``vocab_size`` is accepted for interface compatibility and is not used.
    """

    def __init__(self, batch_size: int, vocab_size: int) -> None:
        super().__init__()
        self.batch_size = batch_size

    def sample(
        self,
        logits: torch.Tensor,
        previous_tokens: torch.Tensor,
        temperature: float,
        top_k: int,
        top_p: float,
        repetition_penalty: float,
    ) -> torch.Tensor:
        """Draw one token id per batch row.

        Args:
            logits: ``(batch, vocab)`` scores. NOTE: the repetition penalty is
                written back into this tensor in place, so the caller sees the
                penalised values afterwards.
            previous_tokens: ``(batch, n)`` ids of tokens generated so far.
            temperature: Softmax temperature; clamped below at ``1e-5``.
            top_k: Keep only the ``top_k`` highest logits. Values above the
                vocabulary size are clamped to it; values ``<= 0`` disable
                the top-k filter.
            top_p: Nucleus threshold; the highest-scoring token is always kept.
            repetition_penalty: Divides positive and multiplies negative
                scores of previously generated tokens.

        Returns:
            ``(batch, 1)`` int32 tensor of sampled token ids.
        """
        previous_tokens = previous_tokens.long()

        # Repetition penalty, applied in place on the caller's tensor.
        score = torch.gather(logits, dim=1, index=previous_tokens)
        score = torch.where(
            score < 0, score * repetition_penalty, score / repetition_penalty
        )
        logits.scatter_(dim=1, index=previous_tokens, src=score)

        # Top-p: drop tokens once the cumulative probability exceeds top_p,
        # but never drop the best token.
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(
            torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
        )
        sorted_indices_to_remove = cum_probs > top_p
        sorted_indices_to_remove[:, 0] = False
        indices_to_remove = sorted_indices_to_remove.scatter(
            dim=1, index=sorted_indices, src=sorted_indices_to_remove
        )
        logits = logits.masked_fill(indices_to_remove, -float("Inf"))

        logits = logits / max(temperature, 1e-5)

        # Top-k: torch.topk raises when k exceeds the last dimension, and
        # k == 0 leaves no pivot to index, so clamp and skip as needed.
        k = min(int(top_k), logits.size(-1))
        if k > 0:
            v, _ = torch.topk(logits, k)
            pivot = v[:, -1].unsqueeze(-1)
            logits = torch.where(logits < pivot, -float("Inf"), logits)

        probs = torch.nn.functional.softmax(logits, dim=-1)
        # Dividing by exponential noise and taking the argmax draws from
        # ``probs`` without calling torch.multinomial.
        q = torch.empty_like(probs).exponential_(1.0)
        idx_next = torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int32)
        return idx_next
 # ─── KV Cache ────────────────────────────────────────────────────────────────
 class KVCacheNHD(nn.Module):
    """Preallocated key/value cache laid out as (batch, seq, heads, head_dim)."""

    def __init__(self, batch_size, max_seq_length, n_heads, head_dim):
        super().__init__()
        assert batch_size > 0
        self.n_head = n_heads
        self.head_dim = head_dim
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        shape = (batch_size, max_seq_length, n_heads, head_dim)
        # Non-persistent buffers: they follow .to() but stay out of state_dict.
        for name in ("k_cache", "v_cache"):
            self.register_buffer(name, torch.zeros(shape), persistent=False)

    def update(self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor):
        """Write one new key/value per batch row at slot ``input_pos - 1``.

        Returns the full key and value buffers after the write.
        """
        slot = (input_pos - 1)[..., None, None, None]
        index = slot.expand(-1, -1, self.n_head, self.head_dim).long()
        self.k_cache.scatter_(1, index, k_val)
        self.v_cache.scatter_(1, index, v_val)
        return self.k_cache, self.v_cache

    def empty(self):
        """Zero both buffers in place."""
        for buffer in (self.k_cache, self.v_cache):
            buffer.zero_()

    def prefill_kv(self, k_val: torch.Tensor, v_val: torch.Tensor, bs: int):
        """Copy a whole prefix of keys/values into batch row ``bs``."""
        k_len, v_len = k_val.shape[1], v_val.shape[1]
        self.k_cache[[bs], :k_len] = k_val
        self.v_cache[[bs], :v_len] = v_val
 # ─── Attention (PyTorch native SDPA, no flash_attn) ─────────────────────────
 class Attention(nn.Module):
    """Multi-head self-attention over a preallocated KV cache, using PyTorch SDPA.

    ``forward`` handles one decode step against the cache; ``prefill``
    processes whole (nested) sequences and fills the cache.
    """

    def __init__(self, n_head: int, hidden_dim: int):
        super().__init__()
        self.n_head = n_head
        self.hidden_dim = hidden_dim
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.in_proj = nn.Linear(hidden_dim, hidden_dim * 3, bias=True)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.dropout = nn.Dropout(0.1)
        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict, prefix, *args):
        """Rename ``in_proj_weight``/``in_proj_bias`` style keys to ``in_proj.*``."""
        keys_to_modify = [key for key in state_dict if "in_proj_" in key]
        for key in keys_to_modify:
            new_key = key.replace("in_proj_", "in_proj.")
            state_dict[new_key] = state_dict.pop(key)

    def forward(
        self, x: torch.Tensor, input_pos: torch.Tensor, kv_cache: "KVCacheNHD"
    ) -> torch.Tensor:
        """Attend from the new token(s) in ``x`` to the valid part of the cache.

        Args:
            x: ``(batch, seqlen, hidden_dim)`` input for the current step.
            input_pos: Per-row count of valid cache entries including the
                current token (the cache writes at ``input_pos - 1``).
            kv_cache: Cache whose ``update`` returns the full key/value buffers.

        Returns:
            ``(batch, seqlen, hidden_dim)`` attention output.
        """
        bsz, seqlen, _ = x.shape
        q, k, v = self.in_proj.forward(x).chunk(3, dim=-1)
        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
        k = k.view(bsz, seqlen, self.n_head, self.head_dim)
        v = v.view(bsz, seqlen, self.n_head, self.head_dim)
        k_cache, v_cache = kv_cache.update(input_pos, k, v)
        q = q.transpose(1, 2)  # [B, H, 1, D]
        k_out = k_cache.transpose(1, 2)  # [B, H, max_seq, D]
        v_out = v_cache.transpose(1, 2)  # [B, H, max_seq, D]
        # The cache is preallocated and zero-filled, so slots at or beyond
        # ``input_pos`` hold no real tokens. Without a mask each of them
        # still takes softmax weight (score 0) and dilutes the output.
        # The mask has a fixed shape and is derived from ``input_pos`` on
        # device, so it stays valid under CUDA graph capture/replay.
        max_len = k_out.size(2)
        positions = torch.arange(max_len, device=input_pos.device)
        valid = positions.unsqueeze(0) < input_pos.reshape(-1, 1)  # [B, max_seq]
        attn_mask = valid[:, None, None, :]  # True = may be attended to
        attn = F.scaled_dot_product_attention(q, k_out, v_out, attn_mask=attn_mask)
        attn = self.dropout.forward(attn)
        attn = attn.transpose(1, 2).reshape(bsz, seqlen, self.hidden_dim)
        attn = self.out_proj.forward(attn)
        return attn

    def prefill(self, x: torch.Tensor, mask: torch.Tensor, kv_cache: "KVCacheNHD") -> torch.Tensor:
        """Run masked attention over each sequence of ``x`` and fill the cache.

        ``x`` and ``mask`` are indexed per batch item; the result is a nested
        tensor with one ``(seq, hidden_dim)`` entry per item.
        """
        bsz = x.size(0)
        outputs = []
        for bs in range(bsz):
            x_b = x[bs].unsqueeze(0)  # [1, seq, hidden]
            q, k, v = self.in_proj.forward(x_b).chunk(3, dim=-1)
            q = q.contiguous().view(1, -1, self.n_head, self.head_dim)
            k = k.contiguous().view(1, -1, self.n_head, self.head_dim)
            v = v.contiguous().view(1, -1, self.n_head, self.head_dim)
            kv_cache.prefill_kv(k, v, bs)
            q, k, v = map(lambda t: t.transpose(1, 2), (q, k, v))
            attn_mask = (
                mask[bs].unsqueeze(0).unsqueeze(0).expand(1, self.n_head, -1, -1)
            )
            attn = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
            attn = self.dropout.forward(attn)
            attn = attn.transpose(1, 2).contiguous().view(1, -1, self.hidden_dim)
            output = self.out_proj.forward(attn)
            outputs.append(output.squeeze(0))
        return torch.nested.nested_tensor(outputs)
 # ─── Feed Forward ────────────────────────────────────────────────────────────
 class FeedForward(nn.Module):
    """Two-layer MLP with ReLU; dropout is applied after each linear layer."""

    def __init__(self, dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.linear1 = nn.Linear(dim, hidden_dim, bias=True)
        self.linear2 = nn.Linear(hidden_dim, dim, bias=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Expand to ``hidden_dim``, activate, and project back to ``dim``."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout.forward(hidden)
        projected = self.linear2(hidden)
        return self.dropout.forward(projected)
 # ─── Transformer Block ──────────────────────────────────────────────────────
 class TransformerBlock(nn.Module):
    """Transformer layer: attention then feed-forward, each followed by residual add and LayerNorm."""

    def __init__(self, n_head, ffn_dim, hidden_dim) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attention = Attention(n_head, hidden_dim)
        self.feed_forward = FeedForward(hidden_dim, ffn_dim)
        self.attention_norm = nn.LayerNorm([hidden_dim])
        self.ffn_norm = nn.LayerNorm([hidden_dim])
        self.dropout = nn.Dropout(0.1)
        self._register_load_state_dict_pre_hook(self.load_hook)

    def load_hook(self, state_dict: dict[str, Tensor], prefix, *args):
        """Rewrite checkpoint keys to this module's attribute names, in place."""
        # Applied in this order to every key, exactly like a chained str.replace.
        renames = (
            ("self_attn", "attention"),
            ("linear", "feed_forward.linear"),
            ("norm1", "attention_norm"),
            ("norm2", "ffn_norm"),
        )
        for old_key in list(state_dict):
            new_key = old_key
            for source, target in renames:
                new_key = new_key.replace(source, target)
            state_dict[new_key] = state_dict.pop(old_key)

    def _finish(self, x: Tensor, attn_out: Tensor) -> Tensor:
        """Apply the residual/norm around attention, then the feed-forward sub-layer."""
        h = self.attention_norm.forward(x + self.dropout.forward(attn_out))
        ffn_out = self.feed_forward.forward(h)
        return self.ffn_norm.forward(h + ffn_out)

    def forward(
        self, x: Tensor, input_pos: Tensor, kv_cache: KVCacheNHD
    ) -> Tensor:
        """Single decode step using the cached keys and values."""
        attn_out = self.attention.forward(x, input_pos, kv_cache)
        return self._finish(x, attn_out)

    def prefill(self, x: Tensor, mask: Tensor, kv_cache: KVCacheNHD) -> Tensor:
        """Process whole sequences with ``mask`` and populate the cache."""
        attn_out = self.attention.prefill(x, mask, kv_cache)
        return self._finish(x, attn_out)
 # ─── Transformer Decoder ────────────────────────────────────────────────────
 class TransformerDecoder(nn.Module):
    """Stack of ``n_layer`` transformer blocks, each paired with its own KV cache."""

    def __init__(
        self,
        hidden_dim,
        n_layer,
        n_head,
        ffn_dim,
        vocab_size,
        max_seq_length,
        max_batch_size,
    ) -> None:
        super().__init__()
        assert hidden_dim % n_head == 0
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        self.head_dim = hidden_dim // n_head
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        blocks = [TransformerBlock(n_head, ffn_dim, hidden_dim) for _ in range(n_layer)]
        self.layers = nn.ModuleList(blocks)
        self.max_seq_length: int = max_seq_length
        self.max_batch_size: int = max_batch_size

    def forward(
        self,
        input_pos: Tensor,
        x: Tensor,
        kv_caches: MutableSequence[KVCacheNHD],
    ):
        """Run one decode step through every layer, threading the hidden state."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.forward(hidden, input_pos, cache)
        return hidden

    def prefill(
        self,
        x: Tensor,
        mask: Tensor,
        kv_caches: MutableSequence[KVCacheNHD],
    ):
        """Run the masked prefill pass through every layer, filling each cache."""
        hidden = x
        for block, cache in zip(self.layers, kv_caches):
            hidden = block.prefill(hidden, mask, cache)
        return hidden
 # ─── T2S Decoder ─────────────────────────────────────────────────────────────
 class T2SDecoder(nn.Module):
    """Text-to-semantic decoder: input embeddings, transformer stack and output projection.

    Also provides KV-cache allocation, prompt embedding for prefill, and
    CUDA graph capture of a single decode step.
    """
    def __init__(
        self,
        config,
        *args,
        norm_first=False,
        max_seq_length=2500,
        max_batch_size=10,
        **kwds,
    ) -> None:
        """Build the decoder from ``config["model"]``.

        ``*args``, ``norm_first`` and ``**kwds`` are accepted but not used here.
        """
        super().__init__()
        hidden_dim = config["model"]["hidden_dim"]
        embedding_dim = config["model"]["embedding_dim"]
        n_head = config["model"]["head"]
        n_layer = config["model"]["n_layer"]
        vocab_size = config["model"]["vocab_size"]
        phoneme_vocab_size = config["model"]["phoneme_vocab_size"]
        p_dropout = config["model"]["dropout"]
        EOS = config["model"]["EOS"]
        ffn_dim = hidden_dim * 4
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        assert hidden_dim % n_head == 0
        self.head_dim = hidden_dim // n_head
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.phoneme_vocab_size = phoneme_vocab_size
        self.p_dropout = p_dropout
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        self.EOS = EOS
        # The EOS id must be the last entry of the audio vocabulary.
        assert self.EOS == self.vocab_size - 1
        # Projects 1024-dimensional BERT features to the embedding size.
        self.bert_proj = nn.Linear(1024, self.embedding_dim)
        self.ar_text_embedding = TokenEmbedding(
            self.embedding_dim, self.phoneme_vocab_size, self.p_dropout
        )
        self.ar_text_position = SinePositionalEmbedding(
            self.embedding_dim,
            dropout=0.1,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_len=max_seq_length,
        )
        self.ar_audio_embedding = TokenEmbedding(
            self.embedding_dim, self.vocab_size, self.p_dropout
        )
        self.ar_audio_position = SinePositionalEmbedding(
            self.embedding_dim,
            dropout=0.1,
            scale=False,
            alpha=True,
            max_batch_size=max_batch_size,
            max_seq_len=max_seq_length,
        )
        # Maps the last hidden state to logits over the audio vocabulary.
        self.ar_predict_layer = nn.Linear(self.hidden_dim, self.vocab_size, bias=False)
        self.h = TransformerDecoder(
            hidden_dim,
            n_layer,
            n_head,
            ffn_dim,
            vocab_size,
            max_seq_length,
            max_batch_size,
        )
        self._register_load_state_dict_pre_hook(self.load_hook)
    def load_hook(self, state_dict, prefix, *args):
        """Strip a leading ``model.`` from checkpoint keys, in place."""
        model_keys = [key for key in state_dict if key.startswith("model.")]
        for key in model_keys:
            new_key = key[len("model.") :]
            state_dict[new_key] = state_dict.pop(key)
    def init_cache(self, bsz: int = 0) -> nn.ModuleList:
        """Allocate one ``KVCacheNHD`` per layer.

        ``bsz`` of 0 falls back to the decoder's maximum batch size. The
        caches are moved to the device and dtype of ``bert_proj.bias``.
        """
        bsz = bsz or self.h.max_batch_size
        assert bsz <= self.h.max_batch_size
        seq_lens = self.h.max_seq_length
        device = self.bert_proj.bias.device
        dtype = self.bert_proj.bias.dtype
        return nn.ModuleList(
            [
                KVCacheNHD(bsz, seq_lens, self.n_head, self.head_dim)
                for _ in range(self.n_layer)
            ],
        ).to(device, dtype)
    def embed(
        self,
        x: List[torch.Tensor],
        y: torch.Tensor,
        bert_features: List[torch.Tensor],
    ):
        """Build the prefill input: embedded text followed by embedded prompt.

        Returns a nested tensor with one concatenated sequence per batch item.
        """
        x_nested = torch.nested.nested_tensor(x)
        assert x_nested.size(0) <= self.max_batch_size
        # Each feature tensor is transposed before the projection.
        bert_features_nested = torch.nested.nested_tensor(
            list(map(lambda t: t.transpose(0, 1), bert_features))
        )
        x_emb = self.ar_text_embedding.forward(x_nested)
        bert = self.bert_proj.forward(bert_features_nested)
        x_emb = x_emb + bert
        x_pos = self.ar_text_position.prefill(x_emb)
        y_nested = torch.nested.nested_tensor(list(y.unbind(0)))
        y_emb = self.ar_audio_embedding.forward(y_nested)
        y_pos = self.ar_audio_position.prefill(y_emb)
        xy_pos = torch.nested.nested_tensor(
            [torch.cat([x_pos[i], y_pos[i]]) for i in range(len(x))]
        )
        return xy_pos
    def capture(
        self,
        input_pos: Tensor,
        x: Tensor,
        x_dec: Tensor,
        kv_caches,
    ) -> CUDAGraph:
        """Record one decode step of ``self.h`` into a CUDA graph.

        The graph reads ``input_pos``, ``x`` and ``kv_caches`` and copies the
        result into ``x_dec``, so callers update those tensors in place
        before each replay and read ``x_dec`` afterwards.
        """
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        graph = torch.cuda.CUDAGraph()
        # Five eager warm-up passes on a side stream before recording.
        with torch.cuda.stream(s):
            for _ in range(5):
                self.h.forward(input_pos, x, kv_caches)
        torch.cuda.current_stream().wait_stream(s)
        with torch.cuda.graph(graph):
            x_dec.copy_(self.h.forward(input_pos, x, kv_caches))
        torch.cuda.synchronize()
        return graph
 # ─── CUDA Graph Runner ───────────────────────────────────────────────────────
 class CUDAGraphRunner:
    """Drives autoregressive decoding for a ``T2SDecoder``, optionally via a CUDA graph.

    The runner owns static buffers (``xy_pos_``, ``xy_dec_``, ``input_pos``)
    and a batch-size-1 KV cache. When a CUDA graph is captured it reads and
    writes exactly these tensors, so they are only ever updated in place.
    """

    # Upper bound on loop iterations per request; iteration 0 is the prefill.
    MAX_DECODE_STEPS = 1500

    def __init__(
        self,
        decoder_model: T2SDecoder,
        device: torch.device = torch.device("cpu"),
        dtype: torch.dtype = torch.float32,
    ) -> None:
        """Move the decoder to ``device``/``dtype`` and allocate static buffers."""
        assert device.type in {"cpu", "cuda", "mps", "xpu", "mtia"}
        assert dtype in {torch.float16, torch.bfloat16, torch.float32}
        self.device = device
        self.dtype = dtype
        self.decoder_model: T2SDecoder = decoder_model.to(self.device, self.dtype)
        self.graph: Optional[CUDAGraph] = None
        self.xy_pos_ = torch.rand(
            (1, 1, decoder_model.embedding_dim), device=device
        ).to(dtype)
        self.xy_dec_ = torch.rand(
            (1, 1, decoder_model.embedding_dim), device=device
        ).to(dtype)
        self.kv_cache = decoder_model.init_cache(1)
        # Allocate on the configured device; an unconditional ``.cuda()``
        # fails on CPU/MPS and picks the wrong GPU for ``cuda:N``.
        self.input_pos = torch.tensor([10], dtype=torch.int32, device=device)

    def _handle_request(self, request: T2SRequest):
        """Decode one request.

        Returns:
            ``(results, infer_speed)`` where ``results`` holds the first
            ``request.valid_length`` generated token tensors and
            ``infer_speed`` is in tokens per second (0.0 if not measured).
        """
        with self.device:
            for cache in self.kv_cache:
                cache.empty()

            decoder = self.decoder_model
            session = T2SSession(decoder, request, device=self.device, dtype=self.dtype)
            self.input_pos.copy_(session.input_pos)

            t1 = 0.0
            infer_speed = 0.0
            y = session.y
            bsz = y.size(0)
            last_step = self.MAX_DECODE_STEPS - 1
            # Capture only when the runner itself lives on a CUDA device.
            can_capture = (
                request.use_cuda_graph
                and self.device.type == "cuda"
                and torch.cuda.is_available()
            )

            for idx in tqdm(range(self.MAX_DECODE_STEPS)):
                if idx == 0:
                    xy_dec = decoder.h.prefill(
                        session.xy_pos, session.attn_mask_nested, self.kv_cache
                    )
                    # Keep only the last position of each prefilled sequence.
                    xy_dec = torch.stack([t[[-1]] for t in xy_dec.unbind()])
                else:
                    if can_capture and self.graph is None:
                        self.xy_pos_.copy_(session.xy_pos)
                        self.graph = decoder.capture(
                            self.input_pos,
                            self.xy_pos_,
                            self.xy_dec_,
                            kv_caches=self.kv_cache,
                        )
                    if self.graph is not None:
                        self.xy_pos_.copy_(session.xy_pos)
                        self.graph.replay()
                        xy_dec = self.xy_dec_.clone()
                    else:
                        xy_dec = decoder.h.forward(
                            self.input_pos,
                            session.xy_pos,
                            self.kv_cache,
                        )

                logits = decoder.ar_predict_layer(xy_dec[:, -1])
                self.input_pos.add_(1)

                if idx == 0:
                    # Never emit EOS as the very first generated token.
                    logits[:, -1] = float("-inf")

                samples = session.sampler.sample(
                    logits=logits,
                    previous_tokens=session.y,
                    top_k=request.top_k,
                    top_p=request.top_p,
                    repetition_penalty=request.repetition_penalty,
                    temperature=request.temperature,
                )
                session.y = torch.cat([session.y, samples], dim=1)

                argmax_token = torch.argmax(logits, dim=-1)
                sample_token = samples.squeeze(1)
                EOS_mask = (argmax_token == decoder.EOS) | (
                    sample_token == decoder.EOS
                )
                newly_done_mask = EOS_mask & (~session.completed)
                # Store a result for EVERY sequence that finished in this
                # step; recording only the first one would leave the others
                # marked completed with no result.
                for done in newly_done_mask.nonzero().flatten().tolist():
                    # Drop the prompt prefix and the trailing EOS token.
                    session.y_results[done] = session.y[done, session.y_len : -1]
                session.completed.logical_or_(newly_done_mask)

                if torch.all(session.completed).item():
                    if session.y.size(1) == 0:
                        session.y = torch.cat(
                            [session.y, torch.zeros_like(samples)], dim=1
                        )
                        tqdm.write("Bad Zero Prediction")
                    else:
                        tqdm.write(
                            f"T2S Decoding EOS {session.prefill_len.tolist().__str__().strip('[]')} -> \n"
                            f"{[i.size(0) for i in session.y_results].__str__().strip('[]')}"
                        )
                        # The timer starts after iteration 2; if decoding
                        # ended earlier there is nothing meaningful to report.
                        if t1 > 0.0:
                            infer_speed = (idx - 1) / (time.perf_counter() - t1)
                        tqdm.write(f"Infer Speed: {infer_speed:.2f} token/s")
                    break

                if (
                    request.early_stop_num != -1
                    and (session.y.size(1) - session.y_len) > request.early_stop_num
                ) or idx == last_step:
                    # Out of budget: return whatever each unfinished sequence has.
                    for i in range(bsz):
                        if not session.completed[i].item():
                            session.y_results[i] = session.y[i, session.y_len :]
                            session.completed[i] = True
                    break

                y_emb = decoder.ar_audio_embedding(session.y[:, -1:])
                session.xy_pos = decoder.ar_audio_position.forward(
                    self.input_pos - session.x_lens, y_emb
                )

                if idx == 2:
                    t1 = time.perf_counter()

                if idx % 100 == 0 and self.device.type == "cuda":
                    torch.cuda.empty_cache()

            if self.device.type == "cuda":
                torch.cuda.empty_cache()

            return session.y_results[: request.valid_length], infer_speed

    def generate(self, request: T2SRequest) -> T2SResult:
        """Run ``request`` and wrap the outcome; exceptions become an "Error" result."""
        try:
            result, infer_speed = self._handle_request(request)
            t2s_result = T2SResult(
                result=result, infer_speed=infer_speed, status="Success"
            )
        except Exception as e:
            t2s_result = T2SResult(
                status="Error", exception=e, traceback=traceback.format_exc()
            )
        return t2s_result

    @staticmethod
    def load_decoder(weights_path, max_batch_size=1) -> T2SDecoder:
        """Load a checkpoint with ``config`` and ``weight`` entries into an eval-mode decoder."""
        print(
            f"Loading Text2Semantic Weights from {weights_path} with CUDA Graph (SDPA) Implement"
        )
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects, so
        # only load checkpoints from trusted sources.
        dict_s1 = torch.load(
            weights_path, map_location="cpu", weights_only=False
        )
        config = dict_s1["config"]
        decoder = T2SDecoder(config, max_batch_size=max_batch_size)
        state_dict = dict_s1["weight"]
        decoder.load_state_dict(state_dict)
        return decoder.eval()
--- a/GPT_SoVITS/AR/models/utils.py
+++ b/GPT_SoVITS/AR/models/utils.py
@ -262,7 +262,7 @@ def make_reject_y(y_o, y_lens):
    reject_y = []
    reject_y_lens = []
    for b in range(bs):
-        process_item_idx = torch.randint(0, 1, size=(1,))[0]
+        process_item_idx = torch.randint(0, 2, size=(1,))[0]
        if process_item_idx == 0:
            new_y = repeat_P(y_o[b])
            reject_y.append(new_y)
--- a/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py
+++ b/GPT_SoVITS/AR/modules/patched_mha_with_cache_onnx.py
@ -8,30 +8,30 @@ def multi_head_attention_forward_patched(
    query,
    key,
    value,
-    embed_dim_to_check: int,
+    embed_dim_to_check,
-    num_heads: int,
+    num_heads,
    in_proj_weight,
-    in_proj_bias: Optional[Tensor],
+    in_proj_bias,
-    bias_k: Optional[Tensor],
+    bias_k,
-    bias_v: Optional[Tensor],
+    bias_v,
-    add_zero_attn: bool,
+    add_zero_attn,
-    dropout_p: float,
+    dropout_p,
-    out_proj_weight: Tensor,
+    out_proj_weight,
-    out_proj_bias: Optional[Tensor],
+    out_proj_bias,
-    training: bool = True,
+    training=True,
-    key_padding_mask: Optional[Tensor] = None,
+    key_padding_mask=None,
-    need_weights: bool = True,
+    need_weights=True,
-    attn_mask: Optional[Tensor] = None,
+    attn_mask=None,
-    use_separate_proj_weight: bool = False,
+    use_separate_proj_weight=False,
-    q_proj_weight: Optional[Tensor] = None,
+    q_proj_weight=None,
-    k_proj_weight: Optional[Tensor] = None,
+    k_proj_weight=None,
-    v_proj_weight: Optional[Tensor] = None,
+    v_proj_weight=None,
-    static_k: Optional[Tensor] = None,
+    static_k=None,
-    static_v: Optional[Tensor] = None,
+    static_v=None,
-    average_attn_weights: bool = True,
+    average_attn_weights=True,
-    is_causal: bool = False,
+    is_causal=False,
    cache=None,
-) -> Tuple[Tensor, Optional[Tensor]]:
+):
    # set up shape vars
    _, _, embed_dim = query.shape
    attn_mask = _canonical_mask(
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -499,7 +499,7 @@ class TTS:
        if if_lora_v3 == True and os.path.exists(path_sovits) == False:
            info = path_sovits + i18n("SoVITS %s 底模缺失，无法加载相应 LoRA 权重" % model_version)
-            raise FileExistsError(info)
+            raise FileNotFoundError(info)
        # dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
        dict_s2 = load_sovits_new(weights_path)
@ -1578,16 +1578,15 @@ class TTS:
                max_audio = np.abs(audio).max()
                if max_audio > 1:
                    audio /= max_audio
-            audio = (audio * 32768).astype(np.int16)
+                audio = (audio * 32768).astype(np.int16)
            else:
                audio = audio.cpu().numpy()
                audio = (audio * 32768).astype(np.int16)
            t2 = time.perf_counter()
            print(f"超采样用时：{t2 - t1:.3f}s")
        else:
            # audio = audio.float() * 32768
            # audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy()
            audio = audio.cpu().numpy()
-
+            audio = (audio * 32768).astype(np.int16)
        audio = (audio * 32768).astype(np.int16)
        # try:
@ -1768,7 +1767,10 @@ class TTS:
            pos += chunk_len * upsample_rate
        audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
-        audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
+        if padding_len > 0:
            audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
        else:
            audio = audio[overlapped_len * upsample_rate :]
        audio_fragments = []
        for feat_len in feat_lens:
--- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
+++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@ -92,7 +92,7 @@ def cut0(inp):
    if not set(inp).issubset(punctuation):
        return inp
    else:
-        return "/n"
+        return "\n"
 # 凑四句一切
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
--- a/GPT_SoVITS/module/distrib.py
+++ b/GPT_SoVITS/module/distrib.py
@ -87,7 +87,7 @@ def sync_buffer(buffers, average=True):
    for buffer, handle in handles:
        handle.wait()
        if average:
-            buffer.data /= world_size
+            buffer.data /= world_size()
 def sync_grad(params):
--- a/GPT_SoVITS/s2_train_v3_lora.py
+++ b/GPT_SoVITS/s2_train_v3_lora.py
@ -55,6 +55,10 @@ def main():
        n_gpus = torch.cuda.device_count()
    else:
        n_gpus = 1
    if n_gpus <= 1:
        run(0, n_gpus, hps)
        return
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(randint(20000, 55555))
@ -77,12 +81,14 @@ def run(rank, n_gpus, hps):
        writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
-    dist.init_process_group(
+    use_ddp = n_gpus > 1
-        backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
+    if use_ddp:
-        init_method="env://?use_libuv=False",
+        dist.init_process_group(
-        world_size=n_gpus,
+            backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
-        rank=rank,
+            init_method="env://?use_libuv=False",
-    )
+            world_size=n_gpus,
            rank=rank,
        )
    torch.manual_seed(hps.train.seed)
    if torch.cuda.is_available():
        torch.cuda.set_device(rank)
@ -118,15 +124,20 @@ def run(rank, n_gpus, hps):
        shuffle=True,
    )
    collate_fn = TextAudioSpeakerCollate()
-    train_loader = DataLoader(
+    worker_count = 0 if os.name == "nt" and n_gpus <= 1 else min(2 if os.name == "nt" else 5, os.cpu_count() or 1)
-        train_dataset,
+    loader_kwargs = dict(
-        num_workers=5,
+        num_workers=worker_count,
        shuffle=False,
-        pin_memory=True,
+        pin_memory=torch.cuda.is_available(),
        collate_fn=collate_fn,
        batch_sampler=train_sampler,
-        persistent_workers=True,
+    )
-        prefetch_factor=3,
+    if worker_count > 0:
        loader_kwargs["persistent_workers"] = True
        loader_kwargs["prefetch_factor"] = 2 if os.name == "nt" else 3
    train_loader = DataLoader(
        train_dataset,
        **loader_kwargs,
    )
    save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank)
    os.makedirs(save_root, exist_ok=True)
@ -156,7 +167,9 @@ def run(rank, n_gpus, hps):
    def model2cuda(net_g, rank):
        if torch.cuda.is_available():
-            net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True)
+            net_g = net_g.cuda(rank)
            if use_ddp:
                net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
        else:
            net_g = net_g.to(device)
        return net_g
@ -242,6 +255,8 @@ def run(rank, n_gpus, hps):
                None,
            )
        scheduler_g.step()
    if use_ddp and dist.is_initialized():
        dist.destroy_process_group()
    print("training done")
--- a/GPT_SoVITS/text/chinese2.py
+++ b/GPT_SoVITS/text/chinese2.py
@ -180,10 +180,15 @@ def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) ->
 def _g2p(segments):
    phones_list = []
    word2ph = []
-    for seg in segments:
+    g2pw_batch_results = []
    g2pw_batch_cursor = 0
    processed_segments = [re.sub("[a-zA-Z]+", "", seg) for seg in segments]
    if is_g2pw:
        batch_inputs = [seg for seg in processed_segments if seg]
        g2pw_batch_results = g2pw._g2pw(batch_inputs) if batch_inputs else []
    for seg in processed_segments:
        pinyins = []
        # Replace all English words in the sentence
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = psg.lcut(seg)
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        initials = []
@ -204,8 +209,10 @@ def _g2p(segments):
            finals = sum(finals, [])
            print("pypinyin结果", initials, finals)
        else:
-            # g2pw采用整句推理
+            # g2pw采用整句推理（批量推理，逐句取结果）
-            pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
+            if seg:
                pinyins = g2pw_batch_results[g2pw_batch_cursor]
                g2pw_batch_cursor += 1
            pre_word_length = 0
            for word, pos in seg_cut:
--- a/GPT_SoVITS/text/g2pw/dataset.py
+++ b/GPT_SoVITS/text/g2pw/dataset.py
@ -18,6 +18,7 @@ Credits
 from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Tuple
 import numpy as np
@ -37,6 +38,8 @@ def prepare_onnx_input(
    use_mask: bool = False,
    window_size: int = None,
    max_len: int = 512,
    char2id: Optional[Dict[str, int]] = None,
    char_phoneme_masks: Optional[Dict[str, List[int]]] = None,
 ) -> Dict[str, np.array]:
    if window_size is not None:
        truncated_texts, truncated_query_ids = _truncate_texts(
@ -48,33 +51,88 @@ def prepare_onnx_input(
    phoneme_masks = []
    char_ids = []
    position_ids = []
    tokenized_cache = {}
    if char2id is None:
        char2id = {char: idx for idx, char in enumerate(chars)}
    if use_mask:
        if char_phoneme_masks is None:
            char_phoneme_masks = {
                char: [1 if i in char2phonemes[char] else 0 for i in range(len(labels))]
                for char in char2phonemes
            }
    else:
        full_phoneme_mask = [1] * len(labels)
    for idx in range(len(texts)):
        text = (truncated_texts if window_size else texts)[idx].lower()
        query_id = (truncated_query_ids if window_size else query_ids)[idx]
-        try:
+        cached = tokenized_cache.get(text)
-            tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
+        if cached is None:
-        except Exception:
+            try:
-            print(f'warning: text "{text}" is invalid')
+                tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
-            return {}
+            except Exception:
                print(f'warning: text "{text}" is invalid')
                return {}
-        text, query_id, tokens, text2token, token2text = _truncate(
+            if len(tokens) <= max_len - 2:
-            max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
+                processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
-        )
+                shared_input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
                shared_token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
                shared_attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
                cached = {
                    "is_short": True,
                    "tokens": tokens,
                    "text2token": text2token,
                    "token2text": token2text,
                    "input_id": shared_input_id,
                    "token_type_id": shared_token_type_id,
                    "attention_mask": shared_attention_mask,
                }
            else:
                cached = {
                    "is_short": False,
                    "tokens": tokens,
                    "text2token": text2token,
                    "token2text": token2text,
                }
            tokenized_cache[text] = cached
-        processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
+        if cached["is_short"]:
            text_for_query = text
            query_id_for_query = query_id
            text2token_for_query = cached["text2token"]
            input_id = cached["input_id"]
            token_type_id = cached["token_type_id"]
            attention_mask = cached["attention_mask"]
        else:
            (
                text_for_query,
                query_id_for_query,
                tokens_for_query,
                text2token_for_query,
                _token2text_for_query,
            ) = _truncate(
                max_len=max_len,
                text=text,
                query_id=query_id,
                tokens=cached["tokens"],
                text2token=cached["text2token"],
                token2text=cached["token2text"],
            )
            processed_tokens = ["[CLS]"] + tokens_for_query + ["[SEP]"]
            input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
            token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
            attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
-        input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
+        query_char = text_for_query[query_id_for_query]
-        token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
+        if use_mask:
-        attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
+            phoneme_mask = char_phoneme_masks[query_char]
-
+        else:
-        query_char = text[query_id]
+            phoneme_mask = full_phoneme_mask
-        phoneme_mask = (
+        char_id = char2id[query_char]
-            [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
+        position_id = text2token_for_query[query_id_for_query] + 1  # [CLS] token locate at first place
        )
        char_id = chars.index(query_char)
        position_id = text2token[query_id] + 1  # [CLS] token locate at first place
        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
@ -83,10 +141,15 @@ def prepare_onnx_input(
        char_ids.append(char_id)
        position_ids.append(position_id)
    max_token_length = max(len(seq) for seq in input_ids)
    def _pad_sequences(sequences, pad_value=0):
        return [seq + [pad_value] * (max_token_length - len(seq)) for seq in sequences]
    outputs = {
-        "input_ids": np.array(input_ids).astype(np.int64),
+        "input_ids": np.array(_pad_sequences(input_ids, pad_value=0)).astype(np.int64),
-        "token_type_ids": np.array(token_type_ids).astype(np.int64),
+        "token_type_ids": np.array(_pad_sequences(token_type_ids, pad_value=0)).astype(np.int64),
-        "attention_masks": np.array(attention_masks).astype(np.int64),
+        "attention_masks": np.array(_pad_sequences(attention_masks, pad_value=0)).astype(np.int64),
        "phoneme_masks": np.array(phoneme_masks).astype(np.float32),
        "char_ids": np.array(char_ids).astype(np.int64),
        "position_ids": np.array(position_ids).astype(np.int64),
--- a/GPT_SoVITS/text/g2pw/onnx_api.py
+++ b/GPT_SoVITS/text/g2pw/onnx_api.py
@ -10,7 +10,6 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 import onnxruntime
 import requests
 import torch
 from opencc import OpenCC
 from pypinyin import Style, pinyin
 from transformers.models.auto.tokenization_auto import AutoTokenizer
@ -22,9 +21,8 @@ from .utils import load_config
 onnxruntime.set_default_logger_severity(3)
 try:
    onnxruntime.preload_dlls()
-except:
+except Exception:
    pass
    # traceback.print_exc()
 warnings.filterwarnings("ignore")
 model_version = "1.1"
@ -55,6 +53,24 @@ def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[Lis
    return all_preds, all_confidences
 def _load_json_from_candidates(filename: str, candidate_dirs: List[str]) -> Dict[str, Any]:
    """Load ``filename`` as JSON from the first candidate directory containing it.

    Args:
        filename: Name of the JSON file to look for.
        candidate_dirs: Directories to probe, in priority order. Falsy entries
            (``None`` or ``""``) are ignored.

    Returns:
        The decoded JSON content of the first match.

    Raises:
        FileNotFoundError: If no candidate directory contains ``filename``.
    """
    # Both generators are lazy, so probing stops at the first existing file.
    probe_paths = (os.path.join(directory, filename) for directory in candidate_dirs if directory)
    located = next((path for path in probe_paths if os.path.exists(path)), None)
    if located is None:
        raise FileNotFoundError(f"Cannot locate {filename} in candidate dirs: {candidate_dirs}")
    with open(located, "r", encoding="utf-8") as handle:
        return json.load(handle)
 def _find_first_existing_file(*paths: str) -> str:
    """Return the first of ``paths`` that exists on disk.

    Falsy entries (``None`` or ``""``) are skipped without touching the
    filesystem.

    Raises:
        FileNotFoundError: If none of the given paths exists.
    """
    match = next((candidate for candidate in paths if candidate and os.path.exists(candidate)), None)
    if match is None:
        raise FileNotFoundError(f"Files not found: {paths}")
    return match
 def download_and_decompress(model_dir: str = "G2PWModel/"):
    if not os.path.exists(model_dir):
        parent_directory = os.path.dirname(model_dir)
@ -62,7 +78,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
        extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
        extract_dir_new = os.path.join(parent_directory, "G2PWModel")
        print("Downloading g2pw model...")
-        modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"  # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
+        modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"
        with requests.get(modelscope_url, stream=True) as r:
            r.raise_for_status()
            with open(zip_dir, "wb") as f:
@ -79,7 +95,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
    return model_dir
-class G2PWOnnxConverter:
+class _G2PWBaseOnnxConverter:
    def __init__(
        self,
        model_dir: str = "G2PWModel/",
@ -87,33 +103,16 @@ class G2PWOnnxConverter:
        model_source: str = None,
        enable_non_tradional_chinese: bool = False,
    ):
-        uncompress_path = download_and_decompress(model_dir)
+        self.model_dir = download_and_decompress(model_dir)
-
+        self.config = load_config(config_path=os.path.join(self.model_dir, "config.py"), use_default=True)
        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
        if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
            self.session_g2pW = onnxruntime.InferenceSession(
                os.path.join(uncompress_path, "g2pW.onnx"),
                sess_options=sess_options,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
        else:
            self.session_g2pW = onnxruntime.InferenceSession(
                os.path.join(uncompress_path, "g2pW.onnx"),
                sess_options=sess_options,
                providers=["CPUExecutionProvider"],
            )
        self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
        self.model_source = model_source if model_source else self.config.model_source
        self.enable_opencc = enable_non_tradional_chinese
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
-        polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
+        polyphonic_chars_path = os.path.join(self.model_dir, "POLYPHONIC_CHARS.txt")
-        monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
+        monophonic_chars_path = os.path.join(self.model_dir, "MONOPHONIC_CHARS.txt")
        self.polyphonic_chars = [
            line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
        ]
@ -149,31 +148,47 @@ class G2PWOnnxConverter:
        )
        self.chars = sorted(list(self.char2phonemes.keys()))
        self.char2id = {char: idx for idx, char in enumerate(self.chars)}
        self.char_phoneme_masks = (
            {
                char: [1 if i in self.char2phonemes[char] else 0 for i in range(len(self.labels))]
                for char in self.char2phonemes
            }
            if self.config.use_mask
            else None
        )
        self.polyphonic_chars_new = set(self.chars)
        for char in self.non_polyphonic:
-            if char in self.polyphonic_chars_new:
+            self.polyphonic_chars_new.discard(char)
                self.polyphonic_chars_new.remove(char)
        self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
        for char in self.non_monophonic:
-            if char in self.monophonic_chars_dict:
+            self.monophonic_chars_dict.pop(char, None)
                self.monophonic_chars_dict.pop(char)
-        self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
+        default_asset_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "G2PWModel"))
        candidate_asset_dirs = [self.model_dir, default_asset_dir]
        self.bopomofo_convert_dict = _load_json_from_candidates(
            "bopomofo_to_pinyin_wo_tune_dict.json", candidate_asset_dirs
        )
        self.char_bopomofo_dict = _load_json_from_candidates("char_bopomofo_dict.json", candidate_asset_dirs)
        with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
            self.bopomofo_convert_dict = json.load(fr)
        self.style_convert_func = {
            "bopomofo": lambda x: x,
            "pinyin": self._convert_bopomofo_to_pinyin,
        }[style]
        with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
            self.char_bopomofo_dict = json.load(fr)
        if self.enable_opencc:
            self.cc = OpenCC("s2tw")
        self.enable_sentence_dedup = os.getenv("g2pw_sentence_dedup", "true").strip().lower() in {
            "1",
            "true",
            "yes",
            "y",
            "on",
        }
        # 聚焦到多音字附近上下文，默认左右各16字；设为0表示关闭裁剪（整句）。
        self.polyphonic_context_chars = max(0, int(os.getenv("g2pw_polyphonic_context_chars", "16")))
    def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
        tone = bopomofo[-1]
@ -181,9 +196,8 @@ class G2PWOnnxConverter:
        component = self.bopomofo_convert_dict.get(bopomofo[:-1])
        if component:
            return component + tone
-        else:
+        print(f'Warning: "{bopomofo}" cannot convert to pinyin')
-            print(f'Warning: "{bopomofo}" cannot convert to pinyin')
+        return None
            return None
    def __call__(self, sentences: List[str]) -> List[List[str]]:
        if isinstance(sentences, str):
@ -197,51 +211,147 @@ class G2PWOnnxConverter:
                translated_sentences.append(translated_sent)
            sentences = translated_sentences
-        texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
+        texts, model_query_ids, result_query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
        if len(texts) == 0:
            # sentences no polyphonic words
            return partial_results
-        onnx_input = prepare_onnx_input(
+        model_input = prepare_onnx_input(
            tokenizer=self.tokenizer,
            labels=self.labels,
            char2phonemes=self.char2phonemes,
            chars=self.chars,
            texts=texts,
-            query_ids=query_ids,
+            query_ids=model_query_ids,
            use_mask=self.config.use_mask,
            window_size=None,
            char2id=self.char2id,
            char_phoneme_masks=self.char_phoneme_masks,
        )
-        preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
+        if not model_input:
            return partial_results
        if self.enable_sentence_dedup:
            preds, _confidences = self._predict_with_sentence_dedup(model_input=model_input, texts=texts)
        else:
            preds, _confidences = self._predict(model_input=model_input)
        if self.config.use_char_phoneme:
            preds = [pred.split(" ")[1] for pred in preds]
        results = partial_results
-        for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
+        for sent_id, query_id, pred in zip(sent_ids, result_query_ids, preds):
            results[sent_id][query_id] = self.style_convert_func(pred)
        return results
-    def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
+    def _prepare_data(
-        texts, query_ids, sent_ids, partial_results = [], [], [], []
+        self, sentences: List[str]
    ) -> Tuple[List[str], List[int], List[int], List[int], List[List[str]]]:
        texts, model_query_ids, result_query_ids, sent_ids, partial_results = [], [], [], [], []
        for sent_id, sent in enumerate(sentences):
            # pypinyin works well for Simplified Chinese than Traditional Chinese
            sent_s = tranditional_to_simplified(sent)
            pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
            partial_result = [None] * len(sent)
            polyphonic_indices: List[int] = []
            for i, char in enumerate(sent):
                if char in self.polyphonic_chars_new:
-                    texts.append(sent)
+                    polyphonic_indices.append(i)
                    query_ids.append(i)
                    sent_ids.append(sent_id)
                elif char in self.monophonic_chars_dict:
                    partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
                elif char in self.char_bopomofo_dict:
                    partial_result[i] = pypinyin_result[i][0]
                    # partial_result[i] =  self.style_convert_func(self.char_bopomofo_dict[char][0])
                else:
                    partial_result[i] = pypinyin_result[i][0]
            if polyphonic_indices:
                if self.polyphonic_context_chars > 0:
                    left = max(0, polyphonic_indices[0] - self.polyphonic_context_chars)
                    right = min(len(sent), polyphonic_indices[-1] + self.polyphonic_context_chars + 1)
                    sent_for_predict = sent[left:right]
                    query_offset = left
                else:
                    sent_for_predict = sent
                    query_offset = 0
                for index in polyphonic_indices:
                    texts.append(sent_for_predict)
                    model_query_ids.append(index - query_offset)
                    result_query_ids.append(index)
                    sent_ids.append(sent_id)
            partial_results.append(partial_result)
-        return texts, query_ids, sent_ids, partial_results
+        return texts, model_query_ids, result_query_ids, sent_ids, partial_results
    def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
        """Run the model on a prepared input batch; subclasses must override this.

        Args:
            model_input: Mapping of input names to batched values, as produced
                by ``prepare_onnx_input``.

        Returns:
            A ``(predictions, confidences)`` pair, one entry per query.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError
    def _predict_with_sentence_dedup(
        self, model_input: Dict[str, Any], texts: List[str]
    ) -> Tuple[List[str], List[float]]:
        """Predict per distinct text so repeated sentences share one token row.

        Queries are grouped by identical text. For a group with more than one
        query, only the first row of ``input_ids``, ``token_type_ids`` and
        ``attention_masks`` is passed on, while the remaining inputs keep one
        row per query. Results are written back in the original query order.

        Args:
            model_input: Mapping of input names to arrays indexed by query.
            texts: The text of each query, aligned with the rows of
                ``model_input``.

        Returns:
            A ``(predictions, confidences)`` pair aligned with ``texts``.
        """
        total = len(texts)
        if total <= 1:
            return self._predict(model_input=model_input)

        # Map each distinct text to the positions of the queries that use it.
        rows_by_text: Dict[str, List[int]] = {}
        for position, sentence in enumerate(texts):
            rows_by_text.setdefault(sentence, []).append(position)

        # As many groups as queries means nothing repeats: predict in one batch.
        if len(rows_by_text) == total:
            return self._predict(model_input=model_input)

        preds: List[str] = [""] * total
        confidences: List[float] = [0.0] * total
        for rows in rows_by_text.values():
            batch = {}
            for name, value in model_input.items():
                batch[name] = value[rows]
            if len(rows) > 1:
                # NOTE(review): assumes the model accepts a single token row
                # alongside several query rows — confirm against the ONNX graph.
                for name in ("input_ids", "token_type_ids", "attention_masks"):
                    batch[name] = batch[name][:1]
            batch_preds, batch_confidences = self._predict(model_input=batch)
            for row, pred, confidence in zip(rows, batch_preds, batch_confidences):
                preds[row] = pred
                confidences[row] = confidence
        return preds, confidences
 class G2PWOnnxConverter(_G2PWBaseOnnxConverter):
    """G2PW converter whose predictions come from an onnxruntime session."""

    def __init__(
        self,
        model_dir: str = "G2PWModel/",
        style: str = "bopomofo",
        model_source: str = None,
        enable_non_tradional_chinese: bool = False,
    ):
        """Initialise the shared converter state, then open the ONNX session.

        The model file is looked up in ``self.model_dir`` as ``g2pW.onnx``
        first and ``g2pw.onnx`` second. The CUDA execution provider is placed
        ahead of the CPU one whenever onnxruntime reports it as available.
        """
        super().__init__(
            model_dir=model_dir,
            style=style,
            model_source=model_source,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )

        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        sess_options.intra_op_num_threads = 2

        onnx_path = _find_first_existing_file(
            os.path.join(self.model_dir, "g2pW.onnx"),
            os.path.join(self.model_dir, "g2pw.onnx"),
        )

        # CPU is always available as a fallback; CUDA goes first when present.
        providers = ["CPUExecutionProvider"]
        if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
            providers.insert(0, "CUDAExecutionProvider")

        self.session_g2pw = onnxruntime.InferenceSession(
            onnx_path,
            sess_options=sess_options,
            providers=providers,
        )

    def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
        """Run ``predict`` on the ONNX session using this converter's labels."""
        return predict(session=self.session_g2pw, onnx_input=model_input, labels=self.labels)
--- a/GPT_SoVITS/text/symbols.py
+++ b/GPT_SoVITS/text/symbols.py
@ -396,4 +396,5 @@ arpa = {
 symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
 symbols = sorted(set(symbols))
 if __name__ == "__main__":
    print(symbols)
    print(len(symbols))
--- a/README.md
+++ b/README.md
@ -48,6 +48,8 @@ https://github.com/RVC-Boss/GPT-SoVITS/assets/129054828/05bee1fa-bdd8-4d85-9350-
 请不要尬黑GPT-SoVITS推理速度慢，谢谢！
 CPU-Optimized Inference Version：https://github.com/baicai-1145/GPT-SoVITS-CPUFast
 **User guide: [简体中文](https://www.yuque.com/baicaigongchang1145haoyuangong/ib3g1e) | [English](https://rentry.co/GPT-SoVITS-guide#/)**
 ## Installation
--- a/Ref_Audio_Selector/init.py
+++ b/Ref_Audio_Selector/init.py
--- a/Ref_Audio_Selector/common/init.py
+++ b/Ref_Audio_Selector/common/init.py
--- a/Ref_Audio_Selector/common/common.py
+++ b/Ref_Audio_Selector/common/common.py
@ -0,0 +1,156 @@
 from tools import my_utils
 from config import python_exec, is_half
 import subprocess
 import sys
 import os
 class RefAudioListManager:
    """Index the ``.wav`` files below a directory tree, grouped by category.

    Files located directly in ``root_dir`` belong to the ``'default'``
    category. Files in a sub-directory use that directory's path relative to
    ``root_dir`` with every ``os.sep`` removed (``a/b`` becomes ``'ab'``).
    """

    def __init__(self, root_dir):
        """Walk ``root_dir`` and record the absolute path of every ``.wav`` file."""
        self.audio_dict = {'default': []}
        base_dir = os.path.abspath(root_dir)
        for current_dir, _subdirs, filenames in os.walk(base_dir):
            rel_dir = os.path.relpath(current_dir, base_dir)
            category = 'default' if rel_dir == '.' else rel_dir.replace(os.sep, '')
            # current_dir is already absolute, so the joined paths are too.
            wav_paths = [os.path.join(current_dir, name) for name in filenames if name.endswith('.wav')]
            if wav_paths:
                # A category only gets an entry once it has at least one file.
                self.audio_dict.setdefault(category, []).extend(wav_paths)

    def get_audio_list(self):
        """Return the category -> list-of-paths mapping (the live dict, not a copy)."""
        return self.audio_dict

    def get_flattened_audio_list(self):
        """Return every indexed audio path as one flat list, category by category."""
        return [path for paths in self.audio_dict.values() for path in paths]

    def get_ref_audio_list(self):
        """Return one info dict per audio file.

        Each dict has ``'emotion'`` (``"<category>-<file stem>"``),
        ``'ref_path'`` (the audio path) and ``'ref_text'`` (the file stem).
        """
        located = (
            (category, path, os.path.splitext(os.path.basename(path))[0])
            for category, paths in self.audio_dict.items()
            for path in paths
        )
        return [
            {
                'emotion': f"{category}-{stem}",
                'ref_path': path,
                'ref_text': stem,
            }
            for category, path, stem in located
        ]
 def batch_clean_paths(paths):
    """Run ``my_utils.clean_path`` over every entry of ``paths``.

    Args:
        paths (list[str]): The paths to process.

    Returns:
        list[str]: A new list holding the cleaned paths, in the same order.
    """
    return [my_utils.clean_path(raw_path) for raw_path in paths]
 def read_text_file_to_list(file_path):
    """Read a UTF-8 text file and return its lines without line terminators."""
    # UTF-8 is requested explicitly so non-ASCII (e.g. Chinese) text decodes
    # correctly whatever the platform default encoding is.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content.splitlines()
 def get_filename_without_extension(file_path):
    """Return the final component of ``file_path`` with its last extension removed.

    Parameters:
    file_path (str): The full path to the file.

    Returns:
    str: The file name without its extension.
    """
    # splitext only strips the last suffix, so "a.tar.gz" yields "a.tar".
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return stem
 def read_file(file_path):
    """Return the entire content of a UTF-8 text file as a single string."""
    # The context manager closes the file as soon as the read completes.
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
 def write_text_to_file(text, output_file_path):
    """Write ``text`` to ``output_file_path`` as UTF-8, replacing any existing file.

    An ``IOError`` is reported on stdout instead of being raised; a success
    message is printed otherwise. Nothing is returned in either case.
    """
    try:
        with open(output_file_path, 'w', encoding='utf-8') as sink:
            sink.write(text)
    except IOError as e:
        print(f"Error occurred while writing to the file: {e}")
        return
    # Only reached when the write completed without an IOError.
    print(f"Text successfully written to file: {output_file_path}")
 def check_path_existence_and_return(path):
    """Return ``path`` unchanged when it exists, otherwise an empty string.

    :param path: File or directory path to check (string).
    :return: The original path if it exists; ``""`` if it does not.
    """
    return path if os.path.exists(path) else ""
 def open_file(filepath):
    """Open ``filepath`` with the platform's default handler.

    macOS uses ``open``, Windows uses ``os.startfile`` and other POSIX
    systems use ``xdg-open``. Nothing happens on any other platform.
    """
    # macOS must be tested first: os.name is also 'posix' there.
    if sys.platform.startswith('darwin'):
        launcher = 'open'
    elif os.name == 'nt':
        os.startfile(filepath)
        return
    elif os.name == 'posix':
        launcher = 'xdg-open'
    else:
        return
    subprocess.run([launcher, filepath])
 def start_new_service(script_path):
    """Launch ``script_path`` with ``python_exec`` in a new terminal window.

    Windows uses ``start cmd /k``; every other platform uses ``xterm -e``.

    Returns:
        The ``subprocess.Popen`` handle of the spawned shell command. The
        caller may call ``terminate()`` or ``kill()`` on it to stop it.
    """
    # NOTE(review): the command is an unquoted string run with shell=True, so
    # paths containing spaces or shell metacharacters will not be passed
    # through intact — confirm callers only supply trusted, simple paths.
    terminal_prefix = 'start cmd /k' if sys.platform.startswith('win') else 'xterm -e'
    cmd = f'{terminal_prefix} {python_exec} {script_path}'
    return subprocess.Popen(cmd, shell=True)
 if __name__ == '__main__':
    # Manual smoke test for batch_clean_paths: one path with mixed separators
    # and one wrapped in literal double quotes.
    # The variables are deliberately not called `dir`, which would shadow the builtin.
    mixed_separator_dir = r'C:\Users\Administrator\Desktop/test'
    quoted_dir = r'"C:\Users\Administrator\Desktop\test2"'
    mixed_separator_dir, quoted_dir = batch_clean_paths([mixed_separator_dir, quoted_dir])
    print(mixed_separator_dir, quoted_dir)
--- a/Ref_Audio_Selector/common/model_manager.py
+++ b/Ref_Audio_Selector/common/model_manager.py
@ -0,0 +1,46 @@
 import os
 import re
 # Default pretrained checkpoints; each is always the first entry of the
 # corresponding list built by get_sovits_model_names / get_gpt_model_names.
 pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth"
 pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt"
 # Directories scanned for additional weights (paths are relative to the
 # current working directory).
 SoVITS_weight_root = "SoVITS_weights"
 GPT_weight_root = "GPT_weights"
 # Import-time side effect: make sure both weight directories exist so the
 # os.listdir calls in the getters below cannot fail on a fresh checkout.
 os.makedirs(SoVITS_weight_root, exist_ok=True)
 os.makedirs(GPT_weight_root, exist_ok=True)
 # Speaker-verification model descriptors keyed by model name.
 # NOTE(review): the task/model/model_revision keys look like arguments for a
 # model-loading pipeline defined elsewhere - confirm against the consumer.
 speaker_verification_models = {
    'speech_campplus_sv_zh-cn_16k-common': {
        'task': 'speaker-verification',
        'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_campplus_sv_zh-cn_16k-common',
        'model_revision': 'v1.0.0'
    },
    'speech_eres2net_sv_zh-cn_16k-common': {
        'task': 'speaker-verification',
        'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_eres2net_sv_zh-cn_16k-common',
        'model_revision': 'v1.0.5'
    }
 }
 def custom_sort_key(s):
    """
    Build a natural-sort key for ``s`` so that embedded numbers compare numerically.
    :param s: string to split, e.g. a checkpoint file name
    :return: list alternating text fragments (str, even indexes) and digit
             runs converted to int (odd indexes)
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    parts = re.split(r'(\d+)', s)
    # With one capturing group re.split always places the captured digit runs
    # at odd indexes. Converting by position instead of str.isdigit() avoids
    # int() failing on characters such as '²', which isdigit() accepts but
    # neither \d nor int() does.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]
 def get_gpt_model_names():
    """
    List the selectable GPT checkpoints.
    :return: the pretrained checkpoint plus every ``*.ckpt`` file found in
             ``GPT_weight_root``, in natural order (see custom_sort_key)
    """
    gpt_names = [pretrained_gpt_name]
    gpt_names.extend(
        "%s/%s" % (GPT_weight_root, name)
        for name in os.listdir(GPT_weight_root)
        if name.endswith(".ckpt")
    )
    # sorted() returns a new list; the previous code discarded that result and
    # returned the names in arbitrary os.listdir order.
    return sorted(gpt_names, key=custom_sort_key)
 def get_sovits_model_names():
    """
    List the selectable SoVITS checkpoints.
    :return: the pretrained checkpoint plus every ``*.pth`` file found in
             ``SoVITS_weight_root``, in natural order (see custom_sort_key)
    """
    sovits_names = [pretrained_sovits_name]
    sovits_names.extend(
        "%s/%s" % (SoVITS_weight_root, name)
        for name in os.listdir(SoVITS_weight_root)
        if name.endswith(".pth")
    )
    # sorted() returns a new list; the previous code discarded that result and
    # returned the names in arbitrary os.listdir order.
    return sorted(sovits_names, key=custom_sort_key)
--- a/Ref_Audio_Selector/common/time_util.py
+++ b/Ref_Audio_Selector/common/time_util.py
@ -0,0 +1,72 @@
 import time
 import os
 from Ref_Audio_Selector.config_param.log_config import p_logger
 import Ref_Audio_Selector.config_param.config_params as params
 def timeit_decorator(func):
    """
    Decorator that logs how long the wrapped function takes to run.
    Timing is only performed when ``params.time_log_print_type`` equals
    ``'file'``; otherwise the call is forwarded untouched.
    Args:
        func (function): the function to time.
    Returns:
        function: a wrapper with the same name/docstring as ``func`` that
        returns whatever ``func`` returns.
    """
    import functools  # local import: this block must not rely on file-level changes

    # wraps() keeps __name__/__doc__ of the decorated function intact.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if params.time_log_print_type != 'file':
            return func(*args, **kwargs)
        start_time = time.perf_counter()  # high-resolution start mark
        func_result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        # The process id is included in the message written to the performance log.
        log_message = f"进程ID: {os.getpid()}, {func.__name__} 执行耗时: {elapsed_time:.6f} 秒"
        p_logger.info(log_message)
        return func_result
    return wrapper
 def time_monitor(func):
    """
    Decorator that returns the elapsed time together with the result.
    Args:
        func (function): the function to time.
    Returns:
        function: a wrapper with the same name/docstring as ``func`` that
        returns ``(elapsed_seconds, func_result)``.
    """
    import functools  # local import: this block must not rely on file-level changes

    # wraps() keeps __name__/__doc__ of the decorated function intact.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()  # high-resolution start mark
        func_result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        return elapsed_time, func_result
    return wrapper
 # 使用装饰器
@timeit_decorator
 def example_function(n):
    time.sleep(n)  # 假设这是需要计时的函数，这里模拟耗时操作
    return n * 2
 def example_function2(n):
    """Undecorated twin of example_function: sleep ``n`` seconds, return ``n * 2``."""
    time.sleep(n)
    doubled = n * 2
    return doubled
 if __name__ == "__main__":
    # Manual demo: wrap the plain function on the fly and print the
    # (elapsed_seconds, result) tuple produced by time_monitor.
    timed_example = time_monitor(example_function2)
    print(timed_example(2))
--- a/Ref_Audio_Selector/config.ini
+++ b/Ref_Audio_Selector/config.ini
@ -0,0 +1,57 @@
 # config.ini
 [Base]
 # 服务端口号
 server_port = 9423
 # 参考音频目录
 reference_audio_dir = refer_audio
 # 临时文件目录
 temp_dir = Ref_Audio_Selector/temp
 [Log]
 # 日志保存目录路径
 log_dir = Ref_Audio_Selector/log/general
 # 日志级别 CRITICAL、FATAL、ERROR、WARNING、WARN、INFO、DEBUG、NOTSET
 log_level = INFO
 # 函数时间消耗日志打印类型 file 打印到文件; close 关闭
 time_log_print_type = file
 # 函数时间消耗日志保存目录路径
 time_log_print_dir = Ref_Audio_Selector/log/performance
 [AudioSample]
 # list转换待选参考音频目录
 list_to_convert_reference_audio_dir = refer_audio_all
 # 音频相似度目录
 audio_similarity_dir = similarity
 # 是否开启基准音频预采样 true false
 enable_pre_sample = true
 [Inference]
 # 默认测试文本位置
 default_test_text_path = Ref_Audio_Selector/file/test_content/test_content.txt
 # 推理音频目录
 inference_audio_dir = inference_audio
 # 推理音频文本聚合目录
 inference_audio_text_aggregation_dir = text
 # 推理音频情绪聚合目录
 inference_audio_emotion_aggregation_dir = emotion
 [ResultCheck]
 # asr输出文件
 asr_filename = asr
 # 文本相似度输出目录
 text_similarity_output_dir = text_similarity
 # 文本情绪平均相似度报告文件名
 text_emotion_average_similarity_report_filename = average_similarity
 # 文本相似度按情绪聚合明细文件名
 text_similarity_by_emotion_detail_filename = emotion_group_detail
 # 文本相似度按文本聚合明细文件名
 text_similarity_by_text_detail_filename = text_group_detail
 [AudioConfig]
 # 默认模板文件位置
 default_template_path = Ref_Audio_Selector/file/config_template/ref_audio_template.txt
 # 参考音频配置文件名
 reference_audio_config_filename = refer_audio
 [Other]
--- a/Ref_Audio_Selector/config_param/init.py
+++ b/Ref_Audio_Selector/config_param/init.py
--- a/Ref_Audio_Selector/config_param/config_manager.py
+++ b/Ref_Audio_Selector/config_param/config_manager.py
@ -0,0 +1,111 @@
 import configparser
 import os
 import Ref_Audio_Selector.common.common as common
 class ParamReadWriteManager:
    """
    Stores individual parameters as one ``<key>.txt`` file each under ``base_dir``.
    The attributes defined in ``__init__`` are the key names; every attribute
    holds its own name as a string.
    """
    def __init__(self):
        self.base_dir = 'Ref_Audio_Selector/file/base_info'
        os.makedirs(self.base_dir, exist_ok=True)
        # --- basic information ---
        self.work_dir = 'work_dir'
        self.role = 'role'
        # --- step 1 ---
        self.subsection_num = 'subsection_num'
        self.sample_num = 'sample_num'
        # --- step 2 ---
        self.api_set_model_base_url = 'api_set_model_base_url'
        self.api_gpt_param = 'api_gpt_param'
        self.api_sovits_param = 'api_sovits_param'
        self.api_v2_set_gpt_model_base_url = 'api_v2_set_gpt_model_base_url'
        self.api_v2_gpt_model_param = 'api_v2_gpt_model_param'
        self.api_v2_set_sovits_model_base_url = 'api_v2_set_sovits_model_base_url'
        self.api_v2_sovits_model_param = 'api_v2_sovits_model_param'
        self.text_url = 'text_url'
        self.text_param = 'text_param'
        self.refer_type_param = 'refer_type_param'
        self.ref_path_param = 'ref_path_param'
        self.ref_text_param = 'ref_text_param'
        self.emotion_param = 'emotion_param'
        self.test_content_path = 'test_content_path'
        self.request_concurrency_num = 'request_concurrency_num'
        # --- step 3 ---
        self.text_similarity_amplification_boundary = 'text_similarity_amplification_boundary'
        # --- steps 4 and 5 ---
        self.text_template = 'text_template'
    def read(self, key):
        """Return the stripped content stored for ``key``, or '' when no file exists."""
        file_path = os.path.join(self.base_dir, key + '.txt')
        if not os.path.exists(file_path):
            return ''
        return common.read_file(file_path).strip()
    def write(self, key, content):
        """Store ``content`` for ``key``; non-string values are converted with ``str()`` first."""
        file_path = os.path.join(self.base_dir, key + '.txt')
        text = content if isinstance(content, str) else str(content)
        # Leading/trailing whitespace is never persisted.
        common.write_text_to_file(text.strip(), file_path)
 class ConfigManager:
    """
    Thin wrapper around ``configparser`` for ``Ref_Audio_Selector/config.ini``.
    Each ``get_*`` method reads one option from a fixed section and returns it
    as a string; missing sections/options raise the usual configparser errors.
    """
    def __init__(self):
        self.config_path = 'Ref_Audio_Selector/config.ini'
        parser = configparser.ConfigParser()
        parser.read(self.config_path, encoding='utf-8')
        self.config = parser
    def get_base(self, key):
        return self.config.get(section='Base', option=key)
    def get_log(self, key):
        return self.config.get(section='Log', option=key)
    def get_audio_sample(self, key):
        return self.config.get(section='AudioSample', option=key)
    def get_inference(self, key):
        return self.config.get(section='Inference', option=key)
    def get_result_check(self, key):
        return self.config.get(section='ResultCheck', option=key)
    def get_audio_config(self, key):
        return self.config.get(section='AudioConfig', option=key)
    def get_other(self, key):
        return self.config.get(section='Other', option=key)
    def print(self):
        """Dump every section and option to stdout in INI-like form."""
        for section in self.config.sections():
            print(f'[{section}]')
            for key, value in self.config[section].items():
                print(f'{key} = {value}')
            print()
 # Module-level singletons, created once at import time. Constructing them
 # reads config.ini and creates the base_info directory as a side effect.
 _config = ConfigManager()
 _param_read_write_manager = ParamReadWriteManager()
 def get_config():
    """Return the module-level ConfigManager instance created at import time."""
    return _config
 def get_rw_param():
    """Return the module-level ParamReadWriteManager instance created at import time."""
    return _param_read_write_manager
 if __name__ == '__main__':
    # ConfigManager.print() writes to stdout itself and returns None, so it
    # must not be wrapped in print() (that emitted a trailing "None" line).
    _config.print()
--- a/Ref_Audio_Selector/config_param/config_params.py
+++ b/Ref_Audio_Selector/config_param/config_params.py
@ -0,0 +1,58 @@
 import Ref_Audio_Selector.config_param.config_manager as config_manager
 # Flat module of settings read once, at import time, from the shared
 # ConfigManager. Every value is a string unless converted explicitly below.
 config = config_manager.get_config()
 # [Base]
 # Server port number
 server_port = int(config.get_base('server_port'))
 # Reference audio directory
 reference_audio_dir = config.get_base('reference_audio_dir')
 # Temporary file directory
 temp_dir = config.get_base('temp_dir')
 # [Log]
 # Directory where general logs are saved
 log_dir = config.get_log('log_dir')
 # Log level: CRITICAL, FATAL, ERROR, WARNING, WARN, INFO, DEBUG, NOTSET
 log_level = config.get_log('log_level')
 # How function-timing logs are emitted: "file" writes to a file; "close" disables them
 time_log_print_type = config.get_log('time_log_print_type')
 # Directory where function-timing logs are saved
 time_log_print_dir = config.get_log('time_log_print_dir')
 # [AudioSample]
 # Directory of candidate reference audio converted from a list file
 list_to_convert_reference_audio_dir = config.get_audio_sample('list_to_convert_reference_audio_dir')
 # Audio similarity directory
 audio_similarity_dir = config.get_audio_sample('audio_similarity_dir')
 # Whether baseline-audio pre-sampling is enabled: "true" / "false"
 # NOTE(review): this stays a string, so even "false" is truthy - confirm how callers test it
 enable_pre_sample = config.get_audio_sample('enable_pre_sample')
 # [Inference]
 # Location of the default test text
 default_test_text_path = config.get_inference('default_test_text_path')
 # Inference audio directory
 inference_audio_dir = config.get_inference('inference_audio_dir')
 # Sub-directory where inference audio is grouped by text
 inference_audio_text_aggregation_dir = config.get_inference('inference_audio_text_aggregation_dir')
 # Sub-directory where inference audio is grouped by emotion
 inference_audio_emotion_aggregation_dir = config.get_inference('inference_audio_emotion_aggregation_dir')
 # [ResultCheck]
 # ASR output file name
 asr_filename = config.get_result_check('asr_filename')
 # Text similarity output directory
 text_similarity_output_dir = config.get_result_check('text_similarity_output_dir')
 # File name of the per-emotion average text-similarity report
 text_emotion_average_similarity_report_filename = config.get_result_check('text_emotion_average_similarity_report_filename')
 # File name of the text-similarity detail grouped by emotion
 text_similarity_by_emotion_detail_filename = config.get_result_check('text_similarity_by_emotion_detail_filename')
 # File name of the text-similarity detail grouped by text
 text_similarity_by_text_detail_filename = config.get_result_check('text_similarity_by_text_detail_filename')
 # [AudioConfig]
 # Location of the default template file
 default_template_path = config.get_audio_config('default_template_path')
 # File name of the reference audio configuration
 reference_audio_config_filename = config.get_audio_config('reference_audio_config_filename')
--- a/Ref_Audio_Selector/config_param/log_config.py
+++ b/Ref_Audio_Selector/config_param/log_config.py
@ -0,0 +1,65 @@
 import logging
 import os
 import datetime
 import Ref_Audio_Selector.config_param.config_params as params
 def create_general_logger():
    """
    Build the 'general' application logger.
    Records go to the console and to ``<params.log_dir>/<YYYY-MM-DD>.log``
    (appended, UTF-8). The logger level comes from ``params.log_level``.
    The root logger is additionally configured at WARNING level with the same
    file handler so records from other loggers also reach the file.
    :return: the configured ``logging.Logger`` named 'general'
    """
    # The date is fixed when the logger is created; it names the log file.
    current_date = datetime.datetime.now().strftime('%Y-%m-%d')
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    # Console output
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(log_format))
    # Kept from the original code.
    # NOTE(review): StreamHandler does not appear to consult this attribute - confirm.
    console_handler.encoding = 'utf-8'
    os.makedirs(params.log_dir, exist_ok=True)
    # File output
    general_handler = logging.FileHandler(f"{params.log_dir}/{current_date}.log", mode='a', encoding='utf-8')
    general_handler.setFormatter(logging.Formatter(log_format))
    general_logger = logging.getLogger('general')
    general_logger.setLevel(logging.getLevelName(params.log_level))
    general_logger.addHandler(console_handler)
    general_logger.addHandler(general_handler)
    # The same file handler is attached to the root logger below. Without
    # disabling propagation every 'general' record would be handled twice
    # (once here, once via the root) and appear duplicated in the log file.
    general_logger.propagate = False
    # Configure the root logger as a fallback for other loggers.
    logging.basicConfig(level=logging.WARNING, handlers=[general_handler])
    return general_logger
 def create_performance_logger():
    """
    Build the 'performance' logger used for timing records.
    Records are appended to ``<params.time_log_print_dir>/<YYYY-MM-DD>.log``
    (UTF-8) at INFO level.
    :return: the configured ``logging.Logger`` named 'performance'
    """
    # The date is fixed when the logger is created; it names the log file.
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    os.makedirs(params.time_log_print_dir, exist_ok=True)
    log_file = f"{params.time_log_print_dir}/{today}.log"
    file_handler = logging.FileHandler(log_file, mode='a', encoding='utf-8')
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    # NOTE(review): propagation is left at its default, so these records also
    # reach whatever handlers the root logger has - confirm that is intended.
    perf_logger = logging.getLogger('performance')
    perf_logger.setLevel(logging.INFO)
    perf_logger.addHandler(file_handler)
    return perf_logger
 def setup_logging():
    """Create both loggers and return them as ``(general_logger, performance_logger)``."""
    general = create_general_logger()
    performance = create_performance_logger()
    return general, performance
 # Import-time side effect: both loggers (and their log directories/files)
 # are created as soon as this module is first imported.
 logger, p_logger = setup_logging()
--- a/Ref_Audio_Selector/file/base_info/role.txt
+++ b/Ref_Audio_Selector/file/base_info/role.txt
--- a/Ref_Audio_Selector/file/base_info/work_dir.txt
+++ b/Ref_Audio_Selector/file/base_info/work_dir.txt
--- a/Ref_Audio_Selector/file/config_template/ref_audio_template.txt
+++ b/Ref_Audio_Selector/file/config_template/ref_audio_template.txt
@ -0,0 +1,5 @@
 "${emotion}": {
    "ref_wav_path": "${ref_path}",
    "prompt_text": "${ref_text}",
    "prompt_language": "中文"
 }
--- a/Ref_Audio_Selector/file/test_content/test_content.txt
+++ b/Ref_Audio_Selector/file/test_content/test_content.txt
@ -0,0 +1,4 @@
 也是只有一次。”白蓉简单地回答，然后迅速转移话锋，搂住罗辑的脖子说，“算了，我不要那生日礼物了，你也回到正常的生活中来，好吗？”
 云天明看到那是一条丑陋的虫子，软乎乎湿漉漉的，在她白皙的手指间蠕动着，旁边一个女生尖叫道：恶心死了，你碰它干吗？！程心把虫子轻轻放到旁边的草丛中，说，它在这里会给踩死的。
 “那么多的星星，像雾似的。”云天明感叹道。程心把目光从银河收回，转头看着他，指着下面的校园和城市说：“你看下面也很漂亮啊，我们的生活是在这儿，可不是在那么远的银河里。”
 “可我们的专业，不就是为了到地球之外去吗？”“那是为了这里的生活更好，可不是为了逃离地球啊。”云天明当然知道程心的话是委婉地指向他的孤僻和自闭，他也只有默然以对。
--- a/Ref_Audio_Selector/ref_audio_selector_webui.py
+++ b/Ref_Audio_Selector/ref_audio_selector_webui.py
--- a/Ref_Audio_Selector/start_ref_audio_selector_webui.bat
+++ b/Ref_Audio_Selector/start_ref_audio_selector_webui.bat
@ -0,0 +1,5 @@
 CHCP 65001
@echo off 
 REM Code page 65001 (UTF-8) was selected above, before echo was turned off.
 REM Move one directory up from the current working directory.
 REM NOTE(review): assumes the script is started from inside Ref_Audio_Selector - confirm.
 cd ../
 REM Start the web UI with the interpreter at the relative path runtime\python.exe.
 runtime\python.exe ./Ref_Audio_Selector/ref_audio_selector_webui.py
 REM Keep the console window open after the process exits.
 pause
--- a/Ref_Audio_Selector/tool/init.py
+++ b/Ref_Audio_Selector/tool/init.py
--- a/Ref_Audio_Selector/tool/asr/init.py
+++ b/Ref_Audio_Selector/tool/asr/init.py
--- a/Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py
+++ b/Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py
@ -0,0 +1,120 @@
 import argparse
 import os
 import traceback
 import Ref_Audio_Selector.config_param.config_params as params
 os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 import torch
 from faster_whisper import WhisperModel
 from tqdm import tqdm
 from tools.asr.config import check_fw_local_models
 from Ref_Audio_Selector.config_param.log_config import logger
 # Language codes offered as the ``choices`` of the --language command-line
 # option at the bottom of this file. "auto" is not passed to the model:
 # execute_asr_multi_level_dir maps it to None.
 language_code_list = [
    "af", "am", "ar", "as", "az",
    "ba", "be", "bg", "bn", "bo",
    "br", "bs", "ca", "cs", "cy",
    "da", "de", "el", "en", "es",
    "et", "eu", "fa", "fi", "fo",
    "fr", "gl", "gu", "ha", "haw",
    "he", "hi", "hr", "ht", "hu",
    "hy", "id", "is", "it", "ja",
    "jw", "ka", "kk", "km", "kn",
    "ko", "la", "lb", "ln", "lo",
    "lt", "lv", "mg", "mi", "mk",
    "ml", "mn", "mr", "ms", "mt",
    "my", "ne", "nl", "nn", "no",
    "oc", "pa", "pl", "ps", "pt",
    "ro", "ru", "sa", "sd", "si",
    "sk", "sl", "sn", "so", "sq",
    "sr", "su", "sv", "sw", "ta",
    "te", "tg", "th", "tk", "tl",
    "tr", "tt", "uk", "ur", "uz",
    "vi", "yi", "yo", "zh", "yue",
    "auto"]
 def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language, precision):
    """
    Transcribe every ``.wav`` file below ``input_folder`` (recursively) with
    Faster Whisper and write the result list to ``output_folder``.
    Each output line is ``<wav path>|<parent directory name>|<LANG>|<text>``.
    Audio detected as Chinese is transcribed with FunASR instead; the Whisper
    segments are used when that yields no text.
    :param input_folder: root directory to scan
    :param output_folder: directory that receives ``<params.asr_filename>.list``
    :param model_size: model name; a name containing '-local' selects the
                       copy under ``tools/asr/models``
    :param language: language code, or 'auto' to let the model detect it
    :param precision: compute type handed to ``WhisperModel``
    :return: absolute path of the written list file, or None when the model
             cannot be loaded or any file fails (the error is logged)
    """
    if '-local' in model_size:
        model_size = model_size[:-6]  # drop the trailing '-local'
        model_path = f'tools/asr/models/faster-whisper-{model_size}'
    else:
        model_path = model_size
    if language == 'auto':
        language = None  # let the model pick the most probable language
    # Extra positional arguments are %-format arguments in logging; without
    # placeholders the call triggers a formatting error and the message is lost.
    logger.info("loading faster whisper model: %s %s", model_size, model_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model = WhisperModel(model_path, device=device, compute_type=precision)
    except Exception:
        logger.error(traceback.format_exc())
        return None
    output = []
    # Walk the input directory and all of its sub-directories.
    for root, _dirs, files in os.walk(input_folder):
        for file_name in sorted(files):
            # Only .wav files are processed.
            if not file_name.endswith(".wav"):
                continue
            try:
                file_path = os.path.join(root, file_name)
                original_text = os.path.basename(root)
                segments, info = model.transcribe(
                    audio=file_path,
                    beam_size=5,
                    vad_filter=True,
                    vad_parameters=dict(min_silence_duration_ms=700),
                    language=language)
                text = ''
                if info.language == "zh":
                    logger.info("检测为中文文本, 转 FunASR 处理")
                    # Imported lazily so non-Chinese runs never import the FunASR module.
                    # The import binds a local name, so the former
                    # globals() guard was always true and has been dropped.
                    from Ref_Audio_Selector.tool.asr.funasr_asr_multi_level_dir import only_asr
                    text = only_asr(file_path)
                if text == '':
                    for segment in segments:
                        text += segment.text
                output.append(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
                print(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
            except Exception:
                # A failing file aborts the whole run, as before.
                logger.error(traceback.format_exc())
                return None
    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.abspath(f'{output_folder}/{params.asr_filename}.list')
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
 if __name__ == '__main__':
    # Command-line entry point: parse the options and run one transcription pass.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-i", "--input_folder", type=str, required=True,
        help="Path to the folder containing WAV files.")
    arg_parser.add_argument(
        "-o", "--output_folder", type=str, required=True,
        help="Output folder to store transcriptions.")
    arg_parser.add_argument(
        "-s", "--model_size", type=str, default='large-v3',
        choices=check_fw_local_models(),
        help="Model Size of Faster Whisper")
    arg_parser.add_argument(
        "-l", "--language", type=str, default='ja',
        choices=language_code_list,
        help="Language of the audio files.")
    arg_parser.add_argument(
        "-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
        help="fp16 or fp32")
    cli_args = arg_parser.parse_args()
    output_file_path = execute_asr_multi_level_dir(
        cli_args.input_folder,
        cli_args.output_folder,
        cli_args.model_size,
        cli_args.language,
        cli_args.precision,
    )
--- a/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
+++ b/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py
@ -0,0 +1,94 @@
 # -*- coding:utf-8 -*-
 import argparse
 import os
 import traceback
 import Ref_Audio_Selector.config_param.config_params as params
 from Ref_Audio_Selector.config_param.log_config import logger
 from Ref_Audio_Selector.common.time_util import timeit_decorator
 from tqdm import tqdm
 from funasr import AutoModel
 # Preferred local copies of the ASR, VAD and punctuation models.
 path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
 path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
 path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
 # Fall back to the "iic/..." identifiers when a local directory is missing.
 # NOTE(review): presumably these are hub model ids that AutoModel downloads - confirm.
 path_asr = path_asr if os.path.exists(
    path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
 path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
 path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
 # Import-time side effect: the model is constructed as soon as this module
 # is imported and is shared by only_asr and execute_asr_multi_level_dir.
 model = AutoModel(
    model=path_asr,
    model_revision="v2.0.4",
    vad_model=path_vad,
    vad_model_revision="v2.0.4",
    punc_model=path_punc,
    punc_model_revision="v2.0.4",
 )
 def only_asr(input_file):
    """
    Transcribe a single audio file with the module-level FunASR model.
    :param input_file: path of the audio file
    :return: the recognised text, or '' when recognition fails (the traceback
             is logged)
    """
    try:
        text = model.generate(input=input_file)[0]["text"]
    except Exception:
        # Best effort: report the failure and let the caller fall back.
        # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still propagate.
        text = ''
        logger.error(traceback.format_exc())
    return text
@timeit_decorator
 def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language):
    output = []
    # 递归遍历输入目录及所有子目录
    for root, dirs, files in os.walk(input_folder):
        for name in sorted(files):
            # 只处理wav文件（假设是wav文件）
            if name.endswith(".wav"):
                try:
                    original_text = os.path.basename(root)
                    # 构造完整的输入音频文件路径
                    input_file_path = os.path.join(root, name)
                    input_file_path = os.path.normpath(input_file_path)  # 先标准化可能存在混合斜杠的情况
                    asr_text = model.generate(input=input_file_path)[0]["text"]
                    output.append(f"{input_file_path}|{original_text}|{language.upper()}|{asr_text}")
                except:
                    logger.error(traceback.format_exc())
    # 创建或打开指定的输出目录
    output_folder = output_folder
    output_dir_abs = os.path.abspath(output_folder)
    os.makedirs(output_dir_abs, exist_ok=True)
    # 构造输出文件路径
    output_file_path = os.path.join(output_dir_abs, f'{params.asr_filename}.list')
    # 将输出写入文件
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
 if __name__ == '__main__':
    # Command-line entry point: parse the options and run one transcription pass.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-i", "--input_folder", type=str, required=True,
        help="Path to the folder containing WAV files.")
    arg_parser.add_argument(
        "-o", "--output_folder", type=str, required=True,
        help="Output folder to store transcriptions.")
    arg_parser.add_argument(
        "-s", "--model_size", type=str, default='large',
        help="Model Size of FunASR is Large")
    arg_parser.add_argument(
        "-l", "--language", type=str, default='zh', choices=['zh'],
        help="Language of the audio files.")
    # Parsed but not forwarded to the transcription call below.
    arg_parser.add_argument(
        "-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
        help="fp16 or fp32")
    cli_args = arg_parser.parse_args()
    execute_asr_multi_level_dir(
        cli_args.input_folder,
        cli_args.output_folder,
        cli_args.model_size,
        cli_args.language,
    )
--- a/Ref_Audio_Selector/tool/audio_check.py
+++ b/Ref_Audio_Selector/tool/audio_check.py
@ -0,0 +1,54 @@
 import os
 import shutil
 import Ref_Audio_Selector.common.common as common
 import Ref_Audio_Selector.config_param.config_params as params
 from Ref_Audio_Selector.config_param.log_config import logger
 def remove_matching_audio_files_in_text_dir(text_dir, emotions_list):
    """
    Delete every ``.wav`` file below ``text_dir`` whose name (without the
    extension) is not the ``'emotion'`` value of any entry in ``emotions_list``.
    Deletion errors are logged and do not stop the scan.
    :return: number of files actually deleted
    """
    known_emotions = [entry['emotion'] for entry in emotions_list]
    removed = 0
    for current_dir, _subdirs, file_names in os.walk(text_dir):
        for file_name in file_names:
            if not file_name.endswith(".wav"):
                continue
            # Strip the 4-character ".wav" suffix to get the emotion tag.
            emotion_tag = os.path.basename(file_name)[:-4]
            if emotion_tag in known_emotions:
                continue
            target = os.path.join(current_dir, file_name)
            logger.info(f"Deleting file: {target}")
            try:
                os.remove(target)
                removed += 1
            except Exception as e:
                logger.error(f"Error deleting file {target}: {e}")
    return removed
 def delete_emotion_subdirectories(emotion_dir, emotions_list):
    """
    Delete every direct sub-directory of ``emotion_dir`` whose name is not the
    ``'emotion'`` value of any entry in ``emotions_list``.
    Deletion errors are logged and do not stop the scan.
    :return: number of directories actually deleted
    """
    known_emotions = [entry['emotion'] for entry in emotions_list]
    removed = 0
    for child in os.listdir(emotion_dir):
        child_path = os.path.join(emotion_dir, child)
        # Plain files and directories that are still referenced are kept.
        if not os.path.isdir(child_path) or child in known_emotions:
            continue
        logger.info(f"Deleting directory: {child_path}")
        try:
            # rmtree removes the directory together with everything inside it.
            shutil.rmtree(child_path)
            removed += 1
        except Exception as e:
            logger.error(f"Error deleting directory {child_path}: {e}")
    return removed
 def sync_ref_audio(ref_audio_dir, inference_audio_dir):
    """
    Remove inference audio that no longer corresponds to an entry of the
    reference audio list read from ``ref_audio_dir``.
    :return: ``(deleted_wav_count, deleted_emotion_dir_count)``
    """
    ref_list = common.RefAudioListManager(ref_audio_dir).get_ref_audio_list()
    # Clean the per-text tree first, then the per-emotion tree.
    text_dir = os.path.join(inference_audio_dir, params.inference_audio_text_aggregation_dir)
    delete_text_wav_num = remove_matching_audio_files_in_text_dir(text_dir, ref_list)
    emotion_dir = os.path.join(inference_audio_dir, params.inference_audio_emotion_aggregation_dir)
    delete_emotion_dir_num = delete_emotion_subdirectories(emotion_dir, ref_list)
    return delete_text_wav_num, delete_emotion_dir_num
--- a/Ref_Audio_Selector/tool/audio_config.py
+++ b/Ref_Audio_Selector/tool/audio_config.py
@ -0,0 +1,31 @@
 import os
 import platform
 def generate_audio_config(work_space_dir, template_str, audio_list, output_file_path):
    """
    Render ``template_str`` once per entry of ``audio_list`` and write the
    results, joined by ",\\n", to ``output_file_path`` (UTF-8).
    The placeholders ``${emotion}``, ``${ref_path}`` and ``${ref_text}`` are
    replaced in that order; ``ref_path`` is written relative to
    ``work_space_dir`` with forward slashes.
    """
    on_windows = platform.system() == 'Windows'
    rendered_entries = []
    for audio_info in audio_list:
        emotion = audio_info['emotion']
        ref_path = audio_info['ref_path']
        ref_text = audio_info['ref_text']
        relative_path = os.path.relpath(ref_path, work_space_dir)
        if on_windows:
            relative_path = relative_path.replace('\\', '/')
        entry = template_str.replace('${emotion}', emotion)
        entry = entry.replace('${ref_path}', relative_path)
        entry = entry.replace('${ref_text}', ref_text)
        rendered_entries.append(entry)
    # Joining puts the separator only between entries, so there is no
    # trailing comma/newline to trim afterwards.
    file_content = ",\n".join(rendered_entries)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(file_content)
--- a/Ref_Audio_Selector/tool/audio_inference.py
+++ b/Ref_Audio_Selector/tool/audio_inference.py
@ -0,0 +1,238 @@
 import time
 import os
 import requests
 import itertools
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor
 import numpy as np
 import Ref_Audio_Selector.config_param.config_params as params
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, quote
 from Ref_Audio_Selector.config_param.log_config import logger, p_logger
 class SetModelURLComposer:
    """
    Builds the request used to switch models on the inference API.
    ``type`` is 'gpt', 'sovits' or 'all' and decides which parameter names are
    required and how many entries of ``value_array`` are consumed.
    """
    def __init__(self, type, base_url, gpt_param_name, sovits_param_name):
        self.type = type
        self.base_url = base_url
        self.gpt_param_name = gpt_param_name
        self.sovits_param_name = sovits_param_name
    @staticmethod
    def _is_blank(value):
        # None and the empty string both count as "not provided".
        return value is None or value == ''
    def _collect_model_params(self, value_array):
        # 'gpt'/'sovits' take value_array[0]; 'all' takes [0] for GPT and [1] for SoVITS.
        collected = {}
        if self.type in ('gpt', 'all'):
            collected[self.gpt_param_name] = value_array[0]
        if self.type == 'sovits':
            collected[self.sovits_param_name] = value_array[0]
        elif self.type == 'all':
            collected[self.sovits_param_name] = value_array[1]
        return collected
    def is_valid(self):
        """Raise ``Exception`` when the URL or a required parameter name is missing."""
        if self._is_blank(self.base_url):
            raise Exception("请求地址不能为空")
        if self.type in ['gpt', 'all'] and self._is_blank(self.gpt_param_name):
            raise Exception("GPT参数名不能为空")
        if self.type in ['sovits', 'all'] and self._is_blank(self.sovits_param_name):
            raise Exception("Sovits参数名不能为空")
    def build_get_url(self, value_array, need_url_encode=True):
        """Return the base URL with the model parameters appended as a query string."""
        query = self._collect_model_params(value_array)
        return append_params_to_url(self.base_url, query, need_url_encode)
    def build_post_url(self, value_array, need_url_encode=True):
        """Return ``(url, params)``; the parameters are not appended to the URL."""
        url = append_params_to_url(self.base_url, {}, need_url_encode)
        return url, self._collect_model_params(value_array)
 class TTSURLComposer:
    """
    Builds text-to-speech request URLs, addressing the voice either by emotion
    name or by reference audio path plus reference text.
    """
    def __init__(self, base_url, refer_type_param, emotion_param_name, text_param_name, ref_path_param_name, ref_text_param_name):
        self.base_url = base_url
        # Selects the addressing mode: character emotion or reference audio.
        self.refer_type_param = refer_type_param
        self.emotion_param_name = emotion_param_name
        self.text_param_name = text_param_name
        self.ref_path_param_name = ref_path_param_name
        self.ref_text_param_name = ref_text_param_name
    def is_valid(self):
        """Raise ``ValueError`` when the URL, the text parameter name or all reference parameter names are missing."""
        if self.base_url is None or self.base_url == '':
            raise ValueError("请输入url")
        if self.text_param_name is None or self.text_param_name == '':
            raise ValueError("请输入text参数名")
        reference_names = (self.emotion_param_name, self.ref_path_param_name, self.ref_text_param_name)
        # Only None counts as missing here; empty strings are accepted.
        if all(name is None for name in reference_names):
            raise ValueError("请输入至少一个参考or情绪的参数")
    def is_emotion(self):
        """Return True when requests are addressed by emotion name."""
        return self.refer_type_param == '角色情绪'
    def build_url_with_emotion(self, text_value, emotion_value, need_url_encode=True):
        query = {}
        query[self.text_param_name] = text_value
        query[self.emotion_param_name] = emotion_value
        return append_params_to_url(self.base_url, query, need_url_encode)
    def build_url_with_ref(self, text_value, ref_path_value, ref_text_value, need_url_encode=True):
        query = {}
        query[self.text_param_name] = text_value
        query[self.ref_path_param_name] = ref_path_value
        query[self.ref_text_param_name] = ref_text_value
        return append_params_to_url(self.base_url, query, need_url_encode)
 def append_params_to_url(url_with_params, params, need_url_encode):
    """
    Append ``params`` to ``url_with_params`` as ``key=value`` pairs.
    The first separator is '?' unless the URL already contains one, in which
    case '&' is used. When ``need_url_encode`` is true the resulting URL is
    passed through safe_encode_query_params before being returned.
    """
    if params:
        pairs = [f"{k}={v}" for k, v in params.items()]
        separator = '&' if '?' in url_with_params else '?'
        url_with_params = url_with_params + separator + '&'.join(pairs)
    if need_url_encode:
        return safe_encode_query_params(url_with_params)
    return url_with_params
 def safe_encode_query_params(original_url):
    """
    Return ``original_url`` with its query string percent-encoded.
    Only the first value of a repeated parameter is kept. The encoded URL is
    also written to the general log.
    :param original_url: URL whose query part may contain raw characters
    :return: the URL with an encoded query string
    """
    # Split the URL and decode the query part into {name: [values]}.
    parsed_url = urlparse(original_url)
    query_params = parse_qs(parsed_url.query)
    first_values = {k: v[0] for k, v in query_params.items()}
    # Encode exactly once. The previous code ran quote() on each value and then
    # urlencode() on the result, which turned every '%' into '%25'
    # (e.g. '你' became '%25E4%25BD%25A0' instead of '%E4%BD%A0').
    # quote_via=quote keeps spaces as '%20' rather than '+'.
    new_query_string = urlencode(first_values, doseq=False, quote_via=quote)
    # Rebuild the full URL around the new query string.
    encoded_url = urlunparse(parsed_url._replace(query=new_query_string))
    logger.info(encoded_url)
    return encoded_url
 def generate_audio_files_parallel(url_composer, text_list, emotion_list, output_dir_path, num_processes=1):
    """
    Split ``emotion_list`` into ``num_processes`` groups and run
    generate_audio_files_for_emotion_group for each group in its own worker
    process. Returns once every worker has finished; a worker's exception is
    re-raised here.
    """
    # array_split tolerates a length that is not divisible by num_processes.
    emotion_groups = np.array_split(emotion_list, num_processes)
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        pending = []
        for group in emotion_groups:
            task = executor.submit(
                generate_audio_files_for_emotion_group, url_composer, text_list, group, output_dir_path)
            pending.append(task)
        # Block until every worker is done.
        for task in pending:
            task.result()
 def generate_audio_files_for_emotion_group(url_composer, text_list, emotion_list, output_dir_path):
    """Synthesize audio for every (text, emotion) pair and store it twice.

    Each result is written to ``<text dir>/<text>/<emotion>.wav`` and to
    ``<emotion dir>/<emotion>/<text>.wav`` under ``output_dir_path``. Pairs
    whose two files both exist already are counted as done and skipped.
    Progress and a final timing summary are logged.

    Args:
        url_composer: Builds the request URL (emotion mode or reference mode).
        text_list: Texts to synthesize.
        emotion_list: Entries with an ``'emotion'`` key and, in reference mode,
            ``'ref_path'`` and ``'ref_text'`` keys.
        output_dir_path: Root directory for the generated files.
    """
    started_at = time.perf_counter()  # high-resolution start timestamp

    # Make sure the root directory and the two aggregation directories exist.
    root_dir = os.path.abspath(output_dir_path)
    os.makedirs(root_dir, exist_ok=True)
    by_text_root = os.path.join(root_dir, params.inference_audio_text_aggregation_dir)
    os.makedirs(by_text_root, exist_ok=True)
    by_emotion_root = os.path.join(root_dir, params.inference_audio_emotion_aggregation_dir)
    os.makedirs(by_emotion_root, exist_ok=True)

    total_pairs = len(text_list) * len(emotion_list)
    finished = 0
    total_chars = sum(len(item) for item in text_list)

    # Walk the Cartesian product of texts and emotions.
    for text, emotion in itertools.product(text_list, emotion_list):
        emotion_name = emotion['emotion']

        text_dir = os.path.join(by_text_root, text)
        os.makedirs(text_dir, exist_ok=True)
        by_text_file = os.path.join(text_dir, emotion_name + '.wav')

        emotion_dir = os.path.join(by_emotion_root, emotion_name)
        os.makedirs(emotion_dir, exist_ok=True)
        by_emotion_file = os.path.join(emotion_dir, text + '.wav')

        # Only call the API when at least one of the two target files is missing.
        if not (os.path.exists(by_text_file) and os.path.exists(by_emotion_file)):
            if url_composer.is_emotion():
                real_url = url_composer.build_url_with_emotion(text, emotion_name, False)
            else:
                real_url = url_composer.build_url_with_ref(text, emotion['ref_path'], emotion['ref_text'], False)
            audio_bytes = inference_audio_from_api(real_url)
            # Store the same bytes under both aggregation layouts.
            for target in (by_text_file, by_emotion_file):
                with open(target, 'wb') as f:
                    f.write(audio_bytes)

        finished += 1
        logger.info(f"进程ID: {os.getpid()}, 进度: {finished}/{total_pairs}")

    elapsed_time = time.perf_counter() - started_at
    # Timing summary goes to both the performance logger and the regular logger.
    log_message = f"进程ID: {os.getpid()}, generate_audio_files_for_emotion_group 执行耗时: {elapsed_time:.6f} 秒；推理数量: {finished}； 字符总数：{total_chars}；每秒推理字符数：{total_chars*len(emotion_list) / elapsed_time:.3f}；"
    p_logger.info(log_message)
    logger.info(log_message)
 def inference_audio_from_api(url):
    """Fetch synthesized audio from the inference API with a GET request.

    Args:
        url: Fully built request URL.

    Returns:
        bytes: The response body when the server answers with status 200.

    Raises:
        Exception: For any other status code. The message contains the status
            code and the response body (decoded JSON when possible, raw text
            otherwise).
    """
    logger.info(f'inference_audio_from_api url: {url}')
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        # Reading .content pulls the whole body even though stream=True.
        return response.content
    # An error body is not guaranteed to be JSON (e.g. an HTML error page).
    # Fall back to the raw text so the status code is not hidden behind a
    # JSON decoding error.
    try:
        detail = response.json()
    except ValueError:
        detail = response.text
    raise Exception(f"Failed to fetch audio from API. Server responded with status code {response.status_code}.message: {detail}")
 def start_api_set_model(set_model_url_composer, gpt_models, sovits_models):
    """Ask the API service to switch GPT and SoVITS models with one POST.

    Args:
        set_model_url_composer: Composer whose ``build_post_url`` returns the
            URL and the JSON body for the request.
        gpt_models: GPT model value passed to the composer.
        sovits_models: SoVITS model value passed to the composer.

    Returns:
        str: The response text on status 200, otherwise a failure message
        containing the status code.
    """
    url, post_body = set_model_url_composer.build_post_url([gpt_models, sovits_models], True)
    logger.info(f'set_model_url_composer url: {set_model_url_composer}')
    logger.info(f'start_api_set_model url: {url}')
    logger.info(f'start_api_set_model post_body: {post_body}')
    response = requests.post(url, json=post_body)
    if response.status_code != 200:
        return f'请求失败，状态码：{response.status_code}'
    return response.text
 def start_api_v2_set_gpt_model(set_model_url_composer, gpt_models):
    """Ask the v2 API service to switch the GPT model with a GET request.

    Args:
        set_model_url_composer: Composer whose ``build_get_url`` returns the URL.
        gpt_models: GPT model value passed to the composer.

    Returns:
        str: The response text on status 200, otherwise a failure message
        containing the status code.
    """
    url = set_model_url_composer.build_get_url([gpt_models], False)
    logger.info(f'start_api_v2_set_gpt_model url: {url}')
    response = requests.get(url)
    if response.status_code != 200:
        return f'请求失败，状态码：{response.status_code}'
    return response.text
 def start_api_v2_set_sovits_model(set_model_url_composer, sovits_models):
    """Ask the v2 API service to switch the SoVITS model with a GET request.

    Args:
        set_model_url_composer: Composer whose ``build_get_url`` returns the URL.
        sovits_models: SoVITS model value passed to the composer.

    Returns:
        str: The response text on status 200, otherwise a failure message
        containing the status code.
    """
    url = set_model_url_composer.build_get_url([sovits_models], False)
    logger.info(f'start_api_v2_set_sovits_model url: {url}')
    response = requests.get(url)
    if response.status_code != 200:
        return f'请求失败，状态码：{response.status_code}'
    return response.text
--- a/Ref_Audio_Selector/tool/audio_sample.py
+++ b/Ref_Audio_Selector/tool/audio_sample.py
@ -0,0 +1,162 @@
 import os
 import shutil
 import random
 import librosa
 from Ref_Audio_Selector.config_param.log_config import logger
 def check_audio_duration(path, min_duration=3, max_duration=10):
    """Tell whether an audio file's duration lies within a closed range.

    Args:
        path: Path of the audio file.
        min_duration: Lower bound in seconds (inclusive).
        max_duration: Upper bound in seconds (inclusive).

    Returns:
        ``True`` or ``False`` for a readable file, ``None`` when the file
        cannot be processed (the error is logged).
    """
    try:
        # Duration of the file in seconds.
        seconds = librosa.get_duration(filename=path)
        within_range = bool(min_duration <= seconds <= max_duration)
    except Exception as e:
        logger.error(f"无法打开或处理音频文件：{e}")
        return None
    return within_range
 def convert_from_list(list_file, output_dir):
    """Copy audio files named in a ``.list`` file, renamed by transcription.

    Every line must have four ``|``-separated fields; the first is the audio
    path and the last is the transcription. Audio that passes
    ``check_audio_duration`` is copied to ``output_dir`` as
    ``<transcription>.wav``. Existing targets are never overwritten, and
    malformed lines, missing sources and per-file errors are logged and skipped.

    Args:
        list_file: Path of the ``.list`` file (read as UTF-8).
        output_dir: Destination directory; created when missing.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(list_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line in lines:
        fields = line.strip().split('|')
        if len(fields) != 4:
            logger.error(f"Line format incorrect: {line}")
            continue
        audio_path, transcription = fields[0], fields[3]

        # The transcription text becomes the new file name.
        new_path = os.path.join(output_dir, transcription.strip() + '.wav')
        if os.path.exists(new_path):
            logger.info(f"File already exists: {new_path}")
            continue

        try:
            if not os.path.exists(audio_path):
                logger.info(f"Audio file does not exist: {audio_path}")
                continue
            if not check_audio_duration(audio_path):
                logger.info(f"File skipped due to duration: {audio_path}")
                continue
            shutil.copy2(audio_path, new_path)
            logger.info(f"File copied and renamed to: {new_path}")
        except Exception as e:
            logger.error(f"An error occurred while processing: {audio_path}")
            logger.error(e)

    logger.info("Processing complete.")
 def sample(output_audio_dir, similarity_list, subsection_num, sample_num):
    """Randomly sample audio from consecutive score bands into sub-folders.

    ``similarity_list`` is sorted in place by descending ``'score'`` and cut
    into ``subsection_num`` consecutive sections. Up to ``sample_num`` randomly
    chosen entries of section ``i`` are copied into
    ``<output_audio_dir>/emotion_<i + 1>``.

    Args:
        output_audio_dir: Root directory for the ``emotion_N`` sub-folders.
        similarity_list: Dicts with ``'score'`` and ``'wav_path'`` keys.
        subsection_num: Number of sections.
        sample_num: Maximum number of files taken from each section.
    """
    similarity_list.sort(key=lambda x: x['score'], reverse=True)

    # Section length is the list length divided by the section count, rounded up.
    total = len(similarity_list)
    step, remainder = divmod(total, subsection_num)
    if remainder != 0:
        step += 1

    for index in range(subsection_num):
        start = index * step
        stop = min(start + step, total)  # the last section may be shorter
        bucket = similarity_list[start:stop]

        # Shuffle a copy of the section, then keep its first entries.
        random.shuffle(bucket)
        chosen = bucket[:min(sample_num, len(bucket))]

        subdir_path = os.path.join(output_audio_dir, f'emotion_{index + 1}')
        os.makedirs(subdir_path, exist_ok=True)

        for item in chosen:
            src_path = item['wav_path']
            shutil.copyfile(src_path, os.path.join(subdir_path, os.path.basename(src_path)))

    logger.info("Sampling completed.")
 def parse_similarity_file(file_path):
    """Parse a similarity report into a list of score/path records.

    Every non-blank line is expected to look like ``<score>|<wav path>``.
    Blank lines are ignored, and only the first ``|`` separates the score from
    the path, so a path that itself contains ``|`` is kept intact.

    Args:
        file_path (str): Path of the report file (read as UTF-8).

    Returns:
        list[dict]: One ``{'score': float, 'wav_path': str}`` dict per
        non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line has no ``|`` separator or its score
            cannot be converted to ``float``.
    """
    result_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A stray empty line must not abort the whole parse.
                continue
            score, filepath = line.split('|', 1)
            result_list.append({
                'score': float(score),
                'wav_path': filepath
            })
    return result_list
 def copy_and_move(output_audio_directory, similarity_scores):
    """Copy scored audio files into a directory with score-prefixed names.

    Each file is copied as ``<score * 10000, no decimals>-<name>.wav``, where
    ``<name>`` is the source base name without its last four characters.

    Args:
        output_audio_directory: Destination directory; created when missing.
        similarity_scores: Dicts with ``'score'`` and ``'wav_path'`` keys.
    """
    if not os.path.exists(output_audio_directory):
        os.makedirs(output_audio_directory)

    for entry in similarity_scores:
        source = entry['wav_path']
        stem = os.path.basename(source)[:-4]  # drops the ".wav" suffix
        scaled = entry['score'] * 10000
        target = os.path.join(output_audio_directory, f"{scaled:04.0f}-{stem}.wav")
        shutil.copyfile(source, target)

    logger.info("已完成复制和重命名操作。")
 if __name__ == '__main__':
    # Manual run: parse one similarity report from a hard-coded local path and
    # sample 4 files from each of 10 score sections into the output folder.
    similarity_list = parse_similarity_file("D:/tt/similarity/啊，除了伊甸和樱，竟然还有其他人会提起我？.txt")
    sample('D:/tt/similarity/output', similarity_list, 10, 4)
--- a/Ref_Audio_Selector/tool/speaker_verification/init.py
+++ b/Ref_Audio_Selector/tool/speaker_verification/init.py
--- a/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
+++ b/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py
@ -0,0 +1,142 @@
 import argparse
 import os
 import torchaudio
 import torchaudio.transforms as T
 import platform
 import Ref_Audio_Selector.config_param.config_params as params
 import Ref_Audio_Selector.config_param.log_config as log_config
 from Ref_Audio_Selector.common.time_util import timeit_decorator
 from Ref_Audio_Selector.common.model_manager import speaker_verification_models as models
 from modelscope.pipelines import pipeline
 def init_model(model_type='speech_campplus_sv_zh-cn_16k-common'):
    """Create the speaker-verification pipeline for ``model_type``.

    Args:
        model_type: Key into the ``models`` table; the entry supplies the
            ``task``, ``model`` and ``model_revision`` pipeline arguments.

    Returns:
        The object returned by ``pipeline(...)``.
    """
    log_config.logger.info(f'人声识别模型类型：{model_type}')
    spec = models[model_type]
    return pipeline(
        task=spec['task'],
        model=spec['model'],
        model_revision=spec['model_revision'],
    )
@timeit_decorator
def compare_audio_and_generate_report(reference_audio_path, comparison_dir_path, output_file_path, model_type):
    """Score every ``.wav`` in a directory against a reference and write a report.

    The report holds one ``<score>|<path>`` line per compared file, sorted by
    descending score, and is written as UTF-8.

    Args:
        reference_audio_path: Path of the reference audio.
        comparison_dir_path: Directory whose ``.wav`` files are compared.
        output_file_path: Path of the report file to write.
        model_type: Model key forwarded to ``init_model``.
    """
    sv_pipeline = init_model(model_type)

    # Step 1: collect the paths of all .wav files in the comparison directory.
    candidates = []
    for name in os.listdir(comparison_dir_path):
        if name.endswith('.wav'):
            candidates.append(os.path.join(comparison_dir_path, name))

    # Original author's note: the model was trained on 16 kHz audio, so the
    # reference is resampled once up front instead of on every comparison.
    # Windows does not support torchaudio.sox_effects.apply_effects_tensor, so
    # the dependency's resampling was patched to use
    # torchaudio.transforms.Resample. Pre-sampling here on a platform whose
    # dependency still uses a different resampling method would make the
    # similarity scores inaccurate, hence the Windows-only opt-in.
    reference = reference_audio_path
    if platform.system() == 'Windows' and params.enable_pre_sample == 'true':
        reference = ensure_16k_wav(reference_audio_path)

    # Step 2: compare the reference with each candidate and keep score + path.
    total = len(candidates)
    results = []
    for done, audio_path in enumerate(candidates, start=1):
        score = sv_pipeline([reference, audio_path])['score']
        results.append({'score': score, 'path': audio_path})
        log_config.logger.info(f'进度：{done}/{total}')

    # Step 3: highest similarity first.
    results.sort(key=lambda x: x['score'], reverse=True)

    # Step 4: create the output file when it does not exist yet.
    if not os.path.exists(output_file_path):
        open(output_file_path, 'w').close()

    # Step 5: write one "<score>|<path>" entry per line.
    report = '\n'.join(f'{item["score"]}|{item["path"]}' for item in results)
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(report)
 def ensure_16k_wav(audio_file_path, target_sample_rate=16000):
    """Return a path to the audio at ``target_sample_rate``, resampling if needed.

    When the file already has the target sample rate its own path is returned.
    Otherwise the audio is resampled and saved under ``params.temp_dir`` with
    the same base name, and that path is returned.

    Args:
        audio_file_path (str): Path of the input audio file.
        target_sample_rate (int, optional): Desired sample rate in Hz.

    Returns:
        str: Path of an audio file with the target sample rate.
    """
    waveform, sample_rate = torchaudio.load(audio_file_path)
    if sample_rate == target_sample_rate:
        # Nothing to do; hand back the original file.
        return audio_file_path

    resampled = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)(waveform)
    # The resampled copy goes into the temp directory under the original name.
    os.makedirs(params.temp_dir, exist_ok=True)
    temp_file_path = os.path.join(params.temp_dir, os.path.basename(audio_file_path))
    torchaudio.save(temp_file_path, resampled, target_sample_rate)
    return temp_file_path
 def parse_arguments():
    """Parse the command-line options of the voice-similarity script.

    Returns:
        argparse.Namespace: With the required string attributes
        ``reference_audio``, ``comparison_dir``, ``output_file`` and
        ``model_type``.
    """
    parser = argparse.ArgumentParser(description="Audio processing script arguments")
    # (short flag, long flag, help text); every option is a required string.
    options = (
        ("-r", "--reference_audio", "Path to the reference WAV file."),
        ("-c", "--comparison_dir", "Path to the directory containing comparison WAV files."),
        ("-o", "--output_file", "Path to the output file where results will be written."),
        ("-m", "--model_type", "Path to the model type."),
    )
    for short_flag, long_flag, help_text in options:
        parser.add_argument(short_flag, long_flag, type=str, required=True, help=help_text)
    return parser.parse_args()
 if __name__ == '__main__':
    # Command-line entry point: run the comparison with the parsed options.
    cmd = parse_arguments()
    compare_audio_and_generate_report(
        reference_audio_path=cmd.reference_audio,
        comparison_dir_path=cmd.comparison_dir,
        output_file_path=cmd.output_file,
        model_type=cmd.model_type,
    )
    # Commented-out invocation with hard-coded local paths (it passes no
    # model_type) -- presumably kept for manual testing.
    # compare_audio_and_generate_report(
    #     reference_audio_path="D:/tt/渡鸦/refer_audio_all/也对，你的身份和我们不同吗？.wav",
    #     comparison_dir_path='D:/tt/渡鸦/refer_audio_all',
    #     output_file_path='D:/tt/渡鸦/test.txt',
    # )
--- a/Ref_Audio_Selector/tool/text_check.py
+++ b/Ref_Audio_Selector/tool/text_check.py
@ -0,0 +1,77 @@
 import os
 import Ref_Audio_Selector.common.common as common
 import Ref_Audio_Selector.tool.audio_check as audio_check
 from Ref_Audio_Selector.config_param.log_config import logger
 def parse_text_similarity_result_txt(file_path):
    """Parse a text-similarity report file.

    Each line has the form ``<average_similarity_score>|<count>|<emotion>``.
    Lines that do not have exactly three ``|``-separated fields are ignored;
    lines whose numbers cannot be converted are logged and skipped.

    :param file_path: path of the txt file (read as UTF-8)
    :return: list of dicts with the keys ``average_similarity_score`` (float),
        ``count`` (int) and ``emotion`` (str)
    """
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('|')
            if len(fields) != 3:
                continue
            score_text, count_text, emotion = fields
            try:
                data_list.append({
                    'average_similarity_score': float(score_text),
                    'count': int(count_text),
                    'emotion': emotion,
                })
            except ValueError as e:
                # Conversion failed: report the line and move on.
                logger.error(f"Error parsing line: {line.strip()} - {e}")
    return data_list
 def remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary):
    """Delete reference audio whose emotion scored below the boundary.

    :param ref_audio_list: list of dicts with ``emotion`` and ``ref_path`` keys
    :param report_list: list of dicts with ``average_similarity_score`` and
        ``emotion`` keys
    :param audio_text_similarity_boundary: reports strictly below this average
        score trigger deletion
    :return: number of files actually deleted
    """
    deleted_count = 0

    # Reports whose average similarity is under the threshold.
    rejected = [entry for entry in report_list
                if entry['average_similarity_score'] < audio_text_similarity_boundary]

    for entry in rejected:
        emotion = entry['emotion']
        # Reference audio entries that carry the same emotion.
        same_emotion = [ref for ref in ref_audio_list if ref['emotion'] == emotion]
        for ref in same_emotion:
            ref_path = ref['ref_path']
            if not os.path.exists(ref_path):
                logger.error(f"File not found: {ref_path}")
                continue
            try:
                os.remove(ref_path)
                deleted_count += 1
                logger.info(f"Deleted file: {ref_path}")
            except Exception as e:
                logger.error(f"Error deleting file {ref_path}: {e}")

    return deleted_count
 def delete_ref_audio_below_boundary(ref_audio_path, text_similarity_result_path, sync_inference_audio_dir,
                                     audio_text_similarity_boundary):
    """Delete low-similarity reference audio, then sync the inference audio.

    Args:
        ref_audio_path: Location handed to ``common.RefAudioListManager``.
        text_similarity_result_path: Report parsed with
            ``parse_text_similarity_result_txt``.
        sync_inference_audio_dir: Directory passed to
            ``audio_check.sync_ref_audio`` after the deletion.
        audio_text_similarity_boundary: Threshold forwarded to
            ``remove_low_similarity_files``.

    Returns:
        The number of deleted reference audio files.
    """
    manager = common.RefAudioListManager(ref_audio_path)
    ref_audio_list = manager.get_ref_audio_list()
    report_list = parse_text_similarity_result_txt(text_similarity_result_path)
    deleted = remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary)
    audio_check.sync_ref_audio(ref_audio_path, sync_inference_audio_dir)
    return deleted
--- a/Ref_Audio_Selector/tool/text_comparison/init.py
+++ b/Ref_Audio_Selector/tool/text_comparison/init.py
--- a/Ref_Audio_Selector/tool/text_comparison/asr_text_process.py
+++ b/Ref_Audio_Selector/tool/text_comparison/asr_text_process.py
@ -0,0 +1,161 @@
 import os
 import argparse
 from collections import defaultdict
 from operator import itemgetter
 from Ref_Audio_Selector.common.time_util import timeit_decorator
 import Ref_Audio_Selector.tool.text_comparison.text_comparison as text_comparison
 import Ref_Audio_Selector.config_param.config_params as params
 import Ref_Audio_Selector.common.common as common
 from Ref_Audio_Selector.config_param.log_config import logger
 def parse_asr_file(file_path):
    """Parse an ASR result file into a list of records.

    Every line must consist of exactly four ``|``-separated fields: input file
    path, original text, language and ASR text. The emotion is derived from the
    input file path via ``common.get_filename_without_extension``.

    Args:
        file_path: Path of the ASR result file (read as UTF-8).

    Returns:
        list[dict]: Records with the keys ``emotion``, ``input_file_path``,
        ``original_text``, ``language``, ``asr_text`` and ``similarity_score``
        (initialised to 0).
    """
    output = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Lines are assumed to be well formed, with '|' as the fixed separator.
            fields = line.strip().split('|')
            input_file_path, original_text, language, asr_text = fields
            record = {
                'emotion': common.get_filename_without_extension(input_file_path),
                'input_file_path': input_file_path,
                'original_text': original_text,
                'language': language,
                'asr_text': asr_text,
                'similarity_score': 0,
            }
            output.append(record)
    return output
@timeit_decorator
def calculate_similarity_and_append_to_list(input_list, boundary):
    """Score every record's ASR text against its original text, in place.

    Args:
        input_list: Records with ``original_text`` and ``asr_text`` keys; each
            one receives ``similarity_score`` and ``original_score``.
        boundary: Amplification boundary forwarded to
            ``text_comparison.calculate_result``.

    Returns:
        The same ``input_list`` object, after the update.
    """
    total = len(input_list)
    for done, record in enumerate(input_list, start=1):
        raw_score, amplified_score = text_comparison.calculate_result(
            record['original_text'], record['asr_text'], boundary)
        record['similarity_score'] = amplified_score
        record['original_score'] = raw_score
        logger.info(f'进度：{done}/{total}')
    return input_list
 def calculate_average_similarity_by_emotion(data_list):
    """Average the similarity scores of the records of each emotion.

    Args:
        data_list: Records with ``emotion`` and ``similarity_score`` keys.

    Returns:
        list[dict]: One dict per emotion with the keys ``emotion``,
        ``average_similarity_score`` and ``count``, sorted by descending
        average score.
    """
    scores_by_emotion = {}
    for record in data_list:
        emotion = record['emotion']
        score = record['similarity_score']
        scores_by_emotion.setdefault(emotion, []).append(score)

    averages = []
    for emotion, scores in scores_by_emotion.items():
        averages.append({
            'emotion': emotion,
            'average_similarity_score': sum(scores) / len(scores),
            'count': len(scores),
        })
    return sorted(averages, key=lambda x: x['average_similarity_score'], reverse=True)
 def group_and_sort_by_field(data, group_by_field):
    """Group records by one field and sort each group by similarity.

    Args:
        data: Records (dicts) that contain ``group_by_field`` and
            ``similarity_score``.
        group_by_field: Name of the key to group on.

    Returns:
        list[tuple]: ``(group value, records)`` pairs in order of first
        appearance; each group's records are sorted by descending
        ``similarity_score``.
    """
    grouped = {}
    for record in data:
        grouped.setdefault(record[group_by_field], []).append(record)

    # Highest similarity first inside every group.
    for members in grouped.values():
        members.sort(key=itemgetter('similarity_score'), reverse=True)

    return list(grouped.items())
 def format_list_to_text(data_list, output_filename):
    """Write grouped records to a report whose last column is the original text.

    The file starts with a header line. Each group then contributes its key on
    one line, followed by one
    ``similarity_score|original_score|asr_text|original_text`` line per record.

    Args:
        data_list: Iterable of ``(key, records)`` pairs.
        output_filename: Path of the UTF-8 file to (over)write.
    """
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('放大后的相似度分值|原始分值|ASR文本|原文文本\n')
        for key, items in data_list:
            # Group title line.
            output_file.write(key + '\n')
            # One line per record of the group.
            output_file.writelines(
                f"{item['similarity_score']}|{item['original_score']}|{item['asr_text']}|{item['original_text']}\n"
                for item in items
            )
 def format_list_to_emotion(data_list, output_filename):
    """Write grouped records to a report whose last column is the emotion.

    The file starts with a header line. Each group then contributes its key on
    one line, followed by one
    ``similarity_score|original_score|asr_text|emotion`` line per record.

    Args:
        data_list: Iterable of ``(key, records)`` pairs.
        output_filename: Path of the UTF-8 file to (over)write.
    """
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('放大后的相似度分值|原始分值|ASR文本|情绪类型\n')
        for key, items in data_list:
            # Group title line.
            output_file.write(key + '\n')
            # One line per record of the group.
            output_file.writelines(
                f"{item['similarity_score']}|{item['original_score']}|{item['asr_text']}|{item['emotion']}\n"
                for item in items
            )
@timeit_decorator
def process(asr_file_path, output_dir, similarity_enlarge_boundary):
    """Run the text-similarity analysis for one ASR file and write three reports.

    The reports are the per-emotion average similarity, the per-emotion detail
    and the per-original-text detail; their file names come from ``params``.

    Args:
        asr_file_path: ASR result file to analyse.
        output_dir: Directory for the reports; created when missing.
        similarity_enlarge_boundary: Boundary used when amplifying the scores.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def report_path(stem):
        # Reports are ".txt" files placed directly in output_dir.
        return os.path.join(output_dir, f'{stem}.txt')

    records = parse_asr_file(asr_file_path)
    calculate_similarity_and_append_to_list(records, similarity_enlarge_boundary)

    # Report 1: average similarity per emotion.
    averages = calculate_average_similarity_by_emotion(records)
    average_file = report_path(params.text_emotion_average_similarity_report_filename)
    average_lines = [f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}" for item in averages]
    common.write_text_to_file('\n'.join(average_lines), average_file)

    # Report 2: records grouped by emotion.
    by_emotion = group_and_sort_by_field(records, 'emotion')
    format_list_to_text(by_emotion, report_path(params.text_similarity_by_emotion_detail_filename))

    # Report 3: records grouped by original text.
    by_text = group_and_sort_by_field(records, 'original_text')
    format_list_to_emotion(by_text, report_path(params.text_similarity_by_text_detail_filename))

    logger.info('文本相似度分析完成。')
 def parse_arguments():
    """Parse the command-line options of the ASR text-analysis script.

    Returns:
        argparse.Namespace: With the required attributes ``asr_file_path``
        (str), ``output_dir`` (str) and ``similarity_enlarge_boundary`` (float).
    """
    parser = argparse.ArgumentParser(description="Process ASR files and analyze similarity.")
    # (short flag, long flag, value type, help text); every option is required.
    options = (
        ("-a", "--asr_file_path", str,
         "Path to the directory containing ASR files or path to a single ASR file."),
        ("-o", "--output_dir", str,
         "Path to the directory where the analysis results should be saved."),
        ("-b", "--similarity_enlarge_boundary", float,
         "Similarity score boundary value to be used in your calculations."),
    )
    for short_flag, long_flag, value_type, help_text in options:
        parser.add_argument(short_flag, long_flag, type=value_type, required=True, help=help_text)
    return parser.parse_args()
 if __name__ == '__main__':
    # Command-line entry point: analyse the ASR file named by the parsed options.
    cmd = parse_arguments()
    # print(cmd)
    process(cmd.asr_file_path, cmd.output_dir, cmd.similarity_enlarge_boundary)
--- a/Ref_Audio_Selector/tool/text_comparison/text_comparison.py
+++ b/Ref_Audio_Selector/tool/text_comparison/text_comparison.py
@ -0,0 +1,128 @@
 import os
 import torch
 from transformers import AutoTokenizer, AutoModel
 from scipy.spatial.distance import cosine
 from Ref_Audio_Selector.config_param.log_config import logger
 # Model location: the "bert_path" environment variable, or the bundled default.
 bert_path = os.environ.get(
    "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
 )
 # Set device to GPU if available, else CPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 logger.info(f'使用计算设备: {device}')
 # Tokenizer and model are loaded at import time and shared by the functions below.
 tokenizer = AutoTokenizer.from_pretrained(bert_path)
 model = AutoModel.from_pretrained(bert_path).to(device)
 def calculate_similarity(text1, text2, max_length=512):
    """Return the cosine similarity of the two texts' first-token embeddings.

    Both texts are tokenized (truncated to ``max_length``) and run through the
    module-level ``model`` without gradient tracking. The first-token vector of
    each output is flattened and compared.

    Args:
        text1: First text.
        text2: Second text.
        max_length: Maximum token length used for truncation.

    Returns:
        ``1 - cosine_distance`` of the two vectors.
    """
    # Tokenize both texts first and move the tensors to the compute device.
    encoded = [
        tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device)
        for text in (text1, text2)
    ]
    # Take the vector at position 0 (the CLS token) of the model's first output.
    with torch.no_grad():
        cls_vectors = [model(**inputs)[0][:, 0, :].flatten() for inputs in encoded]
    # scipy works on one-dimensional numpy arrays.
    first, second = (vector.cpu().numpy().flatten() for vector in cls_vectors)
    return 1 - cosine(first, second)
 # Stretch scores in the [boundary, 1] interval onto [0, 1].
 def adjusted_similarity(similarity_score2, boundary=0.8):
    """Rescale a similarity score so that ``boundary`` maps to 0 and 1 maps to 1.

    Args:
        similarity_score2: Raw similarity score.
        boundary: Scores below this value are reported as 0.

    Returns:
        ``0`` when the score is below ``boundary``; otherwise
        ``(score - boundary) / (1 - boundary)``. When ``boundary`` is 1 or more
        the interval has no width, so a score that reaches the boundary is
        reported as ``1.0`` instead of dividing by zero.
    """
    if similarity_score2 < boundary:
        return 0
    if boundary >= 1:
        # Degenerate interval: 1 / (1 - boundary) would raise ZeroDivisionError
        # (or flip the sign for boundary > 1).
        return 1.0
    # Scale factor that maps the interval width (1 - boundary) onto 1.
    multiple = 1 / (1 - boundary)
    adjusted_score = (similarity_score2 - boundary) * multiple
    return adjusted_score
 def calculate_result(t1, t2, boundary):
    """Compute the raw and the amplified similarity of two texts.

    Args:
        t1: First text.
        t2: Second text.
        boundary: Amplification boundary passed to ``adjusted_similarity``.

    Returns:
        tuple: ``(raw similarity, amplified similarity)``.
    """
    raw_score = calculate_similarity(t1, t2)
    return raw_score, adjusted_similarity(raw_score, boundary)
 def print_result(t1, t2, boundary):
    """Print ``t2`` with its raw and amplified similarity to ``t1``.

    Args:
        t1: Reference text.
        t2: Text compared against ``t1``.
        boundary: Amplification boundary passed to ``adjusted_similarity``.
    """
    print(f't2: {t2}')
    raw_score = calculate_similarity(t1, t2)
    print(f"两句话的相似度为: {raw_score:.4f}")
    amplified_score = adjusted_similarity(raw_score, boundary)
    print(f"调整后的相似度为: {amplified_score:.4f}")
 def test(boundary):
    """Print similarity results of a fixed reference sentence against samples.

    The samples are the lines of the multi-line literal below: paraphrases of
    the reference sentence, unrelated sentences, random letter strings and a
    run of zeros. Every line is passed to ``print_result`` together with
    ``boundary``.
    """
    # Reference text
    text1 = "这是第一个句子"
    # NOTE: the name ``list`` shadows the builtin inside this function.
    list = """
    这是第一个句子
    这是第二个句子。
    那么，这是第三个表达。
    当前呈现的是第四个句子。
    接下来，我们有第五句话。
    在此，展示第六条陈述。
    继续下去，这是第七个短句。
    不容忽视的是第八个表述。
    顺延着序列，这是第九句。
    此处列举的是第十个说法。
    进入新的篇章，这是第十一个句子。
    下一段内容即为第十二个句子。
    显而易见，这是第十三个叙述。
    渐进地，我们来到第十四句话。
    向下滚动，您会看到第十五个表达。
    此刻，呈现在眼前的是第十六个句子。
    它们中的一个——第十七个句子在此。
    如同链条般连接，这是第十八个断言。
    按照顺序排列，接下来是第十九个话语。
    逐一列举，这是第二十个陈述句。
    结构相似，本例给出第二十一个实例句。
    这是最初的陈述句。
    首先表达的是这一个句子。
    第一句内容即为此处所示。
    这是起始的叙述段落。
    开篇所展示的第一句话就是这个。
    明媚的阳光洒满大地
    窗外飘落粉色樱花瓣
    笔尖轻触纸面思绪万千
    深夜的月光如水般静谧
    穿越丛林的小径蜿蜒曲折
    浅酌清茶品味人生百态
    破晓时分雄鸡一唱天下白
    草原上奔驰的骏马无拘无束
    秋叶纷飞描绘季节更替画卷
    寒冬雪夜炉火旁围坐共话家常
    kszdRjYXw
    pfsMgTlVHnB
    uQaGxIbWz
    ZtqNhPmKcOe
    jfyrXsStVUo
    wDiEgLkZbn
    yhNvAfUmqC
    TpKjxMrWgs
    eBzHUaFJtYd
    oQnXcVSiPkL
    00000
    """
    # One sample per line; only the outer whitespace of the whole literal is
    # stripped, so inner lines keep their leading indentation.
    list2 = list.strip().split('\n')
    for item in list2:
        print_result(text1, item, boundary)
 if __name__ == '__main__':
    # Manual check: print the sample comparisons with an amplification boundary of 0.9.
    test(0.9)
--- a/Ref_Audio_Selector/ui_init/init.py
+++ b/Ref_Audio_Selector/ui_init/init.py
--- a/Ref_Audio_Selector/ui_init/init_ui_param.py
+++ b/Ref_Audio_Selector/ui_init/init_ui_param.py
@ -0,0 +1,197 @@
 import os
 import multiprocessing
 import Ref_Audio_Selector.config_param.config_params as params
 import Ref_Audio_Selector.tool.audio_inference as audio_inference
 import Ref_Audio_Selector.common.common as common
 # Read/write parameter store; the init_* functions below read their values from it.
 rw_param = params.config_manager.get_rw_param()
 # ------------------- Basic information ---------------------------
 # Working directory of the role
 base_dir_default = None
 # Workspace directory
 text_work_space_dir_default = None
 # Role name
 text_role_default = None
 # Directory containing the reference audio
 text_refer_audio_file_dir_default = None
 # Directory containing the inference audio
 text_inference_audio_file_dir_default = None
 # ------------------- Step 1 ------------------------------
 # Reference-audio sampling directory
 text_sample_dir_default = None
 # Number of subsections
 slider_subsection_num_default = None
 # Number of random samples per subsection
 slider_sample_num_default = None
 # ------------------- Step 2 ------------------------------
 # API service: model-switch endpoint URL
 text_api_set_model_base_url_default = None
 # GPT model parameter name
 text_api_gpt_param_default = None
 # SoVITS model parameter name
 text_api_sovits_param_default = None
 # API service: GPT model-switch endpoint URL
 text_api_v2_set_gpt_model_base_url_default = None
 # GPT model parameter name
 text_api_v2_gpt_model_param_default = None
 # API service: SoVITS model-switch endpoint URL
 text_api_v2_set_sovits_model_base_url_default = None
 # SoVITS model parameter name
 text_api_v2_sovits_model_param_default = None
 # Inference service request URL and parameters
 text_url_default = None
 # Full inference service request URL
 text_whole_url_default = None
 # Text parameter name
 text_text_default = None
 # Reference parameter type
 dropdown_refer_type_param_default = None
 # Reference audio path parameter name
 text_ref_path_default = None
 # Reference audio text parameter name
 text_ref_text_default = None
 # Role emotion parameter name
 text_emotion_default = None
 # Path of the text to run inference on
 text_test_content_default = None
 # Request concurrency
 slider_request_concurrency_num_default = 3
 # Maximum concurrency
 slider_request_concurrency_max_num = None
 # ------------------- Step 3 ------------------------------
 # Directory of the audio awaiting ASR
 text_asr_audio_dir_default = None
 # Path of the file to analyse
 text_text_similarity_analysis_path_default = None
 # Text-similarity amplification boundary
 slider_text_similarity_amplification_boundary_default = 0.90
 # Path of the text-similarity analysis result file
 text_text_similarity_result_path_default = None
 # ------------------- Step 4 ------------------------------
 # ------------------- Step 5 ------------------------------
 # Template content
 text_template_default = None
 def empty_default(vale, default_value):
    """Return ``vale`` unless it is ``None`` or the empty string.

    Args:
        vale: Candidate value.
        default_value: Fallback used when ``vale`` is ``None`` or ``""``.

    Returns:
        ``default_value`` for ``None``/``""``, otherwise ``vale`` unchanged
        (other falsy values such as ``0`` are kept).
    """
    is_missing = vale is None or vale == ""
    return default_value if is_missing else vale
 def init_base():
    """Load the basic defaults: workspace, role, role directory and audio dirs.

    The role directory is the workspace directory joined with the role name.
    The two audio directories go through
    ``common.check_path_existence_and_return``.
    """
    global text_work_space_dir_default, text_role_default, base_dir_default, text_refer_audio_file_dir_default, text_inference_audio_file_dir_default
    text_work_space_dir_default = rw_param.read(rw_param.work_dir)
    text_role_default = rw_param.read(rw_param.role)
    base_dir_default = os.path.join(text_work_space_dir_default, text_role_default)

    reference_dir = os.path.join(base_dir_default, params.reference_audio_dir)
    text_refer_audio_file_dir_default = common.check_path_existence_and_return(reference_dir)

    inference_dir = os.path.join(base_dir_default, params.inference_audio_dir)
    text_inference_audio_file_dir_default = common.check_path_existence_and_return(inference_dir)
 def init_first():
    """Load the step-1 defaults: sampling directory and the two slider values.

    Both slider defaults are converted to ``int`` so that a value read back
    from the stored configuration has the same type as the built-in fallback.
    """
    global text_sample_dir_default, slider_subsection_num_default, slider_sample_num_default
    text_sample_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.list_to_convert_reference_audio_dir))
    slider_subsection_num_default = int(empty_default(rw_param.read(rw_param.subsection_num), 10))
    # The sample count previously skipped the int() conversion applied to the
    # sibling defaults (presumably rw_param.read returns text -- confirm).
    slider_sample_num_default = int(empty_default(rw_param.read(rw_param.sample_num), 4))
 def init_second():
    """Load the step-2 defaults: API endpoints, parameter names and concurrency.

    Every value is read from ``rw_param`` and replaced by a built-in fallback
    when it is missing or empty. The request concurrency is capped at the
    machine's CPU count.
    """
    global text_api_set_model_base_url_default, text_api_gpt_param_default, text_api_sovits_param_default, text_api_v2_set_gpt_model_base_url_default, text_api_v2_gpt_model_param_default
    global text_api_v2_set_sovits_model_base_url_default, text_api_v2_sovits_model_param_default, text_url_default, text_whole_url_default, text_text_default, dropdown_refer_type_param_default, text_ref_path_default
    global text_ref_text_default, text_emotion_default, text_test_content_default, slider_request_concurrency_num_default, slider_request_concurrency_max_num

    def configured(key, fallback):
        # Stored value for ``key``, or ``fallback`` when it is None or empty.
        return empty_default(rw_param.read(key), fallback)

    # Model-switch endpoints and their parameter names.
    text_api_set_model_base_url_default = configured(rw_param.api_set_model_base_url,
                                                     'http://localhost:9880/set_model')
    text_api_gpt_param_default = configured(rw_param.api_gpt_param, 'gpt_model_path')
    text_api_sovits_param_default = configured(rw_param.api_sovits_param, 'sovits_model_path')
    text_api_v2_set_gpt_model_base_url_default = configured(rw_param.api_v2_set_gpt_model_base_url,
                                                            'http://localhost:9880/set_gpt_weights')
    text_api_v2_gpt_model_param_default = configured(rw_param.api_v2_gpt_model_param, 'weights_path')
    text_api_v2_set_sovits_model_base_url_default = configured(rw_param.api_v2_set_sovits_model_base_url,
                                                               'http://localhost:9880/set_sovits_weights')
    text_api_v2_sovits_model_param_default = configured(rw_param.api_v2_sovits_model_param, 'weights_path')

    # Inference request URL and the names of its parameters.
    text_url_default = configured(rw_param.text_url,
                                  'http://localhost:9880?prompt_language=中文&text_language=中文&cut_punc=,.;?!、，。？！;：…')
    text_text_default = configured(rw_param.text_param, 'text')
    dropdown_refer_type_param_default = configured(rw_param.refer_type_param, '参考音频')
    text_ref_path_default = configured(rw_param.ref_path_param, 'refer_wav_path')
    text_ref_text_default = configured(rw_param.ref_text_param, 'prompt_text')
    text_emotion_default = configured(rw_param.emotion_param, 'emotion')
    text_whole_url_default = whole_url(text_url_default, dropdown_refer_type_param_default, text_text_default,
                                       text_ref_path_default, text_ref_text_default, text_emotion_default)

    text_test_content_default = configured(rw_param.test_content_path, params.default_test_text_path)

    # Concurrency: never offer more than the number of CPUs.
    slider_request_concurrency_max_num = multiprocessing.cpu_count()
    slider_request_concurrency_num_default = configured(rw_param.request_concurrency_num, 3)
    slider_request_concurrency_num_default = min(int(slider_request_concurrency_num_default),
                                                 slider_request_concurrency_max_num)
 # Build a complete example request URL from the base URL and parameter names.
 def whole_url(text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion):
    """Compose a preview of the full inference request URL.

    Placeholder values stand in for the text, emotion and reference fields.

    Args:
        text_url: Base request URL, possibly with fixed query parameters.
        dropdown_refer_type_param: Reference type; decides between emotion
            mode and reference-audio mode.
        text_text: Name of the text parameter.
        text_ref_path: Name of the reference audio path parameter.
        text_ref_text: Name of the reference audio text parameter.
        text_emotion: Name of the emotion parameter.

    Returns:
        The composed URL, not URL-encoded.
    """
    composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text,
                                              text_ref_path, text_ref_text)
    if composer.is_emotion():
        return composer.build_url_with_emotion('测试内容', '情绪类型', False)
    return composer.build_url_with_ref('测试内容', '参考路径', '参考文本', False)
 def init_third():
    """Load the step-3 defaults: ASR audio dir, analysis paths and boundary.

    The amplification boundary is converted to ``float`` so that a value read
    back from the stored configuration has the same type as the 0.90 fallback.
    """
    global text_asr_audio_dir_default, text_text_similarity_analysis_path_default, slider_text_similarity_amplification_boundary_default, text_text_similarity_result_path_default
    text_asr_audio_dir_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.inference_audio_dir, params.inference_audio_text_aggregation_dir))
    text_text_similarity_analysis_path_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.asr_filename + '.list'))
    # Numeric conversion for consistency with the other numeric defaults
    # (presumably rw_param.read returns text -- confirm).
    slider_text_similarity_amplification_boundary_default = float(empty_default(
        rw_param.read(rw_param.text_similarity_amplification_boundary), 0.90))
    text_text_similarity_result_path_default = common.check_path_existence_and_return(
        os.path.join(base_dir_default, params.text_emotion_average_similarity_report_filename + '.txt'))
 def init_fourth():
    """Placeholder for the step-4 defaults; there is currently nothing to load."""
 def init_fifth():
    """Load the step-5 default: the template text.

    Uses the stored template when it is set, otherwise the content of the
    default template file. The default file is read in either case.
    """
    global text_template_default
    fallback_template = common.read_file(params.default_template_path)
    stored_template = rw_param.read(rw_param.text_template)
    text_template_default = empty_default(stored_template, fallback_template)
 def init_all():
    """Run every initialiser in order: base first, then steps one to five."""
    # init_base goes first: it sets base_dir_default, which later steps read.
    steps = (init_base, init_first, init_second, init_third, init_fourth, init_fifth)
    for step in steps:
        step()
--- a/Ref_Audio_Selector/参考音频筛选流程.png
+++ b/Ref_Audio_Selector/参考音频筛选流程.png
--- a/docs/cn/Changelog_CN.md
+++ b/docs/cn/Changelog_CN.md
@ -594,11 +594,11 @@
  - 内容: 修复实验名结尾出现空格在win中路径不正确的问题
  - 类型: 修复
  - 提交: RVC-Boss
- 2025.06.10 [Commit#746cb536](https://github.com/RVC-Boss/GPT-SoVITS/commit/746cb536c68b1fe6ce3ca7e882235375b8a8dd89)
+- 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
  - 内容: 语种分割优化
  - 类型: 优化
  - 提交: KamioRinn
- 2025.06.11 [Commit#dd2b9253](https://github.com/RVC-Boss/GPT-SoVITS/commit/dd2b9253aabb09db32db7a3344570ed9df043351)
+- 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
  - 内容: 修复并行推理对v2pro支持bug
  - 类型: 修复
  - 提交: YYuX-1145
@ -606,21 +606,132 @@
  - 内容: v2pro对ge提取时会出现数值溢出的问题修复
  - 类型: 修复
  - 提交: RVC-Boss
- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9)[Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
+- 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
  - 内容: install.sh逻辑优化
  - 类型: 优化
  - 提交: XXXXRT666
- 2025.06.27 [Commit#90ebefa7](https://github.com/RVC-Boss/GPT-SoVITS/commit/90ebefa78fd544da36eebe0b2003620879c921b0)
+- 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
  - 内容: onnxruntime加载逻辑优化（对gpu/cpu的判断）
  - 类型: 优化
  - 提交: KamioRinn
- 2025.06.27 [Commit#6df61f58](https://github.com/RVC-Boss/GPT-SoVITS/commit/6df61f58e4d18d4c2ad9d1eddd6a1bd690034c23)
+- 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
  - 内容: 语言分割及格式化优化
  - 类型: 优化
  - 提交: KamioRinn
 ## 202507
 - 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
  - 内容: 提升推理进程优先级（修复win11下可能GPU利用率受限的问题）
-  - 类型: 修复
+  - 类型: 优化
  - 提交: XianYue0125
 - 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
  - 内容: 解决 TTS.py 无法识别真正支持版本 v2Pro、v2ProPlus 的问题, 同时更新一版默认配置。
  - 类型: 修复
  - 提交: jiangsier-xyz
 - 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
  - 内容: 修复并行推理模式下v2pro模型识别问题
  - 类型: 修复
  - 提交: RVC-Boss
 - 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
  - 内容: whisper asr支持性价比更高的distill模型
  - 类型: 优化
  - 提交: XXXXRT666
 - 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
  - 内容: 优化TTS_Config的代码逻辑
  - 类型: 优化
  - 提交: ChasonJiang
 - 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
  - 内容: 修复gpt的loss计算问题
  - 类型: 修复
  - 提交: ChasonJiang
 ## 202508
 - 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
  - 内容: WSL Rocm
  - 类型: 修复
  - 提交: XXXXRT666
 ## 202509
 - 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
  - 内容: 修复环境变量可能不为str的问题
  - 类型: 修复
  - 提交: RVC-Boss
 ## 202511
 - 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
  - 内容: 流式推理
  - 类型: 新功能
  - 提交: ChasonJiang
 - 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
  - 内容: 数学计算文本前端逻辑优化
  - 类型: 优化
  - 提交: KamioRinn
 - 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
  - 内容: 流式推理
  - 类型: 新功能
  - 提交: L-jasmine
 - 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
  - 内容: 支持vq分布式训练
  - 类型: 优化
  - 提交: wzy3650
 - 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
  - 内容: ASR模型下载逻辑优化
  - 类型: 优化
  - 提交: XXXXRT666
 - 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
  - 内容: default batch size bug 修复
  - 类型: 修复
  - 提交: Spr-Aachen
 ## 202512
 - 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
  - 内容: 修复采样错误
  - 类型: 修复
  - 提交: ChasonJiang
 ## 202602
 - 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
  - 内容: 修复 Conda 条款未同意导致的构建失败
  - 类型: 修复
  - 提交: Oarora
 - 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
  - 内容: 环境自动构建优化
  - 类型: 优化
  - 提交: XXXXRT666
 ## 202604
 - 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
  - 内容: 优化 G2PW 的推理输入构造与多音字处理流程，减少重复计算，降低长句场景下的推理开销
  - 类型: 优化
  - 提交: baicai-1145
 - 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
  - 内容: 改进 Windows 单卡 v3 LoRA 训练流程
  - 类型: 优化
  - 提交: 2409324124
 - 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
  - 内容: 修复多个模块中的独立 bug
  - 类型: 修复
  - 提交: wishhyt
 - 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
  - 内容: 添加数据集的错误处理提示
  - 类型: 优化
  - 提交: mushroomcowisheggs
 - 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
  - 内容: 并行推理部分bug修复
  - 类型: 修复
  - 提交: wishhyt
 - 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
  - 内容: bug修复：DPO 训练不支持漏字模拟
  - 类型: 修复
  - 提交: Mr-Neutr0n
 - 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
  - 内容: 修复onnx脚本未导入Optional等的问题
  - 类型: 修复
  - 提交: RVC-Boss
--- a/docs/en/Changelog_EN.md
+++ b/docs/en/Changelog_EN.md
@ -578,3 +578,160 @@
  - Content: Optimized automatic precision detection logic; added collapsible functionality to WebUI frontend modules.
  - Type: New Feature
  - Contributors: XXXXRT666, RVC-Boss
 - 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
  - Content: Fix polyphone detection for "X一X" pattern
  - Type: Fix
  - Contributor: wzy3650
 - 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
  - Content: Config fix; fix SoVITS model loading
  - Type: Fix
  - Contributor: wzy3650
 - 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
  - Content: Fix possible numerical explosion of `ge.sum` causing silent inference
  - Type: Fix
  - Contributor: RVC-Boss
 - 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
  - Content: Fix incorrect Windows path when experiment name ends with a space
  - Type: Fix
  - Contributor: RVC-Boss
 - 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
  - Content: Optimize language segmentation
  - Type: Optimization
  - Contributor: KamioRinn
 - 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
  - Content: Fix bug in parallel inference support for v2pro
  - Type: Fix
  - Contributor: YYuX-1145
 - 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
  - Content: Fix numerical overflow issue when extracting `ge` for v2pro
  - Type: Fix
  - Contributor: RVC-Boss
 - 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
  - Content: Optimize `install.sh` logic
  - Type: Optimization
  - Contributor: XXXXRT666
 - 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
  - Content: Optimize onnxruntime loading logic (GPU/CPU detection)
  - Type: Optimization
  - Contributor: KamioRinn
 - 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
  - Content: Optimize language segmentation and formatting
  - Type: Optimization
  - Contributor: KamioRinn
 ## 202507
 - 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
  - Content: Increase inference process priority (fix possible GPU utilization limitation on Win11)
  - Type: Optimization
  - Contributor: XianYue0125
 - 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
  - Content: Fix TTS.py not recognizing actually supported versions v2Pro and v2ProPlus, and update default configuration
  - Type: Fix
  - Contributor: jiangsier-xyz
 - 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
  - Content: Fix v2pro model recognition issue in parallel inference mode
  - Type: Fix
  - Contributor: RVC-Boss
 - 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
  - Content: Whisper ASR supports more cost-effective distilled models
  - Type: Optimization
  - Contributor: XXXXRT666
 - 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
  - Content: Optimize `TTS_Config` code logic
  - Type: Optimization
  - Contributor: ChasonJiang
 - 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
  - Content: Fix GPT loss calculation issue
  - Type: Fix
  - Contributor: ChasonJiang
 ## 202508
 - 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
  - Content: WSL Rocm
  - Type: Fix
  - Contributor: XXXXRT666
 ## 202509
 - 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
  - Content: Fix issue where environment variable may not be a string
  - Type: Fix
  - Contributor: RVC-Boss
 ## 202511
 - 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
  - Content: Streaming inference
  - Type: New Feature
  - Contributor: ChasonJiang
 - 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
  - Content: Optimize text frontend logic for mathematical expression text
  - Type: Optimization
  - Contributor: KamioRinn
 - 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
  - Content: Streaming inference
  - Type: New Feature
  - Contributor: L-jasmine
 - 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
  - Content: Support VQ distributed training
  - Type: Optimization
  - Contributor: wzy3650
 - 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
  - Content: Optimize ASR model download logic
  - Type: Optimization
  - Contributor: XXXXRT666
 - 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
  - Content: Fix default batch size bug
  - Type: Fix
  - Contributor: Spr-Aachen
 ## 202512
 - 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
  - Content: Fix sampling error
  - Type: Fix
  - Contributor: ChasonJiang
 ## 202602
 - 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
  - Content: Fix build failure caused by unaccepted Conda terms
  - Type: Fix
  - Contributor: Oarora
 - 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
  - Content: Optimize automatic environment setup
  - Type: Optimization
  - Contributor: XXXXRT666
 ## 202604
 - 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
  - Content: Optimize G2PW inference input construction and polyphone handling to reduce redundant computation and inference overhead for long sentences
  - Type: Optimization
  - Contributor: baicai-1145
 - 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
  - Content: Improve the LoRA training flow for GPT-SoVITS v3 on a single card under Windows
  - Type: Optimization
  - Contributor: 2409324124
 - 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
  - Content: Fix miscellaneous bugs in multiple modules
  - Type: Fix
  - Contributor: wishhyt
 - 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
  - Content: Add error handling hints for dataset processing
  - Type: Optimization
  - Contributor: mushroomcowisheggs
 - 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
  - Content: Fix some bugs in parallel inference
  - Type: Fix
  - Contributor: wishhyt
 - 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
  - Content: Fix bug where DPO training does not support missing word simulation
  - Type: Fix
  - Contributor: Mr-Neutr0n
 - 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
  - Content: Fix missing imports (e.g., Optional) in ONNX script
  - Type: Fix
  - Contributor: RVC-Boss
--- a/docs/ja/Changelog_JA.md
+++ b/docs/ja/Changelog_JA.md
@ -578,3 +578,160 @@
  - 内容: 自動精度検出ロジックを最適化し、WebUI フロントエンドモジュールに折り畳み（Collapsible）機能を追加
  - タイプ: 新機能
  - 貢献者: XXXXRT666, RVC-Boss
 - 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
  - 内容: 「X一X」パターンの多音字検出を修正
  - タイプ: 修正
  - 貢献者: wzy3650
 - 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
  - 内容: 設定の修正；SoVITSモデル読み込みの修正
  - タイプ: 修正
  - 貢献者: wzy3650
 - 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
  - 内容: `ge.sum`の数値爆発による推論の無音化を修正
  - タイプ: 修正
  - 貢献者: RVC-Boss
 - 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
  - 内容: 実験名がスペースで終わる場合のWindowsパスの誤りを修正
  - タイプ: 修正
  - 貢献者: RVC-Boss
 - 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
  - 内容: 言語分割の最適化
  - タイプ: 最適化
  - 貢献者: KamioRinn
 - 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
  - 内容: v2proの並列推論対応におけるバグを修正
  - タイプ: 修正
  - 貢献者: YYuX-1145
 - 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
  - 内容: v2proの`ge`抽出時の数値オーバーフロー問題を修正
  - タイプ: 修正
  - 貢献者: RVC-Boss
 - 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
  - 内容: `install.sh`のロジックを最適化
  - タイプ: 最適化
  - 貢献者: XXXXRT666
 - 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
  - 内容: onnxruntime読み込みロジックを最適化（GPU/CPU検出）
  - タイプ: 最適化
  - 貢献者: KamioRinn
 - 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
  - 内容: 言語分割と書式を最適化
  - タイプ: 最適化
  - 貢献者: KamioRinn
 ## 202507
 - 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
  - 内容: 推論プロセスの優先度を上げる（Win11でのGPU利用制限の可能性を修正）
  - タイプ: 最適化
  - 貢献者: XianYue0125
 - 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
  - 内容: TTS.pyが実際にサポートされているバージョンv2Proおよびv2ProPlusを認識しない問題を修正し、デフォルト設定を更新
  - タイプ: 修正
  - 貢献者: jiangsier-xyz
 - 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
  - 内容: 並列推論モードでのv2proモデル認識問題を修正
  - タイプ: 修正
  - 貢献者: RVC-Boss
 - 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
  - 内容: Whisper ASRがよりコスト効率の高い蒸留モデルをサポート
  - タイプ: 最適化
  - 貢献者: XXXXRT666
 - 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
  - 内容: `TTS_Config`のコードロジックを最適化
  - タイプ: 最適化
  - 貢献者: ChasonJiang
 - 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
  - 内容: GPT損失計算の問題を修正
  - タイプ: 修正
  - 貢献者: ChasonJiang
 ## 202508
 - 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
  - 内容: WSL Rocm対応
  - タイプ: 修正
  - 貢献者: XXXXRT666
 ## 202509
 - 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
  - 内容: 環境変数が文字列でない可能性がある問題を修正
  - タイプ: 修正
  - 貢献者: RVC-Boss
 ## 202511
 - 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
  - 内容: ストリーミング推論
  - タイプ: 新機能
  - 貢献者: ChasonJiang
 - 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
  - 内容: 数式テキストに対するテキスト前処理ロジックを最適化
  - タイプ: 最適化
  - 貢献者: KamioRinn
 - 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
  - 内容: ストリーミング推論
  - タイプ: 新機能
  - 貢献者: L-jasmine
 - 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
  - 内容: VQ分散学習をサポート
  - タイプ: 最適化
  - 貢献者: wzy3650
 - 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
  - 内容: ASRモデルダウンロードロジックを最適化
  - タイプ: 最適化
  - 貢献者: XXXXRT666
 - 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
  - 内容: デフォルトのバッチサイズのバグを修正
  - タイプ: 修正
  - 貢献者: Spr-Aachen
 ## 202512
 - 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
  - 内容: サンプリングエラーを修正
  - タイプ: 修正
  - 貢献者: ChasonJiang
 ## 202602
 - 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
  - 内容: Conda利用規約への未同意によるビルド失敗を修正
  - タイプ: 修正
  - 貢献者: Oarora
 - 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
  - 内容: 自動環境セットアップを最適化
  - タイプ: 最適化
  - 貢献者: XXXXRT666
 ## 202604
 - 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
  - 内容: G2PW推論入力の構築と多音字処理を最適化し、長文における冗長な計算と推論オーバーヘッドを削減
  - タイプ: 最適化
  - 貢献者: baicai-1145
 - 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
  - 内容: WindowsでのシングルカードにおけるGPT-SoVITS v3のLoRAトレーニングフローを改善
  - タイプ: 最適化
  - 貢献者: 2409324124
 - 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
  - 内容: 複数モジュールの雑多なバグを修正
  - タイプ: 修正
  - 貢献者: wishhyt
 - 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
  - 内容: データセット処理時のエラーハンドリングヒントを追加
  - タイプ: 最適化
  - 貢献者: mushroomcowisheggs
 - 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
  - 内容: 並列推論の一部バグを修正
  - タイプ: 修正
  - 貢献者: wishhyt
 - 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
  - 内容: DPOトレーニングが欠落単語シミュレーションをサポートしないバグを修正
  - タイプ: 修正
  - 貢献者: Mr-Neutr0n
 - 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
  - 内容: ONNXスクリプトでの（Optionalなどの）不足インポートを修正
  - タイプ: 修正
  - 貢献者: RVC-Boss
--- a/docs/ko/Changelog_KO.md
+++ b/docs/ko/Changelog_KO.md
@ -578,3 +578,160 @@
  - 내용: 자동 정밀도 감지 로직 최적화; WebUI 프론트엔드 모듈에 접기 기능 추가
  - 유형: 신규 기능
  - 기여자: XXXXRT666, RVC-Boss
 - 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
  - 내용: "X一X" 패턴의 다중 발음 감지 오류 수정
  - 유형: 수정
  - 기여자: wzy3650
 - 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
  - 내용: 설정 오류 수정; SoVITS 모델 로딩 오류 수정
  - 유형: 수정
  - 기여자: wzy3650
 - 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
  - 내용: `ge.sum`의 수치 폭발 가능성으로 인한 추론 무음 현상 수정
  - 유형: 수정
  - 기여자: RVC-Boss
 - 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
  - 내용: 실험 이름이 공백으로 끝날 때 발생하는 잘못된 Windows 경로 문제 수정
  - 유형: 수정
  - 기여자: RVC-Boss
 - 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
  - 내용: 언어 분할 최적화
  - 유형: 최적화
  - 기여자: KamioRinn
 - 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
  - 내용: v2pro 병렬 추론 지원 버그 수정
  - 유형: 수정
  - 기여자: YYuX-1145
 - 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
  - 내용: v2pro의 `ge` 추출 시 수치 오버플로우 문제 수정
  - 유형: 수정
  - 기여자: RVC-Boss
 - 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
  - 내용: `install.sh` 로직 최적화
  - 유형: 최적화
  - 기여자: XXXXRT666
 - 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
  - 내용: onnxruntime 로딩 로직 최적화 (GPU/CPU 감지)
  - 유형: 최적화
  - 기여자: KamioRinn
 - 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
  - 내용: 언어 분할 및 형식 최적화
  - 유형: 최적화
  - 기여자: KamioRinn
 ## 202507
 - 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
  - 내용: 추론 프로세스 우선순위 증가 (Win11에서 GPU 활용 제한 가능성 수정)
  - 유형: 최적화
  - 기여자: XianYue0125
 - 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
  - 내용: TTS.py가 실제 지원되는 버전 v2Pro 및 v2ProPlus를 인식하지 못하는 문제 수정 및 기본 설정 업데이트
  - 유형: 수정
  - 기여자: jiangsier-xyz
 - 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
  - 내용: 병렬 추론 모드에서 v2pro 모델 인식 문제 수정
  - 유형: 수정
  - 기여자: RVC-Boss
 - 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
  - 내용: Whisper ASR이 더 비용 효율적인 distill 모델 지원
  - 유형: 최적화
  - 기여자: XXXXRT666
 - 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
  - 내용: `TTS_Config` 코드 로직 최적화
  - 유형: 최적화
  - 기여자: ChasonJiang
 - 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
  - 내용: GPT 손실(loss) 계산 문제 수정
  - 유형: 수정
  - 기여자: ChasonJiang
 ## 202508
 - 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
  - 내용: WSL Rocm
  - 유형: 수정
  - 기여자: XXXXRT666
 ## 202509
 - 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
  - 내용: 환경 변수가 문자열이 아닐 수 있는 문제 수정
  - 유형: 수정
  - 기여자: RVC-Boss
 ## 202511
 - 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
  - 내용: 스트리밍 추론
  - 유형: 신규 기능
  - 기여자: ChasonJiang
 - 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
  - 내용: 수학 표현식 텍스트에 대한 텍스트 전처리 로직 최적화
  - 유형: 최적화
  - 기여자: KamioRinn
 - 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
  - 내용: 스트리밍 추론
  - 유형: 신규 기능
  - 기여자: L-jasmine
 - 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
  - 내용: VQ 분산 학습 지원
  - 유형: 최적화
  - 기여자: wzy3650
 - 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
  - 내용: ASR 모델 다운로드 로직 최적화
  - 유형: 최적화
  - 기여자: XXXXRT666
 - 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
  - 내용: 기본 배치 크기 버그 수정
  - 유형: 수정
  - 기여자: Spr-Aachen
 ## 202512
 - 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
  - 내용: 샘플링 오류 수정
  - 유형: 수정
  - 기여자: ChasonJiang
 ## 202602
 - 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
  - 내용: Conda 약관 미동의로 인한 빌드 실패 수정
  - 유형: 수정
  - 기여자: Oarora
 - 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
  - 내용: 자동 환경 설정 최적화
  - 유형: 최적화
  - 기여자: XXXXRT666
 ## 202604
 - 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
  - 내용: G2PW 추론 입력 구성 및 다중 발음 처리를 최적화하여 긴 문장에 대한 중복 계산 및 추론 오버헤드 감소
  - 유형: 최적화
  - 기여자: baicai-1145
 - 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
  - 내용: Windows 환경 단일 GPU에서 GPT-SoVITS v3의 LoRA 학습 흐름 개선
  - 유형: 최적화
  - 기여자: 2409324124
 - 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
  - 내용: 여러 모듈의 잡다한 버그 수정
  - 유형: 수정
  - 기여자: wishhyt
 - 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
  - 내용: 데이터셋 처리를 위한 오류 처리 힌트 추가
  - 유형: 최적화
  - 기여자: mushroomcowisheggs
 - 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
  - 내용: 병렬 추론의 일부 버그 수정
  - 유형: 수정
  - 기여자: wishhyt
 - 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
  - 내용: DPO 학습이 누락 단어 시뮬레이션을 지원하지 않는 버그 수정
  - 유형: 수정
  - 기여자: Mr-Neutr0n
 - 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
  - 내용: ONNX 스크립트에서 Optional 등 누락된 임포트 문제 수정
  - 유형: 수정
  - 기여자: RVC-Boss
--- a/docs/tr/Changelog_TR.md
+++ b/docs/tr/Changelog_TR.md
@ -2,8 +2,6 @@
 ## 202401
 ## 202401
 - 2024.01.21 [PR#108](https://github.com/RVC-Boss/GPT-SoVITS/pull/108)
  - İçerik: WebUI'ya İngilizce sistem çeviri desteği eklendi.
  - Tür: Dokümantasyon
@ -332,6 +330,8 @@
  - Tür: Optimizasyon
  - Katkıda Bulunan: RVC-Boss, GoHomeToMacDonal
  - İlgili: [PR#672](https://github.com/RVC-Boss/GPT-SoVITS/pull/672)
 - Gelecek güncellemeler, `fast_inference` dalındaki değişikliklerin tutarlılığını doğrulamaya devam edecek.
 - 2024.07.13 [PR#1294](https://github.com/RVC-Boss/GPT-SoVITS/pull/1294), [PR#1298](https://github.com/RVC-Boss/GPT-SoVITS/pull/1298)
  - İçerik: i18n taraması yeniden düzenlendi ve çok dilli yapılandırma dosyaları güncellendi
  - Tür: Dokümantasyon
@ -578,3 +578,160 @@
  - İçerik: Otomatik hassasiyet algılama mantığı optimize edildi; WebUI önyüz modüllerine katlanabilir özellik eklendi
  - Tür: Yeni Özellik
  - Katkıda Bulunanlar: XXXXRT666, RVC-Boss
 - 2025.06.06 [PR#2427](https://github.com/RVC-Boss/GPT-SoVITS/pull/2427)
  - İçerik: "X一X" kalıbı için çok sesli harf tespitini düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: wzy3650
 - 2025.06.05 [PR#2439](https://github.com/RVC-Boss/GPT-SoVITS/pull/2439)
  - İçerik: Yapılandırma düzeltmesi; SoVITS model yüklemesini düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: wzy3650
 - 2025.06.09 [Commit#8056efe4](https://github.com/RVC-Boss/GPT-SoVITS/commit/8056efe4ab7bbc3610c72ae356a6f37518441f7d)
  - İçerik: `ge.sum` kaynaklı olası sayısal patlamayı (sessiz çıkarıma yol açan) düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: RVC-Boss
 - 2025.06.10 [Commit#2c0436b9](https://github.com/RVC-Boss/GPT-SoVITS/commit/2c0436b9ce397424ae03476c836fb64c6e5ebcc6)
  - İçerik: Deney adı boşlukla bittiğinde oluşan hatalı Windows yolunu düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: RVC-Boss
 - 2025.06.10 [PR#2449](https://github.com/RVC-Boss/GPT-SoVITS/pull/2449)
  - İçerik: Dil bölütlemeyi optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: KamioRinn
 - 2025.06.11 [PR#2450](https://github.com/RVC-Boss/GPT-SoVITS/pull/2450)
  - İçerik: v2pro için paralel çıkarım desteğindeki hatayı düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: YYuX-1145
 - 2025.06.11 [Commit#ed89a023](https://github.com/RVC-Boss/GPT-SoVITS/commit/ed89a023378dabba9d4b6580235bb9742245816d)
  - İçerik: v2pro için `ge` çıkarımındaki sayısal taşma sorununu düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: RVC-Boss
 - 2025.06.17 [PR#2464](https://github.com/RVC-Boss/GPT-SoVITS/pull/2464) [PR#2482](https://github.com/RVC-Boss/GPT-SoVITS/pull/2482)
  - İçerik: `install.sh` mantığını optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: XXXXRT666
 - 2025.06.27 [PR#2489](https://github.com/RVC-Boss/GPT-SoVITS/pull/2489)
  - İçerik: onnxruntime yükleme mantığını optimize et (GPU/CPU algılama)
  - Tür: Optimizasyon
  - Katkıda Bulunan: KamioRinn
 - 2025.06.27 [PR#2488](https://github.com/RVC-Boss/GPT-SoVITS/pull/2488)
  - İçerik: Dil bölütleme ve biçimlendirmeyi optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: KamioRinn
 ## 202507
 - 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
  - İçerik: Çıkarım işlem önceliğini artır (Win11'de olası GPU kullanım sınırlamasını düzelt)
  - Tür: Optimizasyon
  - Katkıda Bulunan: XianYue0125
 - 2025.07.16 [PR#2490](https://github.com/RVC-Boss/GPT-SoVITS/pull/2490)
  - İçerik: TTS.py'nin gerçekte desteklenen sürümler olan v2Pro ve v2ProPlus'ı tanımaması sorununu düzelt ve varsayılan yapılandırmayı güncelle
  - Tür: Düzeltme
  - Katkıda Bulunan: jiangsier-xyz
 - 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
  - İçerik: Paralel çıkarım modunda v2pro model tanıma sorununu düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: RVC-Boss
 - 2025.07.17 [PR#2531](https://github.com/RVC-Boss/GPT-SoVITS/pull/2531)
  - İçerik: Whisper ASR daha uygun maliyetli distill modellerini destekler
  - Tür: Optimizasyon
  - Katkıda Bulunan: XXXXRT666
 - 2025.07.18 [PR#2536](https://github.com/RVC-Boss/GPT-SoVITS/pull/2536)
  - İçerik: `TTS_Config` kod mantığını optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: ChasonJiang
 - 2025.07.18 [PR#2537](https://github.com/RVC-Boss/GPT-SoVITS/pull/2537)
  - İçerik: GPT kayıp (loss) hesaplama sorununu düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: ChasonJiang
 ## 202508
 - 2025.08.02 [PR#2561](https://github.com/RVC-Boss/GPT-SoVITS/pull/2561)
  - İçerik: WSL Rocm
  - Tür: Düzeltme
  - Katkıda Bulunan: XXXXRT666
 ## 202509
 - 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
  - İçerik: Ortam değişkeninin dize (string) olmaması sorununu düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: RVC-Boss
 ## 202511
 - 2025.11.28 [PR#2671](https://github.com/RVC-Boss/GPT-SoVITS/pull/2671) [PR#2678](https://github.com/RVC-Boss/GPT-SoVITS/pull/2678)
  - İçerik: Akışlı çıkarım (streaming inference)
  - Tür: Yeni Özellik
  - Katkıda Bulunan: ChasonJiang
 - 2025.11.28 [PR#2636](https://github.com/RVC-Boss/GPT-SoVITS/pull/2636)
  - İçerik: Matematiksel ifade metinleri için metin ön uç (frontend) mantığını optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: KamioRinn
 - 2025.11.28 [PR#2469](https://github.com/RVC-Boss/GPT-SoVITS/pull/2469)
  - İçerik: Akışlı çıkarım (streaming inference)
  - Tür: Yeni Özellik
  - Katkıda Bulunan: L-jasmine
 - 2025.11.28 [PR#2577](https://github.com/RVC-Boss/GPT-SoVITS/pull/2577)
  - İçerik: VQ dağıtılmış eğitimi destekle
  - Tür: Optimizasyon
  - Katkıda Bulunan: wzy3650
 - 2025.11.28 [PR#2627](https://github.com/RVC-Boss/GPT-SoVITS/pull/2627) [PR#2679](https://github.com/RVC-Boss/GPT-SoVITS/pull/2679)
  - İçerik: ASR model indirme mantığını optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: XXXXRT666
 - 2025.11.28 [PR#2662](https://github.com/RVC-Boss/GPT-SoVITS/pull/2662)
  - İçerik: Varsayılan parti boyutu (batch size) hatasını düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: Spr-Aachen
 ## 202512
 - 2025.12.30 [PR#2703](https://github.com/RVC-Boss/GPT-SoVITS/pull/2703) [PR#2704](https://github.com/RVC-Boss/GPT-SoVITS/pull/2704)
  - İçerik: Örnekleme (sampling) hatasını düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: ChasonJiang
 ## 202602
 - 2026.02.08 [PR#2727](https://github.com/RVC-Boss/GPT-SoVITS/pull/2727)
  - İçerik: Kabul edilmeyen Conda koşullarının neden olduğu derleme hatasını düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: Oarora
 - 2026.02.09 [PR#2732](https://github.com/RVC-Boss/GPT-SoVITS/pull/2732)
  - İçerik: Otomatik ortam kurulumunu optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: XXXXRT666
 ## 202604
 - 2026.04.18 [PR#2763](https://github.com/RVC-Boss/GPT-SoVITS/pull/2763)
  - İçerik: Uzun cümlelerde gereksiz hesaplama ve çıkarım yükünü azaltmak için G2PW çıkarım girdi oluşturmayı ve çok sesli harf işlemeyi optimize et
  - Tür: Optimizasyon
  - Katkıda Bulunan: baicai-1145
 - 2026.04.18 [PR#2767](https://github.com/RVC-Boss/GPT-SoVITS/pull/2767)
  - İçerik: Windows altında tek kartta GPT-SoVITS v3 için LoRA eğitim akışını iyileştir
  - Tür: Optimizasyon
  - Katkıda Bulunan: 2409324124
 - 2026.04.18 [PR#2755](https://github.com/RVC-Boss/GPT-SoVITS/pull/2755)
  - İçerik: Birden çok modüldeki çeşitli hataları düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: wishhyt
 - 2026.04.18 [PR#2758](https://github.com/RVC-Boss/GPT-SoVITS/pull/2758)
  - İçerik: Veri kümesi işleme için hata işleme ipuçları ekle
  - Tür: Optimizasyon
  - Katkıda Bulunan: mushroomcowisheggs
 - 2026.04.18 [PR#2753](https://github.com/RVC-Boss/GPT-SoVITS/pull/2753)
  - İçerik: Paralel çıkarımdaki bazı hataları düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: wishhyt
 - 2026.04.18 [PR#2733](https://github.com/RVC-Boss/GPT-SoVITS/pull/2733)
  - İçerik: DPO eğitiminin eksik kelime simülasyonunu desteklememe hatasını düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: Mr-Neutr0n
 - 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
  - İçerik: ONNX betiğinde (Optional vb.) eksik içe aktarmaları düzelt
  - Tür: Düzeltme
  - Katkıda Bulunan: RVC-Boss
--- a/tools/asr/funasr_asr.py
+++ b/tools/asr/funasr_asr.py
@ -39,6 +39,7 @@ def create_model(language="zh"):
            local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        )
        model_revision = "v2.0.4"
        vad_model_revision = punc_model_revision = "v2.0.4"
    elif language == "yue":
        path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
        snapshot_download(
@ -51,8 +52,6 @@ def create_model(language="zh"):
    else:
        raise ValueError(f"{language} is not supported")
    vad_model_revision = punc_model_revision = "v2.0.4"
    if language in funasr_models:
        return funasr_models[language]
    else:
--- a/tools/uvr5/lib/lib_v5/spec_utils.py
+++ b/tools/uvr5/lib/lib_v5/spec_utils.py
@ -485,6 +485,8 @@ def istft(spec, hl):
    wave_right = librosa.istft(spec_right, hop_length=hl)
    wave = np.asfortranarray([wave_left, wave_right])
    return wave
 if __name__ == "__main__":
    import argparse
Author	SHA1	Message	Date
FengQingYunDan	bd3dfa685c	Merge 50a88a596dea718c83e535136e9cb46b513cef6f into 08d627c3338173c3229286d8787060d6559fe0f8	2026-05-14 01:31:35 +08:00
RVC-Boss	08d627c333	增加cuda graph支持，普通推理模式推理速度原地翻倍，效果不变。2	2026-04-30 15:01:45 +08:00
RVC-Boss	6d95b559e8	增加cuda graph支持，普通推理模式推理速度原地翻倍，效果不变。1	2026-04-30 15:01:11 +08:00
RVC-Boss	ea2d2a8166	Update README.md	2026-04-19 21:02:57 +08:00
SapphireLab	d9f03dad3e	Update Documentation (#2768) * 调整日志格式 * docs: Update other languages' changelogs	2026-04-18 22:33:55 +08:00
RVC-Boss	647935357a	Update Changelog_CN.md	2026-04-18 19:01:11 +08:00
RVC-Boss	02425ea256	Fixed issues such as missing imports for types like `Optional`.	2026-04-18 17:33:53 +08:00
Harikrishna KP	938f05fce8	fix: correct torch.randint upper bound to include both values (#2733)	2026-04-18 17:19:55 +08:00
huang yutong	445d18ccce	fix: 修复 TTS 音频后处理中的多个缺陷 (#2753 ) 1. 修复音频超采样时 int16 双重转换导致整数溢出（CRITICAL） - audio_postprocess 中 `audio = (audio * 32768).astype(np.int16)` 位于 if/else 块之外无条件执行，当 super_sampling=True 时音频已在分支内转为 int16，再次乘以 32768 导致溢出和音频完全失真 - 同时修复 super_sampling=True 但超分模型不存在时 torch.Tensor 调用 .astype() 的 AttributeError 2. 修复 batched vocoder 推理中 padding_len=0 导致音频丢失（HIGH） - 当 padding_len 恰好为 0 时，`-0 * upsample_rate == 0`，切片 `audio[x:0]` 返回空张量，导致整段音频丢失 3. 修复文件不存在时错误地抛出 FileExistsError（LOW） - 应为 FileNotFoundError Made-with: Cursor	2026-04-18 17:16:24 +08:00
Mushroomcowisheggs	00ce973412	feat: 添加数据集的错误处理提示 (#2758) Co-authored-by: moomushroom <107208254+moomushroom@users.noreply.github.com>	2026-04-18 17:13:30 +08:00
huang yutong	14191901cd	fix: 修复多个模块中的独立 bug (#2755 ) 1. 修复 sync_buffer 中除以函数对象而非调用结果（distrib.py） - `buffer.data /= world_size` 中 world_size 是函数，缺少 ()，导致 TypeError 使分布式训练 buffer 同步失败 2. 修复 istft 函数缺少 return 语句（spec_utils.py） - 函数计算了结果但未返回，调用者始终得到 None 3. 修复 cut0 返回字面量 "/n" 而非换行符 "\n"（text_segmentation_method.py） - 导致后续 text.split("\n") 无法正确切分，字面 /n 被当作文本内容 4. 修复粤语 ASR 的 vad/punc model_revision 被无条件覆盖（funasr_asr.py） - 粤语分支将 vad_model_revision 设为空（因不使用 VAD/标点模型），但 if/else 外的赋值将其覆盖为 "v2.0.4"，传入错误的 revision 参数 Made-with: Cursor	2026-04-18 17:10:56 +08:00
东云	780383d5bd	[codex] Improve Windows single-GPU v3 LoRA training / 改进 Windows 单卡 v3 LoRA 训练流程 (#2767 ) * Improve Windows single-GPU v3 LoRA training * Drop unrelated checkpoint helper change from PR * Tighten PR scope to single-GPU training path fixes	2026-04-18 16:54:26 +08:00
白菜工厂1145号员工	ba8de9b760	优化 G2PW 的推理输入构造与多音字处理流程，减少重复计算，降低长句场景下的推理开销 (#2763 ) * Enhance G2P processing by implementing batch input handling in _g2p function, improving efficiency. Update prepare_onnx_input to utilize caching for tokenization and add optional parameters for character ID mapping and phoneme masks. Refactor G2PWOnnxConverter to streamline model loading and configuration management. * Enhance G2PW model input handling by introducing polyphonic context character support and updating the data preparation method to return additional query IDs. This improves the processing of polyphonic characters in sentences.	2026-04-18 16:52:32 +08:00
Downupanddownup	50a88a596d	更新到gsv官方最新版本	2024-09-15 19:25:58 +08:00
Downupanddownup	86e5b67448	减少推理文本	2024-08-05 20:11:11 +08:00
Downupanddownup	2faf74beaa	Merge branch 'main' into ref_audio_selector_tool	2024-06-22 10:00:36 +08:00
Downupanddownup	16b3c2a131	--	2024-06-15 02:06:10 +08:00
Downupanddownup	9f418af1dd	--	2024-06-06 22:02:40 +08:00
Downupanddownup	5ffb193bcd	初始化为数字	2024-06-06 18:14:38 +08:00
Downupanddownup	56d6ae6b3b	Merge branch 'main' into ref_audio_selector_tool	2024-06-06 18:04:49 +08:00
Downupanddownup	7c3c778b17	添加可能存在高频齿音的文本	2024-05-07 19:08:00 +08:00
Downupanddownup	61b21e1fca	Merge branch 'main' into ref_audio_selector_tool	2024-05-03 07:10:29 +08:00
Downupanddownup	18002ad809	bug修复	2024-05-02 09:58:28 +08:00
Downupanddownup	12fa7d875f	bug修复	2024-05-02 09:56:00 +08:00
Downupanddownup	48cc70a7de	bug修复	2024-05-02 08:02:56 +08:00
Downupanddownup	3ac7aad4d0	bug修复	2024-05-02 07:39:43 +08:00
Downupanddownup	036d828a7e	bug修复	2024-05-02 07:26:26 +08:00
Downupanddownup	7e1c40ef9f	00	2024-05-01 21:52:09 +08:00
Downupanddownup	fdffd50066	00	2024-05-01 21:23:28 +08:00
Downupanddownup	8a10c528e3	bug修复	2024-05-01 01:49:37 +08:00
Downupanddownup	02fabe807f	bug修复	2024-05-01 00:46:42 +08:00
Downupanddownup	4ebcb3bf1b	bug修复	2024-05-01 00:24:40 +08:00
Downupanddownup	5843d56c4e	bug修复	2024-05-01 00:11:27 +08:00
Downupanddownup	7660f1c8fb	优化监控信息	2024-04-30 10:59:31 +08:00
Downupanddownup	fa45c5ac4f	bug修复	2024-04-30 10:04:41 +08:00
Downupanddownup	2dc36d3d60	添加流程说明	2024-04-29 15:42:46 +08:00
Downupanddownup	f70fd8ff87	优化说明	2024-04-29 15:37:47 +08:00
Downupanddownup	ed8d276ac9	优化说明	2024-04-29 15:02:22 +08:00
Downupanddownup	1de89feb7b	优化代码	2024-04-29 14:49:27 +08:00
Downupanddownup	b8356880dc	优化代码	2024-04-29 14:19:24 +08:00
Downupanddownup	8182908f7d	添加说话人确认模型切换	2024-04-29 14:14:16 +08:00
Downupanddownup	5081168918	添加切换重置按钮事件	2024-04-29 13:04:33 +08:00
Downupanddownup	c26fa983a4	参考类型，添加选择	2024-04-29 11:23:41 +08:00
Downupanddownup	5280d17d2f	ui布局调整	2024-04-29 10:49:07 +08:00
Downupanddownup	371a2d7138	bug调整	2024-04-29 10:13:22 +08:00
Downupanddownup	fe969ab9a2	测试	2024-04-29 00:58:15 +08:00
Downupanddownup	61db7f05dc	测试	2024-04-29 00:41:27 +08:00
Downupanddownup	536c226b1a	添加url消息提示	2024-04-29 00:32:01 +08:00
Downupanddownup	01468158d3	添加url消息提示	2024-04-29 00:29:52 +08:00
Downupanddownup	c9547ab669	添加url消息提示	2024-04-29 00:27:38 +08:00
Downupanddownup	b1ad8b5dcd	bug修复	2024-04-29 00:20:13 +08:00
Downupanddownup	d6e255a071	添加windows下启动文件	2024-04-28 20:21:15 +08:00
Downupanddownup	e89f986e3f	添加ui参数写入	2024-04-28 19:07:09 +08:00
Downupanddownup	af0bd9f414	添加ui初始化值	2024-04-28 18:47:44 +08:00
Downupanddownup	13567362d9	提取一部分公共组件	2024-04-28 16:44:45 +08:00
Downupanddownup	27325f4cf9	调整项目结构，修复随机采样bug	2024-04-28 15:49:05 +08:00
Downupanddownup	6cb3c15448	添加非中文语言的asr操作	2024-04-28 15:20:10 +08:00
Downupanddownup	9264f7e38e	添加事件绑定和实现	2024-04-28 14:10:02 +08:00
Downupanddownup	25b65cdfd0	调整ui布局	2024-04-27 22:09:03 +08:00
Downupanddownup	1a7cf580e0	创建日志目录	2024-04-27 11:24:57 +08:00
Downupanddownup	c36d0a93fe	api推理，添加多进程请求	2024-04-27 01:27:57 +08:00
Downupanddownup	2a23f95f61	bug修复	2024-04-26 22:55:09 +08:00
Downupanddownup	d1e92edc7c	添加一些参数的读取和保存	2024-04-26 17:46:40 +08:00
Downupanddownup	d8d551d4d2	bug修复	2024-04-26 17:10:23 +08:00
Downupanddownup	1d434e1a0a	添加初始启动时的默认值	2024-04-26 17:01:03 +08:00
Downupanddownup	9fe20c14d6	添加音频预采样开关	2024-04-26 16:27:21 +08:00
Downupanddownup	64cc2fd9d1	将打印信息，改由日志输出	2024-04-26 16:18:40 +08:00
Downupanddownup	a291629438	音频相似度比较，添加参考音频的预采样步骤	2024-04-26 15:37:58 +08:00
Downupanddownup	e3e47d2c06	音频相似度比较，添加参考音频的预采样步骤	2024-04-26 15:08:33 +08:00
Downupanddownup	ca9ffbf98e	音频相似度比较，添加参考音频的预采样步骤	2024-04-26 15:00:34 +08:00
Downupanddownup	684e1cfd2f	文本相似度，添加GPU加速	2024-04-26 14:31:54 +08:00
Downupanddownup	878fef248a	bug修复	2024-04-26 14:16:16 +08:00
Downupanddownup	2880e3a6f8	添加性能监控	2024-04-26 13:25:02 +08:00
Downupanddownup	1da23aa259	bug修复	2024-04-25 22:54:40 +08:00
Downupanddownup	c8be484c0e	添加路径清理	2024-04-25 19:09:27 +08:00
Downupanddownup	d855eecc7b	添加目录保存	2024-04-25 18:50:52 +08:00
Downupanddownup	d20bd37965	调整配置参数，进行集中管理	2024-04-25 17:36:13 +08:00
Downupanddownup	926dd6b34a	调整配置管理，去除写入	2024-04-25 17:13:30 +08:00
Downupanddownup	f61a723bab	添加3s至10s的音频过滤	2024-04-25 16:45:42 +08:00
Downupanddownup	441ab54889	url编码调整	2024-04-25 16:39:56 +08:00
Downupanddownup	ecbc7d0b1e	添加配置文件管理	2024-04-25 16:20:11 +08:00
Downupanddownup	b6f0bb36ef	添加同步参考音频代码	2024-04-25 13:26:32 +08:00
Downupanddownup	4daa9ad53c	添加文本相似度比较功能	2024-04-25 11:54:13 +08:00
Downupanddownup	2c8f6bd4c9	配置文件生成、音频抽样、音频推理测试	2024-04-25 00:22:58 +08:00
Downupanddownup	4cbbe2a258	调整目录结构	2024-04-24 18:57:36 +08:00
Downupanddownup	a1fc00a9d8	调整目录结构	2024-04-24 18:53:00 +08:00
Downupanddownup	8c9627bb30	功能补全	2024-04-24 16:58:02 +08:00
Downupanddownup	e69e449599	功能补全	2024-04-24 16:54:51 +08:00
Downupanddownup	29b8370c45	添加根据list，转换参考音频的方法	2024-04-23 23:56:49 +08:00
Downupanddownup	7efdf31113	添加参考音频筛功能选界面	2024-04-23 23:31:02 +08:00