Compare commits

...

8 Commits

Author SHA1 Message Date
RVC-Boss
647935357a
Update Changelog_CN.md 2026-04-18 19:01:11 +08:00
RVC-Boss
02425ea256
Fixed issues such as missing imports for types like `Optional`.
2026-04-18 17:33:53 +08:00
Harikrishna KP
938f05fce8
fix: correct torch.randint upper bound to include both values (#2733) 2026-04-18 17:19:55 +08:00
huang yutong
445d18ccce
fix: fix multiple defects in TTS audio post-processing (#2753)
1. Fix integer overflow from a double int16 conversion during audio super-sampling (CRITICAL)
   - In audio_postprocess, `audio = (audio * 32768).astype(np.int16)` sits outside the
     if/else block and runs unconditionally; with super_sampling=True the audio has already
     been converted to int16 inside the branch, so multiplying by 32768 again overflows and
     completely distorts the audio
   - Also fix the AttributeError from calling .astype() on a torch.Tensor when
     super_sampling=True but the super-resolution model is missing

2. Fix audio loss in batched vocoder inference when padding_len=0 (HIGH)
   - When padding_len happens to be 0, `-0 * upsample_rate == 0`, so the slice
     `audio[x:0]` returns an empty tensor and the whole audio segment is lost

3. Fix FileExistsError being raised when a file does not exist (LOW)
   - Should be FileNotFoundError

Made-with: Cursor
2026-04-18 17:16:24 +08:00
Mushroomcowisheggs
00ce973412
feat: add error-handling messages for the dataset (#2758)
Co-authored-by: moomushroom <107208254+moomushroom@users.noreply.github.com>
2026-04-18 17:13:30 +08:00
huang yutong
14191901cd
fix: fix independent bugs across several modules (#2755)
1. Fix sync_buffer dividing by a function object instead of its return value (distrib.py)
   - In `buffer.data /= world_size`, world_size is a function and the missing ()
     raises a TypeError, so buffer synchronization fails in distributed training

2. Fix the missing return statement in the istft function (spec_utils.py)
   - The function computed the result but never returned it, so callers always got None

3. Fix cut0 returning the literal "/n" instead of the newline "\n" (text_segmentation_method.py)
   - The downstream text.split("\n") could not split correctly, and the literal /n
     was treated as text content

4. Fix the Cantonese ASR vad/punc model_revision being unconditionally overwritten (funasr_asr.py)
   - The Cantonese branch leaves vad_model_revision empty (it uses no VAD/punctuation
     models), but an assignment outside the if/else overwrote it with "v2.0.4",
     passing the wrong revision parameter

Made-with: Cursor
2026-04-18 17:10:56 +08:00
东云
780383d5bd
[codex] Improve Windows single-GPU v3 LoRA training / 改进 Windows 单卡 v3 LoRA 训练流程 (#2767)
* Improve Windows single-GPU v3 LoRA training

* Drop unrelated checkpoint helper change from PR

* Tighten PR scope to single-GPU training path fixes
2026-04-18 16:54:26 +08:00
白菜工厂1145号员工
ba8de9b760
Optimize G2PW inference input construction and the polyphonic-character pipeline to cut redundant computation and reduce inference overhead on long sentences (#2763)
* Enhance G2P processing by implementing batch input handling in _g2p function, improving efficiency. Update prepare_onnx_input to utilize caching for tokenization and add optional parameters for character ID mapping and phoneme masks. Refactor G2PWOnnxConverter to streamline model loading and configuration management.

* Enhance G2PW model input handling by introducing polyphonic context character support and updating the data preparation method to return additional query IDs. This improves the processing of polyphonic characters in sentences.
2026-04-18 16:52:32 +08:00
13 changed files with 434 additions and 137 deletions

View File

@ -67,8 +67,10 @@ class Text2SemanticDataset(Dataset):
)
) # "%s/3-bert"%exp_dir#bert_dir
self.path6 = semantic_path # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
assert os.path.exists(self.path2)
assert os.path.exists(self.path6)
if not os.path.exists(self.path2):
raise FileNotFoundError(f"Phoneme data file not found: {self.path2}")
if not os.path.exists(self.path6):
raise FileNotFoundError(f"Semantic data file not found: {self.path6}")
self.phoneme_data = {}
with open(self.path2, "r", encoding="utf8") as f:
lines = f.read().strip("\n").split("\n")
@ -131,7 +133,7 @@ class Text2SemanticDataset(Dataset):
phoneme, word2ph, text = self.phoneme_data[item_name]
except Exception:
traceback.print_exc()
# print(f"{item_name} not in self.phoneme_data !")
print(f"Warning: File \"{item_name}\" not in self.phoneme_data! Skipped. ")
num_not_in += 1
continue
@ -152,7 +154,7 @@ class Text2SemanticDataset(Dataset):
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except:
traceback.print_exc()
# print(f"{item_name} not in self.phoneme_data !")
print(f"Warning: Failed to convert phonemes to sequence for file \"{item_name}\"! Skipped. ")
num_not_in += 1
continue
# if len(phoneme_ids) >400:###########2改为恒定限制为semantic/2.5就行
@ -228,7 +230,11 @@ class Text2SemanticDataset(Dataset):
# bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
bert_feature = None
else:
assert bert_feature.shape[-1] == len(phoneme_ids)
try:
assert bert_feature.shape[-1] == len(phoneme_ids)
except AssertionError:
print(f"AssertionError: The BERT feature dimension ({bert_feature.shape[-1]}) of the file '{item_name}' does not match the length of the phoneme sequence ({len(phoneme_ids)}).")
raise
return {
"idx": idx,
"phoneme_ids": phoneme_ids,

View File

@ -262,7 +262,7 @@ def make_reject_y(y_o, y_lens):
reject_y = []
reject_y_lens = []
for b in range(bs):
process_item_idx = torch.randint(0, 1, size=(1,))[0]
process_item_idx = torch.randint(0, 2, size=(1,))[0]
if process_item_idx == 0:
new_y = repeat_P(y_o[b])
reject_y.append(new_y)
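The change above works because `torch.randint(low, high)` samples from the half-open interval [low, high): `randint(0, 1)` can only ever return 0, so the second augmentation branch was dead code. A minimal standalone check (the sample count is arbitrary):

```python
import torch

# torch.randint draws from [low, high): the upper bound is exclusive.
only_zeros = torch.randint(0, 1, size=(1000,))
print(only_zeros.unique().tolist())   # [0] -- the else-branch could never run

both = torch.randint(0, 2, size=(1000,))
print(both.unique().tolist())         # [0, 1] (with overwhelming probability)
```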

View File

@ -8,30 +8,30 @@ def multi_head_attention_forward_patched(
query,
key,
value,
embed_dim_to_check: int,
num_heads: int,
embed_dim_to_check,
num_heads,
in_proj_weight,
in_proj_bias: Optional[Tensor],
bias_k: Optional[Tensor],
bias_v: Optional[Tensor],
add_zero_attn: bool,
dropout_p: float,
out_proj_weight: Tensor,
out_proj_bias: Optional[Tensor],
training: bool = True,
key_padding_mask: Optional[Tensor] = None,
need_weights: bool = True,
attn_mask: Optional[Tensor] = None,
use_separate_proj_weight: bool = False,
q_proj_weight: Optional[Tensor] = None,
k_proj_weight: Optional[Tensor] = None,
v_proj_weight: Optional[Tensor] = None,
static_k: Optional[Tensor] = None,
static_v: Optional[Tensor] = None,
average_attn_weights: bool = True,
is_causal: bool = False,
in_proj_bias,
bias_k,
bias_v,
add_zero_attn,
dropout_p,
out_proj_weight,
out_proj_bias,
training=True,
key_padding_mask=None,
need_weights=True,
attn_mask=None,
use_separate_proj_weight=False,
q_proj_weight=None,
k_proj_weight=None,
v_proj_weight=None,
static_k=None,
static_v=None,
average_attn_weights=True,
is_causal=False,
cache=None,
) -> Tuple[Tensor, Optional[Tensor]]:
):
# set up shape vars
_, _, embed_dim = query.shape
attn_mask = _canonical_mask(

View File

@ -499,7 +499,7 @@ class TTS:
if if_lora_v3 == True and os.path.exists(path_sovits) == False:
info = path_sovits + i18n("SoVITS %s 底模缺失,无法加载相应 LoRA 权重" % model_version)
raise FileExistsError(info)
raise FileNotFoundError(info)
# dict_s2 = torch.load(weights_path, map_location=self.configs.device,weights_only=False)
dict_s2 = load_sovits_new(weights_path)
@ -1578,16 +1578,15 @@ class TTS:
max_audio = np.abs(audio).max()
if max_audio > 1:
audio /= max_audio
audio = (audio * 32768).astype(np.int16)
audio = (audio * 32768).astype(np.int16)
else:
audio = audio.cpu().numpy()
audio = (audio * 32768).astype(np.int16)
t2 = time.perf_counter()
print(f"超采样用时:{t2 - t1:.3f}s")
else:
# audio = audio.float() * 32768
# audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy()
audio = audio.cpu().numpy()
audio = (audio * 32768).astype(np.int16)
audio = (audio * 32768).astype(np.int16)
# try:
@ -1768,7 +1767,10 @@ class TTS:
pos += chunk_len * upsample_rate
audio = self.sola_algorithm(audio_fragments, overlapped_len * upsample_rate)
audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
if padding_len > 0:
audio = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
else:
audio = audio[overlapped_len * upsample_rate :]
audio_fragments = []
for feat_len in feat_lens:
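The new `padding_len > 0` guard exists because a negative end index of `-0` is just `0`: with `padding_len == 0` the old slice silently dropped the whole fragment. A tiny sketch of the failure mode, with illustrative values:

```python
import numpy as np

audio = np.arange(10)                  # stand-in for one upsampled fragment
overlapped_len, upsample_rate = 2, 1
padding_len = 0

broken = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
print(broken)                          # [] -- the end index is -0 == 0, the audio is lost

if padding_len > 0:                    # guarded version keeps the fragment intact
    trimmed = audio[overlapped_len * upsample_rate : -padding_len * upsample_rate]
else:
    trimmed = audio[overlapped_len * upsample_rate :]
print(trimmed)                         # [2 3 4 5 6 7 8 9]
```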

View File

@ -92,7 +92,7 @@ def cut0(inp):
if not set(inp).issubset(punctuation):
return inp
else:
return "/n"
return "\n"
# 凑四句一切
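The literal matters here: `"/n"` is two characters (a slash and an n), not a newline, so the caller's `text.split("\n")` never splits on it. A quick illustration:

```python
# "/n" is slash + n, not a newline, so splitting on "\n" leaves it in the text.
print("第一句/n第二句".split("\n"))    # ['第一句/n第二句'] -- nothing was split
print("第一句\n第二句".split("\n"))    # ['第一句', '第二句']
```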

View File

@ -87,7 +87,7 @@ def sync_buffer(buffers, average=True):
for buffer, handle in handles:
handle.wait()
if average:
buffer.data /= world_size
buffer.data /= world_size()
def sync_grad(params):
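The missing parentheses meant the buffer was divided by the function object itself; a standalone sketch of the difference (the `world_size` helper below is a stand-in for the real one in distrib.py):

```python
import torch

def world_size() -> int:
    return 4                      # stand-in value; the real helper queries the process group

buffer = torch.ones(3)
try:
    buffer /= world_size          # missing (): dividing by the function object
except TypeError as exc:
    print("TypeError:", exc)

buffer /= world_size()            # calling it yields the intended divisor
print(buffer)                     # tensor([0.2500, 0.2500, 0.2500])
```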

View File

@ -55,6 +55,10 @@ def main():
n_gpus = torch.cuda.device_count()
else:
n_gpus = 1
if n_gpus <= 1:
run(0, n_gpus, hps)
return
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(randint(20000, 55555))
@ -77,12 +81,14 @@ def run(rank, n_gpus, hps):
writer = SummaryWriter(log_dir=hps.s2_ckpt_dir)
writer_eval = SummaryWriter(log_dir=os.path.join(hps.s2_ckpt_dir, "eval"))
dist.init_process_group(
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
use_ddp = n_gpus > 1
if use_ddp:
dist.init_process_group(
backend="gloo" if os.name == "nt" or not torch.cuda.is_available() else "nccl",
init_method="env://?use_libuv=False",
world_size=n_gpus,
rank=rank,
)
torch.manual_seed(hps.train.seed)
if torch.cuda.is_available():
torch.cuda.set_device(rank)
@ -118,15 +124,20 @@ def run(rank, n_gpus, hps):
shuffle=True,
)
collate_fn = TextAudioSpeakerCollate()
train_loader = DataLoader(
train_dataset,
num_workers=5,
worker_count = 0 if os.name == "nt" and n_gpus <= 1 else min(2 if os.name == "nt" else 5, os.cpu_count() or 1)
loader_kwargs = dict(
num_workers=worker_count,
shuffle=False,
pin_memory=True,
pin_memory=torch.cuda.is_available(),
collate_fn=collate_fn,
batch_sampler=train_sampler,
persistent_workers=True,
prefetch_factor=3,
)
if worker_count > 0:
loader_kwargs["persistent_workers"] = True
loader_kwargs["prefetch_factor"] = 2 if os.name == "nt" else 3
train_loader = DataLoader(
train_dataset,
**loader_kwargs,
)
save_root = "%s/logs_s2_%s_lora_%s" % (hps.data.exp_dir, hps.model.version, hps.train.lora_rank)
os.makedirs(save_root, exist_ok=True)
@ -156,7 +167,9 @@ def run(rank, n_gpus, hps):
def model2cuda(net_g, rank):
if torch.cuda.is_available():
net_g = DDP(net_g.cuda(rank), device_ids=[rank], find_unused_parameters=True)
net_g = net_g.cuda(rank)
if use_ddp:
net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
else:
net_g = net_g.to(device)
return net_g
@ -242,6 +255,8 @@ def run(rank, n_gpus, hps):
None,
)
scheduler_g.step()
if use_ddp and dist.is_initialized():
dist.destroy_process_group()
print("training done")

View File

@ -180,10 +180,15 @@ def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) ->
def _g2p(segments):
phones_list = []
word2ph = []
for seg in segments:
g2pw_batch_results = []
g2pw_batch_cursor = 0
processed_segments = [re.sub("[a-zA-Z]+", "", seg) for seg in segments]
if is_g2pw:
batch_inputs = [seg for seg in processed_segments if seg]
g2pw_batch_results = g2pw._g2pw(batch_inputs) if batch_inputs else []
for seg in processed_segments:
pinyins = []
# Replace all English words in the sentence
seg = re.sub("[a-zA-Z]+", "", seg)
seg_cut = psg.lcut(seg)
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
initials = []
@ -204,8 +209,10 @@ def _g2p(segments):
finals = sum(finals, [])
print("pypinyin结果", initials, finals)
else:
# g2pw采用整句推理
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
# g2pw采用整句推理批量推理逐句取结果
if seg:
pinyins = g2pw_batch_results[g2pw_batch_cursor]
g2pw_batch_cursor += 1
pre_word_length = 0
for word, pos in seg_cut:

View File

@ -18,6 +18,7 @@ Credits
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
import numpy as np
@ -37,6 +38,8 @@ def prepare_onnx_input(
use_mask: bool = False,
window_size: int = None,
max_len: int = 512,
char2id: Optional[Dict[str, int]] = None,
char_phoneme_masks: Optional[Dict[str, List[int]]] = None,
) -> Dict[str, np.array]:
if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(
@ -48,33 +51,88 @@ def prepare_onnx_input(
phoneme_masks = []
char_ids = []
position_ids = []
tokenized_cache = {}
if char2id is None:
char2id = {char: idx for idx, char in enumerate(chars)}
if use_mask:
if char_phoneme_masks is None:
char_phoneme_masks = {
char: [1 if i in char2phonemes[char] else 0 for i in range(len(labels))]
for char in char2phonemes
}
else:
full_phoneme_mask = [1] * len(labels)
for idx in range(len(texts)):
text = (truncated_texts if window_size else texts)[idx].lower()
query_id = (truncated_query_ids if window_size else query_ids)[idx]
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
cached = tokenized_cache.get(text)
if cached is None:
try:
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
text, query_id, tokens, text2token, token2text = _truncate(
max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
)
if len(tokens) <= max_len - 2:
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
shared_input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
shared_token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
shared_attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
cached = {
"is_short": True,
"tokens": tokens,
"text2token": text2token,
"token2text": token2text,
"input_id": shared_input_id,
"token_type_id": shared_token_type_id,
"attention_mask": shared_attention_mask,
}
else:
cached = {
"is_short": False,
"tokens": tokens,
"text2token": text2token,
"token2text": token2text,
}
tokenized_cache[text] = cached
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
if cached["is_short"]:
text_for_query = text
query_id_for_query = query_id
text2token_for_query = cached["text2token"]
input_id = cached["input_id"]
token_type_id = cached["token_type_id"]
attention_mask = cached["attention_mask"]
else:
(
text_for_query,
query_id_for_query,
tokens_for_query,
text2token_for_query,
_token2text_for_query,
) = _truncate(
max_len=max_len,
text=text,
query_id=query_id,
tokens=cached["tokens"],
text2token=cached["text2token"],
token2text=cached["token2text"],
)
processed_tokens = ["[CLS]"] + tokens_for_query + ["[SEP]"]
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
query_char = text[query_id]
phoneme_mask = (
[1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
)
char_id = chars.index(query_char)
position_id = text2token[query_id] + 1 # [CLS] token locate at first place
query_char = text_for_query[query_id_for_query]
if use_mask:
phoneme_mask = char_phoneme_masks[query_char]
else:
phoneme_mask = full_phoneme_mask
char_id = char2id[query_char]
position_id = text2token_for_query[query_id_for_query] + 1 # [CLS] token locate at first place
input_ids.append(input_id)
token_type_ids.append(token_type_id)
@ -83,10 +141,15 @@ def prepare_onnx_input(
char_ids.append(char_id)
position_ids.append(position_id)
max_token_length = max(len(seq) for seq in input_ids)
def _pad_sequences(sequences, pad_value=0):
return [seq + [pad_value] * (max_token_length - len(seq)) for seq in sequences]
outputs = {
"input_ids": np.array(input_ids).astype(np.int64),
"token_type_ids": np.array(token_type_ids).astype(np.int64),
"attention_masks": np.array(attention_masks).astype(np.int64),
"input_ids": np.array(_pad_sequences(input_ids, pad_value=0)).astype(np.int64),
"token_type_ids": np.array(_pad_sequences(token_type_ids, pad_value=0)).astype(np.int64),
"attention_masks": np.array(_pad_sequences(attention_masks, pad_value=0)).astype(np.int64),
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
"char_ids": np.array(char_ids).astype(np.int64),
"position_ids": np.array(position_ids).astype(np.int64),

View File

@ -10,7 +10,6 @@ from typing import Any, Dict, List, Tuple
import numpy as np
import onnxruntime
import requests
import torch
from opencc import OpenCC
from pypinyin import Style, pinyin
from transformers.models.auto.tokenization_auto import AutoTokenizer
@ -22,9 +21,8 @@ from .utils import load_config
onnxruntime.set_default_logger_severity(3)
try:
onnxruntime.preload_dlls()
except:
except Exception:
pass
# traceback.print_exc()
warnings.filterwarnings("ignore")
model_version = "1.1"
@ -55,6 +53,24 @@ def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[Lis
return all_preds, all_confidences
def _load_json_from_candidates(filename: str, candidate_dirs: List[str]) -> Dict[str, Any]:
for candidate_dir in candidate_dirs:
if not candidate_dir:
continue
json_path = os.path.join(candidate_dir, filename)
if os.path.exists(json_path):
with open(json_path, "r", encoding="utf-8") as fr:
return json.load(fr)
raise FileNotFoundError(f"Cannot locate {filename} in candidate dirs: {candidate_dirs}")
def _find_first_existing_file(*paths: str) -> str:
for path in paths:
if path and os.path.exists(path):
return path
raise FileNotFoundError(f"Files not found: {paths}")
def download_and_decompress(model_dir: str = "G2PWModel/"):
if not os.path.exists(model_dir):
parent_directory = os.path.dirname(model_dir)
@ -62,7 +78,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
print("Downloading g2pw model...")
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status()
with open(zip_dir, "wb") as f:
@ -79,7 +95,7 @@ def download_and_decompress(model_dir: str = "G2PWModel/"):
return model_dir
class G2PWOnnxConverter:
class _G2PWBaseOnnxConverter:
def __init__(
self,
model_dir: str = "G2PWModel/",
@ -87,33 +103,16 @@ class G2PWOnnxConverter:
model_source: str = None,
enable_non_tradional_chinese: bool = False,
):
uncompress_path = download_and_decompress(model_dir)
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2 if torch.cuda.is_available() else 0
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
else:
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
self.model_dir = download_and_decompress(model_dir)
self.config = load_config(config_path=os.path.join(self.model_dir, "config.py"), use_default=True)
self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
polyphonic_chars_path = os.path.join(self.model_dir, "POLYPHONIC_CHARS.txt")
monophonic_chars_path = os.path.join(self.model_dir, "MONOPHONIC_CHARS.txt")
self.polyphonic_chars = [
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
]
@ -149,31 +148,47 @@ class G2PWOnnxConverter:
)
self.chars = sorted(list(self.char2phonemes.keys()))
self.char2id = {char: idx for idx, char in enumerate(self.chars)}
self.char_phoneme_masks = (
{
char: [1 if i in self.char2phonemes[char] else 0 for i in range(len(self.labels))]
for char in self.char2phonemes
}
if self.config.use_mask
else None
)
self.polyphonic_chars_new = set(self.chars)
for char in self.non_polyphonic:
if char in self.polyphonic_chars_new:
self.polyphonic_chars_new.remove(char)
self.polyphonic_chars_new.discard(char)
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
for char in self.non_monophonic:
if char in self.monophonic_chars_dict:
self.monophonic_chars_dict.pop(char)
self.monophonic_chars_dict.pop(char, None)
self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
default_asset_dir = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", "G2PWModel"))
candidate_asset_dirs = [self.model_dir, default_asset_dir]
self.bopomofo_convert_dict = _load_json_from_candidates(
"bopomofo_to_pinyin_wo_tune_dict.json", candidate_asset_dirs
)
self.char_bopomofo_dict = _load_json_from_candidates("char_bopomofo_dict.json", candidate_asset_dirs)
with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
self.bopomofo_convert_dict = json.load(fr)
self.style_convert_func = {
"bopomofo": lambda x: x,
"pinyin": self._convert_bopomofo_to_pinyin,
}[style]
with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
self.char_bopomofo_dict = json.load(fr)
if self.enable_opencc:
self.cc = OpenCC("s2tw")
self.enable_sentence_dedup = os.getenv("g2pw_sentence_dedup", "true").strip().lower() in {
"1",
"true",
"yes",
"y",
"on",
}
# 聚焦到多音字附近上下文默认左右各16字设为0表示关闭裁剪整句
self.polyphonic_context_chars = max(0, int(os.getenv("g2pw_polyphonic_context_chars", "16")))
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1]
@ -181,9 +196,8 @@ class G2PWOnnxConverter:
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
if component:
return component + tone
else:
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
print(f'Warning: "{bopomofo}" cannot convert to pinyin')
return None
def __call__(self, sentences: List[str]) -> List[List[str]]:
if isinstance(sentences, str):
@ -197,51 +211,147 @@ class G2PWOnnxConverter:
translated_sentences.append(translated_sent)
sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
texts, model_query_ids, result_query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
if len(texts) == 0:
# sentences no polyphonic words
return partial_results
onnx_input = prepare_onnx_input(
model_input = prepare_onnx_input(
tokenizer=self.tokenizer,
labels=self.labels,
char2phonemes=self.char2phonemes,
chars=self.chars,
texts=texts,
query_ids=query_ids,
query_ids=model_query_ids,
use_mask=self.config.use_mask,
window_size=None,
char2id=self.char2id,
char_phoneme_masks=self.char_phoneme_masks,
)
preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
if not model_input:
return partial_results
if self.enable_sentence_dedup:
preds, _confidences = self._predict_with_sentence_dedup(model_input=model_input, texts=texts)
else:
preds, _confidences = self._predict(model_input=model_input)
if self.config.use_char_phoneme:
preds = [pred.split(" ")[1] for pred in preds]
results = partial_results
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
for sent_id, query_id, pred in zip(sent_ids, result_query_ids, preds):
results[sent_id][query_id] = self.style_convert_func(pred)
return results
def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], []
def _prepare_data(
self, sentences: List[str]
) -> Tuple[List[str], List[int], List[int], List[int], List[List[str]]]:
texts, model_query_ids, result_query_ids, sent_ids, partial_results = [], [], [], [], []
for sent_id, sent in enumerate(sentences):
# pypinyin works well for Simplified Chinese than Traditional Chinese
sent_s = tranditional_to_simplified(sent)
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
partial_result = [None] * len(sent)
polyphonic_indices: List[int] = []
for i, char in enumerate(sent):
if char in self.polyphonic_chars_new:
texts.append(sent)
query_ids.append(i)
sent_ids.append(sent_id)
polyphonic_indices.append(i)
elif char in self.monophonic_chars_dict:
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
elif char in self.char_bopomofo_dict:
partial_result[i] = pypinyin_result[i][0]
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
else:
partial_result[i] = pypinyin_result[i][0]
if polyphonic_indices:
if self.polyphonic_context_chars > 0:
left = max(0, polyphonic_indices[0] - self.polyphonic_context_chars)
right = min(len(sent), polyphonic_indices[-1] + self.polyphonic_context_chars + 1)
sent_for_predict = sent[left:right]
query_offset = left
else:
sent_for_predict = sent
query_offset = 0
for index in polyphonic_indices:
texts.append(sent_for_predict)
model_query_ids.append(index - query_offset)
result_query_ids.append(index)
sent_ids.append(sent_id)
partial_results.append(partial_result)
return texts, query_ids, sent_ids, partial_results
return texts, model_query_ids, result_query_ids, sent_ids, partial_results
def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
raise NotImplementedError
def _predict_with_sentence_dedup(
self, model_input: Dict[str, Any], texts: List[str]
) -> Tuple[List[str], List[float]]:
if len(texts) <= 1:
return self._predict(model_input=model_input)
grouped_indices: Dict[str, List[int]] = {}
for idx, text in enumerate(texts):
grouped_indices.setdefault(text, []).append(idx)
if all(len(indices) == 1 for indices in grouped_indices.values()):
return self._predict(model_input=model_input)
preds: List[str] = [""] * len(texts)
confidences: List[float] = [0.0] * len(texts)
for indices in grouped_indices.values():
group_input = {name: value[indices] for name, value in model_input.items()}
if len(indices) > 1:
for name in ("input_ids", "token_type_ids", "attention_masks"):
group_input[name] = group_input[name][:1]
group_preds, group_confidences = self._predict(model_input=group_input)
for output_idx, pred, confidence in zip(indices, group_preds, group_confidences):
preds[output_idx] = pred
confidences[output_idx] = confidence
return preds, confidences
class G2PWOnnxConverter(_G2PWBaseOnnxConverter):
def __init__(
self,
model_dir: str = "G2PWModel/",
style: str = "bopomofo",
model_source: str = None,
enable_non_tradional_chinese: bool = False,
):
super().__init__(
model_dir=model_dir,
style=style,
model_source=model_source,
enable_non_tradional_chinese=enable_non_tradional_chinese,
)
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2
onnx_path = _find_first_existing_file(
os.path.join(self.model_dir, "g2pW.onnx"),
os.path.join(self.model_dir, "g2pw.onnx"),
)
if "CUDAExecutionProvider" in onnxruntime.get_available_providers():
self.session_g2pw = onnxruntime.InferenceSession(
onnx_path,
sess_options=sess_options,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
else:
self.session_g2pw = onnxruntime.InferenceSession(
onnx_path,
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
def _predict(self, model_input: Dict[str, Any]) -> Tuple[List[str], List[float]]:
return predict(session=self.session_g2pw, onnx_input=model_input, labels=self.labels)
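Both the `tokenized_cache` in prepare_onnx_input and the `_predict_with_sentence_dedup` grouping above exploit the same observation: a sentence is queued once per polyphonic character it contains, so per-sentence work can be shared across those queries. A simplified, standalone sketch of that memoization (the toy tokenizer and function name are illustrative, not part of the codebase):

```python
from typing import Callable, Dict, List

def tokenize_unique(texts: List[str], tokenize: Callable[[str], List[str]]) -> List[List[str]]:
    """Run the tokenizer once per distinct sentence and reuse the result."""
    cache: Dict[str, List[str]] = {}
    results: List[List[str]] = []
    for text in texts:
        if text not in cache:          # first occurrence: pay the tokenization cost
            cache[text] = tokenize(text)
        results.append(cache[text])    # repeated sentences are served from the cache
    return results

# The same sentence shows up once per polyphonic character it contains,
# but is only tokenized once. Toy tokenizer: one token per character.
print(tokenize_unique(["重庆重工", "重庆重工", "银行"], list))
```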

View File

@ -606,7 +606,7 @@
- 内容: v2pro对ge提取时会出现数值溢出的问题修复
- 类型: 修复
- 提交: RVC-Boss
- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9)[Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
- 2025.06.11 [Commit#37f5abfc](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fdc67ca83418306f11e90b9139278313ac5c3e9) [Commit#6fdc67ca](https://github.com/RVC-Boss/GPT-SoVITS/commit/37f5abfcb4a6553652235909db2e124b6f8ff3a5)
- 内容: install.sh逻辑优化
- 类型: 优化
- 提交: XXXXRT666
@ -618,9 +618,102 @@
- 内容: 语言分割及格式化优化
- 类型: 优化
- 提交: KamioRinn
## after 202506
- 2025.07.10 [Commit#426e1a2bb](https://github.com/RVC-Boss/GPT-SoVITS/commit/426e1a2bb43614af2479b877c37acfb0591e952f)
- 内容: 提升推理进程优先级修复win11下可能GPU利用率受限的问题
- 类型: 修复
- 类型: 优化
- 提交: XianYue0125
- 2025.07.16 [Commit#e476b01f3](https://github.com/RVC-Boss/GPT-SoVITS/commit/e476b01f30312139555d45a78cbd830f557d892c)
- 内容: 解决 TTS.py 无法识别真正支持版本 v2Pro、v2ProPlus 的问题 (#2490)同时更新一版默认配置。
- 类型: 修复
- 提交: jiangsier-xyz
- 2025.07.16 [Commit#4d8ebf85](https://github.com/RVC-Boss/GPT-SoVITS/commit/4d8ebf85233d4f1166d7cc02fdc595602975ca8f)
- 内容: 修复并行推理模式下v2pro模型识别问题
- 类型: 修复
- 提交:
- 2025.07.17 [Commit#cefafee3](https://github.com/RVC-Boss/GPT-SoVITS/commit/cefafee32cfc08f0f622ef460578b09485cc189e)
- 内容: whisper asr支持性价比更高的distill模型
- 类型: 优化
- 提交: XXXXRT666
- 2025.07.18 [Commit#b9211657](https://github.com/RVC-Boss/GPT-SoVITS/commit/b9211657d8dfe8cd46f6b6eb9cfc55d5989e6548)
- 内容: 优化TTS_Config的代码逻辑 (#2536)
- 类型: 优化
- 提交: ChasonJiang
- 2025.07.18 [Commit#b5a67e62](https://github.com/RVC-Boss/GPT-SoVITS/commit/b5a67e62470fb87e7cea28ecad7c7c3bf7a58afd)
- 内容: 修复gpt的loss计算问题 (#2537)
- 类型: 修复
- 提交: ChasonJiang
- 2025.08.02 [Commit#fdf794e3](https://github.com/RVC-Boss/GPT-SoVITS/commit/fdf794e31d1fd6f91c5cb4fbb0396094491a31ac)
- 内容: WSL Rocm (#2561)
- 类型: 修复
- 提交: XXXXRT666
- 2025.09.10 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 内容: 修复环境变量可能不为str的问题
- 类型: 修复
- 提交: RVC-Boss
- 2025.11.28 [Commit#92ab59c5](https://github.com/RVC-Boss/GPT-SoVITS/commit/92ab59c5533a5dea368ddb8dad89e14474307145) [Commit#6fb441f](https://github.com/RVC-Boss/GPT-SoVITS/commit/6fb441f65e4b0573d7f7b16d96dc1917d38eda64)
- 内容: 流式推理
- 类型: 新功能
- 提交: ChasonJiang
- 2025.11.28 [Commit#e00ca921](https://github.com/RVC-Boss/GPT-SoVITS/commit/e00ca92140542e6d947b9f660e24ed757aabc793)
- 内容: 数学计算文本前端逻辑优化
- 类型: 优化
- 提交: KamioRinn
- 2025.11.28 [Commit#11aa78bd](https://github.com/RVC-Boss/GPT-SoVITS/commit/11aa78bd9bda8b53047cfcae03abf7ca94d27391)
- 内容: 流式推理 (#2469)
- 类型: 新功能
- 提交: L-jasmine
- 2025.11.28 [Commit#60a4a214](https://github.com/RVC-Boss/GPT-SoVITS/commit/60a4a214aff18057bb4ce76643d3b85de4bb67a4)
- 内容: 支持vq分布式训练 (#2577)
- 类型: 优化
- 提交: wzy3650
- 2025.11.28 [Commit#c85c54ec](https://github.com/RVC-Boss/GPT-SoVITS/commit/c85c54eca99a2fd01d6b574584217d0ecfbd90c1) [Commit#8577992](https://github.com/RVC-Boss/GPT-SoVITS/commit/857799276c3e8adcda7d662a55b07bf00bc1f01b)
- 内容: ASR模型下载逻辑优化
- 类型: 优化
- 提交: XXXXRT666
- 2025.11.28 [Commit#92d2d33](https://github.com/RVC-Boss/GPT-SoVITS/commit/92d2d337fd98673c126fd40727e067204e4523ae)
- 内容: default batch size bug修复
- 类型: 修复
- 提交: Spr-Aachen
- 2025.12.30 [Commit#9080a96](https://github.com/RVC-Boss/GPT-SoVITS/commit/9080a967d5e64f4bfb5a9ea33afc7252136b0256) [Commit#c767f0b](https://github.com/RVC-Boss/GPT-SoVITS/commit/c767f0b83b998e996a4d230d86da575a03f54a3f)
- 内容: 修复采样错误
- 类型: 修复
- 提交: ChasonJiang
- 2026.02.08 [Commit#9986880](https://github.com/RVC-Boss/GPT-SoVITS/commit/9986880b3f13b3076989db17cc1a7227aa0186c9)
- 内容: 修复Conda 条款未同意导致的构建失败 (#2727)
- 类型: 修复
- 提交: Oarora
- 2026.02.09 [Commit#2d9193b](https://github.com/RVC-Boss/GPT-SoVITS/commit/2d9193b0d3c0eae0c3a14d8c68a839f1bae157dc)
- 内容: 环境自动构建优化 (#2732)
- 类型: 优化
- 提交: XXXXRT666
- 2026.04.18 [Commit#ba8de9b](https://github.com/RVC-Boss/GPT-SoVITS/commit/ba8de9b760a4bd5b3eb827a594416e71b44f2510)
- 内容: 优化 G2PW 的推理输入构造与多音字处理流程,减少重复计算,降低长句场景下的推理开销 (#2763)
- 类型: 优化
- 提交: baicai-1145
- 2026.04.18 [Commit#780383d](https://github.com/RVC-Boss/GPT-SoVITS/commit/780383d5bd0d09a4f132b5ab1e80c04c9606b48a)
- 内容: 改进 Windows 单卡 v3 LoRA 训练流程 (#2767)
- 类型: 优化
- 提交: 2409324124
- 2026.04.18 [Commit#1419190](https://github.com/RVC-Boss/GPT-SoVITS/commit/14191901cdb7e791d8fee1ff31dffe107f9e28fb)
- 内容: 修复多个模块中的独立 bug (#2755)
- 类型: 修复
- 提交: wishhyt
- 2026.04.18 [Commit#00ce973](https://github.com/RVC-Boss/GPT-SoVITS/commit/00ce973412384e92a44836f168de2a9a8827259c)
- 内容: 添加数据集的错误处理提示 (#2758)
- 类型: 优化
- 提交: mushroomcowisheggs
- 2026.04.18 [Commit#445d18c](https://github.com/RVC-Boss/GPT-SoVITS/commit/445d18ccce0b4ea7cb6f8c93ff688b662bc61338)
- 内容: 并行推理部分bug修复 (#2753)
- 类型: 修复
- 提交: wishhyt
- 2026.04.18 [Commit#938f05f](https://github.com/RVC-Boss/GPT-SoVITS/commit/938f05fce8bcfb2407b8311fbbc10ac4d9ffe1c0)
- 内容: bug修复dpo训练不支持漏字模拟 (#2733)
- 类型: 修复
- 提交: Mr-Neutr0n
- 2026.04.18 [Commit#02425ea](https://github.com/RVC-Boss/GPT-SoVITS/commit/02425ea25680c26c700be0bc158756c69103d827)
- 内容: 修复onnx脚本未导入Optional等的问题
- 类型: 修复
- 提交: RVC-Boss

View File

@ -39,6 +39,7 @@ def create_model(language="zh"):
local_dir="tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
)
model_revision = "v2.0.4"
vad_model_revision = punc_model_revision = "v2.0.4"
elif language == "yue":
path_asr = "tools/asr/models/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
snapshot_download(
@ -51,8 +52,6 @@ def create_model(language="zh"):
else:
raise ValueError(f"{language} is not supported")
vad_model_revision = punc_model_revision = "v2.0.4"
if language in funasr_models:
return funasr_models[language]
else:

View File

@ -485,6 +485,8 @@ def istft(spec, hl):
wave_right = librosa.istft(spec_right, hop_length=hl)
wave = np.asfortranarray([wave_left, wave_right])
return wave
if __name__ == "__main__":
import argparse