From 00ce973412384e92a44836f168de2a9a8827259c Mon Sep 17 00:00:00 2001
From: Mushroomcowisheggs
 <107208254+mushroomcowisheggs@users.noreply.github.com>
Date: Sat, 18 Apr 2026 17:13:30 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E9=9B=86=E7=9A=84=E9=94=99=E8=AF=AF=E5=A4=84=E7=90=86=E6=8F=90?=
 =?UTF-8?q?=E7=A4=BA=20(#2758)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: moomushroom <107208254+moomushroom@users.noreply.github.com>
---
 GPT_SoVITS/AR/data/dataset.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/GPT_SoVITS/AR/data/dataset.py b/GPT_SoVITS/AR/data/dataset.py
index 402483d9..87b284d4 100644
--- a/GPT_SoVITS/AR/data/dataset.py
+++ b/GPT_SoVITS/AR/data/dataset.py
@@ -67,8 +67,10 @@ class Text2SemanticDataset(Dataset):
             )
         )  # "%s/3-bert"%exp_dir#bert_dir
         self.path6 = semantic_path  # "%s/6-name2semantic.tsv"%exp_dir#semantic_path
-        assert os.path.exists(self.path2)
-        assert os.path.exists(self.path6)
+        if not os.path.exists(self.path2):
+            raise FileNotFoundError(f"Phoneme data file not found: {self.path2}")
+        if not os.path.exists(self.path6):
+            raise FileNotFoundError(f"Semantic data file not found: {self.path6}")
         self.phoneme_data = {}
         with open(self.path2, "r", encoding="utf8") as f:
             lines = f.read().strip("\n").split("\n")
@@ -131,7 +133,7 @@ class Text2SemanticDataset(Dataset):
                 phoneme, word2ph, text = self.phoneme_data[item_name]
             except Exception:
                 traceback.print_exc()
-                # print(f"{item_name} not in self.phoneme_data !")
+                print(f"Warning: File \"{item_name}\" not in self.phoneme_data! Skipped. ")
                 num_not_in += 1
                 continue
 
@@ -152,7 +154,7 @@ class Text2SemanticDataset(Dataset):
                 phoneme_ids = cleaned_text_to_sequence(phoneme, version)
             except:
                 traceback.print_exc()
-                # print(f"{item_name} not in self.phoneme_data !")
+                print(f"Warning: Failed to convert phonemes to sequence for file \"{item_name}\"! Skipped. ")
                 num_not_in += 1
                 continue
             # if len(phoneme_ids) >400:###########2：改为恒定限制为semantic/2.5就行
@@ -228,7 +230,11 @@ class Text2SemanticDataset(Dataset):
             # bert_feature=torch.zeros_like(phoneme_ids,dtype=torch.float32)
             bert_feature = None
         else:
-            assert bert_feature.shape[-1] == len(phoneme_ids)
+            # Explicit check instead of `assert`: asserts are stripped under `python -O`, which would silently skip this validation.
+            if bert_feature.shape[-1] != len(phoneme_ids):
+                raise AssertionError(
+                    f"The BERT feature dimension ({bert_feature.shape[-1]}) of the file '{item_name}' does not match the length of the phoneme sequence ({len(phoneme_ids)})."
+                )
         return {
             "idx": idx,
             "phoneme_ids": phoneme_ids,