关于标点符号导致参考泄漏的问题 (#1169)

* punctuation * update * update
2026-07-03 20:48:14 +08:00 · 2024-06-10 09:18:35 +01:00 · 2024-06-10 09:18:35 +01:00 · 277b258360
commit 277b258360
parent a3da8e87b5
2 changed files with 37 additions and 6 deletions
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@ -8,6 +8,7 @@ sys.path.append(now_dir)
 import re
 import torch
 import LangSegment
+
 from typing import Dict, List, Tuple
 from text.cleaner import clean_text
 from text import cleaned_text_to_sequence
@ -17,6 +18,7 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
+punctuation = set(['!', '?', '…', ',', '.', '-'," "])

 def get_first(text:str) -> str:
    pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
@ -54,6 +56,7 @@ class TextPreprocessor:
        
    def preprocess(self, text:str, lang:str, text_split_method:str)->List[Dict]:
        print(i18n("############ 切分文本 ############"))
+        text = self.replace_consecutive_punctuation(text)  # collapse punctuation runs in the raw input before it is segmented
        texts = self.pre_seg_text(text, lang, text_split_method)
        result = []
        print(i18n("############ 提取文本Bert特征 ############"))
@ -83,6 +86,7 @@ class TextPreprocessor:
            text = text.replace("\n\n", "\n")

        _texts = text.split("\n")
+        _texts = self.process_text(_texts)
        _texts = merge_short_text_in_array(_texts, 5)
        texts = []

@ -205,6 +209,23 @@ class TextPreprocessor:

        return feature
    
+    def process_text(self,texts):
+        """Drop None/" "/"" segments; raise ValueError when no segment has content."""
+        blank = (None, " ", "\n", "")
+        if all(segment in blank for segment in texts):
+            raise ValueError(i18n("请输入有效文本"))
+        # "\n" counts as blank for the check above but is not filtered out below.
+        return [
+            segment for segment in texts
+            if segment not in (None, " ", "")
+        ]
+    
+
+    def replace_consecutive_punctuation(self,text):
+        # Collapse every run of two or more punctuation characters to the run's first character.
+        char_class = "[" + "".join(map(re.escape, punctuation)) + "]"
+        collapsed = re.sub(f"({char_class})({char_class})+", r"\1", text)
+        return collapsed



--- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
+++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@ -7,7 +7,7 @@ from typing import Callable
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
-
+punctuation = set(['!', '?', '…', ',', '.', '-'," "])
 METHODS = dict()

 def get_method(name:str)->Callable:
@ -76,7 +76,10 @@ def split(todo_text):
 # 不切
@register_method("cut0")
 def cut0(inp):
-    return inp
+    if not set(inp).issubset(punctuation):
+        return inp
+    else:
+        return "\n"  # punctuation-only input: return a bare newline (was the literal "/n") so no segment carries content


 # 凑四句一切
@ -93,6 +96,7 @@ def cut1(inp):
            opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
    else:
        opts = [inp]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)

 # 凑50字一切
@ -118,19 +122,24 @@ def cut2(inp):
    if len(opts) > 1 and len(opts[-1]) < 50:  ##如果最后一个太短了，和前一个合一起
        opts[-2] = opts[-2] + opts[-1]
        opts = opts[:-1]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)

 # 按中文句号。切
@register_method("cut3")
 def cut3(inp):
    inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip("。").split("。")])
+    # Split on the full-width period and drop pieces made up only of punctuation.
+    pieces = inp.strip("。").split("。")
+    return "\n".join(seg for seg in pieces if not set(seg).issubset(punctuation))

 #按英文句号.切
@register_method("cut4")
 def cut4(inp):
    inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
+    # Split on the ASCII period and drop pieces made up only of punctuation.
+    pieces = inp.strip(".").split(".")
+    return "\n".join(seg for seg in pieces if not set(seg).issubset(punctuation))

 # 按标点符号切
 # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@ -146,8 +155,9 @@ def cut5(inp):
    # 在句子不存在符号或句尾无符号的时候保证文本完整
    if len(items)%2 == 1:
        mergeitems.append(items[-1])
-    opt = "\n".join(mergeitems)
-    return opt
+    opts = [item for item in mergeitems if not set(item).issubset(punctuation)]
+    opts = "\n".join(opts)
+    return opts