diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
index 58b2678..2cc0023 100644
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -8,6 +8,7 @@ sys.path.append(now_dir)
 import re
 import torch
 import LangSegment
+
 from typing import Dict, List, Tuple
 from text.cleaner import clean_text
 from text import cleaned_text_to_sequence
@@ -17,6 +18,7 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
+punctuation = set(['!', '?', '…', ',', '.', '-', ' '])

 def get_first(text:str) -> str:
     pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
@@ -54,6 +56,7 @@ class TextPreprocessor:

     def preprocess(self, text:str, lang:str, text_split_method:str)->List[Dict]:
         print(i18n("############ 切分文本 ############"))
+        text = self.replace_consecutive_punctuation(text)
         texts = self.pre_seg_text(text, lang, text_split_method)
         result = []
         print(i18n("############ 提取文本Bert特征 ############"))
@@ -83,6 +86,7 @@ class TextPreprocessor:
         text = text.replace("\n\n", "\n")

         _texts = text.split("\n")
+        _texts = self.process_text(_texts)
         _texts = merge_short_text_in_array(_texts, 5)
         texts = []
@@ -205,6 +209,23 @@ class TextPreprocessor:

         return feature

+    def process_text(self, texts):
+        _text = []
+        if all(text in [None, " ", "\n", ""] for text in texts):
+            raise ValueError(i18n("请输入有效文本"))
+        for text in texts:
+            if text in [None, " ", ""]:
+                pass
+            else:
+                _text.append(text)
+        return _text
+
+
+    def replace_consecutive_punctuation(self, text):
+        punctuations = ''.join(re.escape(p) for p in punctuation)
+        pattern = f'([{punctuations}])([{punctuations}])+'
+        result = re.sub(pattern, r'\1', text)
+        return result
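For reference, here is a minimal standalone sketch (not part of the patch) of the two helpers added to TextPreprocessor above, with i18n swapped for a plain string and the loop condensed so it runs on its own; the example inputs are made up:

import re

punctuation = set(['!', '?', '…', ',', '.', '-', ' '])

def replace_consecutive_punctuation(text):
    # Collapse any run of two or more punctuation marks down to the first
    # mark. Note the space is in the set, so it gets absorbed into runs.
    punctuations = ''.join(re.escape(p) for p in punctuation)
    pattern = f'([{punctuations}])([{punctuations}])+'
    return re.sub(pattern, r'\1', text)

def process_text(texts):
    # Reject input with no usable text at all, then drop the empty or
    # whitespace-only fragments left over from splitting on "\n".
    if all(text in [None, " ", "\n", ""] for text in texts):
        raise ValueError("please enter valid text")
    return [text for text in texts if text not in [None, " ", ""]]

print(replace_consecutive_punctuation("No way!!!"))       # -> No way!
print(replace_consecutive_punctuation("Wait... what?!"))  # -> Wait.what? (the space joins the run)
print(process_text(["hello", "", " ", "world"]))          # -> ['hello', 'world']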
diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
index 2608f4c..33c6e5d 100644
--- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
+++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@@ -7,7 +7,7 @@ from typing import Callable
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
-
+punctuation = set(['!', '?', '…', ',', '.', '-', ' '])
 METHODS = dict()

 def get_method(name:str)->Callable:
@@ -76,7 +76,10 @@ def split(todo_text):

 # 不切 (no split)
 @register_method("cut0")
 def cut0(inp):
-    return inp
+    if not set(inp).issubset(punctuation):
+        return inp
+    else:
+        return "\n"


 # 凑四句一切 (split every four sentences)
@@ -93,6 +96,7 @@ def cut1(inp):
         opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
     else:
         opts = [inp]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)

 # 凑50字一切 (split every 50 characters)
@@ -118,19 +122,24 @@ def cut2(inp):
     if len(opts) > 1 and len(opts[-1]) < 50:  ## 如果最后一个太短了,和前一个合一起 (merge a too-short tail into the previous chunk)
         opts[-2] = opts[-2] + opts[-1]
         opts = opts[:-1]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
     return "\n".join(opts)

 # 按中文句号。切 (split on the Chinese full stop 。)
 @register_method("cut3")
 def cut3(inp):
     inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip("。").split("。")])
+    opts = ["%s" % item for item in inp.strip("。").split("。")]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)

 # 按英文句号.切 (split on the English period .)
 @register_method("cut4")
 def cut4(inp):
     inp = inp.strip("\n")
-    return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
+    opts = ["%s" % item for item in inp.strip(".").split(".")]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)

 # 按标点符号切 (split on punctuation)
 # contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@@ -146,8 +155,9 @@ def cut5(inp):
     # 在句子不存在符号或句尾无符号的时候保证文本完整 (keep the text intact when there is no punctuation, or none at the end)
     if len(items)%2 == 1:
         mergeitems.append(items[-1])
-    opt = "\n".join(mergeitems)
-    return opt
+    opts = [item for item in mergeitems if not set(item).issubset(punctuation)]
+    opts = "\n".join(opts)
+    return opts
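And a quick sketch (again outside the patch) of the issubset filter that now guards every cut method: a segment is dropped when every character in it belongs to punctuation, and an empty segment is dropped too, since the empty set is a subset of any set. The sample segments are made up:

punctuation = set(['!', '?', '…', ',', '.', '-', ' '])

segments = ["Hello there.", "?!", "How are you?", "... ", ""]
# Keep only segments that contain at least one non-punctuation character.
kept = [item for item in segments if not set(item).issubset(punctuation)]
print(kept)  # -> ['Hello there.', 'How are you?']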