关于标点符号导致参考泄漏的问题 (#1169)

* punctuation

* update

* update
This commit is contained in:
XXXXRT666 2024-06-10 09:18:35 +01:00 committed by GitHub
parent a3da8e87b5
commit 277b258360
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 37 additions and 6 deletions

View File

@ -8,6 +8,7 @@ sys.path.append(now_dir)
import re
import torch
import LangSegment
from typing import Dict, List, Tuple
from text.cleaner import clean_text
from text import cleaned_text_to_sequence
@ -17,6 +18,7 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
# Characters treated as punctuation when collapsing repeated marks in the
# input text (see replace_consecutive_punctuation).
# NOTE(review): the third member was an empty string '' in the source, which
# can never equal a character and adds nothing to the regex character class.
# It is restored here as the ellipsis; confirm against the upstream file.
punctuation = {'!', '?', '…', ',', '.', '-', ' '}
def get_first(text:str) -> str:
pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
@ -54,6 +56,7 @@ class TextPreprocessor:
def preprocess(self, text:str, lang:str, text_split_method:str)->List[Dict]:
print(i18n("############ 切分文本 ############"))
# Collapse punctuation runs on the raw input string before it is segmented.
# The argument and the target must both be `text`: `texts` is not assigned
# until the following statement, so reading it here raises UnboundLocalError.
text = self.replace_consecutive_punctuation(text)
texts = self.pre_seg_text(text, lang, text_split_method)
result = []
print(i18n("############ 提取文本Bert特征 ############"))
@ -83,6 +86,7 @@ class TextPreprocessor:
text = text.replace("\n\n", "\n")
_texts = text.split("\n")
_texts = self.process_text(_texts)
_texts = merge_short_text_in_array(_texts, 5)
texts = []
@ -205,6 +209,23 @@ class TextPreprocessor:
return feature
def process_text(self, texts):
    """Drop blank entries from a list of text segments.

    Args:
        texts: Re-iterable collection (e.g. a list) of segments; entries may
            be strings or None.

    Returns:
        A new list holding, in order, every entry that is not None, " " or "".

    Raises:
        ValueError: If every entry is one of None, " ", "\\n" or "" (this
            includes an empty ``texts``).
    """
    # Reject input that carries no usable text at all.
    if all(segment in (None, " ", "\n", "") for segment in texts):
        raise ValueError(i18n("请输入有效文本"))
    return [segment for segment in texts if segment not in (None, " ", "")]
def replace_consecutive_punctuation(self, text):
    """Collapse each run of punctuation characters to its first character.

    Args:
        text: Input string.

    Returns:
        ``text`` with every run of two or more characters drawn from the
        module-level ``punctuation`` set (the run may mix different marks)
        replaced by the first character of that run.
    """
    # Escape each member so characters such as '-' and '.' stay literal
    # inside the character class.
    char_class = "[" + "".join(re.escape(p) for p in punctuation) + "]"
    # Group 1 keeps the first mark; the trailing class consumes the rest.
    pattern = f"({char_class}){char_class}+"
    return re.sub(pattern, r"\1", text)

View File

@ -7,7 +7,7 @@ from typing import Callable
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
# Characters treated as punctuation by the cut methods: a segment made up
# only of these characters is dropped from the split result.
# NOTE(review): the third member was an empty string '' in the source, which
# can never equal a character, so it had no effect on the subset checks.
# It is restored here as the ellipsis; confirm against the upstream file.
punctuation = {'!', '?', '…', ',', '.', '-', ' '}
METHODS = dict()
def get_method(name:str)->Callable:
@ -76,7 +76,10 @@ def split(todo_text):
# 不切
@register_method("cut0")
def cut0(inp):
    """Return ``inp`` unsplit, unless it consists only of punctuation.

    Args:
        inp: Text to (not) split.

    Returns:
        ``inp`` unchanged when it contains at least one character outside the
        module-level ``punctuation`` set; otherwise a single newline. An empty
        ``inp`` counts as punctuation-only.
    """
    if set(inp).issubset(punctuation):
        # NOTE(review): the source returned the two characters "/n" here,
        # which reads as a typo for the newline escape; a bare newline splits
        # into empty segments downstream. Confirm this is the intended value.
        return "\n"
    return inp
# 凑四句一切
@ -93,6 +96,7 @@ def cut1(inp):
opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
else:
opts = [inp]
opts = [item for item in opts if not set(item).issubset(punctuation)]
return "\n".join(opts)
# 凑50字一切
@ -118,19 +122,24 @@ def cut2(inp):
if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起
opts[-2] = opts[-2] + opts[-1]
opts = opts[:-1]
opts = [item for item in opts if not set(item).issubset(punctuation)]
return "\n".join(opts)
# 按中文句号。切
@register_method("cut3")
def cut3(inp):
    """Split text on the Chinese full stop, one piece per line.

    Args:
        inp: Text to split.

    Returns:
        The pieces between "。" separators joined with newlines, after
        removing pieces made up solely of characters from the module-level
        ``punctuation`` set (empty pieces included).
    """
    inp = inp.strip("\n")
    # The separator must be the full stop itself: an empty separator makes
    # str.split raise ValueError.
    pieces = inp.strip("。").split("。")
    kept = [piece for piece in pieces if not set(piece).issubset(punctuation)]
    return "\n".join(kept)
#按英文句号.切
@register_method("cut4")
def cut4(inp):
    """Split text on the ASCII period, one piece per line.

    Args:
        inp: Text to split.

    Returns:
        The pieces between "." separators joined with newlines, after
        removing pieces made up solely of characters from the module-level
        ``punctuation`` set (empty pieces included).
    """
    inp = inp.strip("\n")
    pieces = inp.strip(".").split(".")
    # Filter before joining; returning the raw join first would leave the
    # punctuation-only filter unreachable.
    kept = [piece for piece in pieces if not set(piece).issubset(punctuation)]
    return "\n".join(kept)
# 按标点符号切
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@ -146,8 +155,9 @@ def cut5(inp):
# 在句子不存在符号或句尾无符号的时候保证文本完整
if len(items)%2 == 1:
mergeitems.append(items[-1])
opt = "\n".join(mergeitems)
return opt
opts = [item for item in mergeitems if not set(item).issubset(punctuation)]
opts = "\n".join(opts)
return opts