Update text_segmentation_method.py

第85行:修改cut1函数。若文本共有11句,原函数会把它切分为4/7两段,而不是4/4/3三段;而模型处理长句子时容易出现漏字现象,因此改为保留每一个切分点,使余下的句子单独成段。
第137行:对函数cut5中punds的取值做出修改——删除重复的";",新增":;"。
This commit is contained in:
jmaple12 2024-03-14 18:14:42 +08:00 committed by GitHub
parent 37a895a67d
commit 160dfbdd2c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -82,7 +82,8 @@ def cut1(inp):
inp = inp.strip("\n")
inps = split(inp)
split_idx = list(range(0, len(inps), 4))
split_idx[-1] = None
# split_idx[-1] = None
split_idx.append(None)
if len(split_idx) > 1:
opts = []
for idx in range(len(split_idx) - 1):
@ -135,7 +136,8 @@ def cut5(inp):
# if not re.search(r'[^\w\s]', inp[-1]):
# inp += '。'
inp = inp.strip("\n")
punds = r'[,.;?!、,。?!;:…]'
# punds = r'[,.;?!、,。?!;:…]'
punds = r'[,.;?!、,。?!;::…]'
items = re.split(f'({punds})', inp)
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
# 在句子不存在符号或句尾无符号的时候保证文本完整
@ -149,4 +151,4 @@ def cut5(inp):
if __name__ == '__main__':
method = get_method("cut5")
print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。"))