优化日英/中英混合推理分词逻辑

示例

【1pass】(现在版本)
['我是一名粉刷匠,粉刷本领强,', 'I am a painter with strong painting skills', ',', 'A.B.C.D.A', '_', 'B', '—', 'C', '……', 'D.']
['zh', 'en', 'zh', 'en', 'zh', 'en', 'zh', 'en', 'zh', 'en']
【2pass】
['我是一名粉刷匠,粉刷本领强,', 'I am a painter with strong painting skills,', 'A.B.C.D.A_', 'B—', 'C……', 'D.']
['zh', 'en', 'en', 'en', 'en', 'en']
【result】
['我是一名粉刷匠,粉刷本领强,', 'I am a painter with strong painting skills,A.B.C.D.A_B—C……D.']
['zh', 'en']
This commit is contained in:
KakaruHayate 2024-01-30 22:37:59 +08:00 committed by GitHub
parent e8b5dc5961
commit 3e8c071e9a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -215,6 +215,21 @@ def splite_en_inf(sentence, language):
if pos < len(sentence):
textlist.append(sentence[pos:])
langlist.append(language)
# Merge punctuation into previous word
for i in range(len(textlist)-1, 0, -1):
if re.match(r'^[\W_]+$', textlist[i]):
textlist[i-1] += textlist[i]
del textlist[i]
del langlist[i]
# Merge consecutive words with the same language tag
i = 0
while i < len(langlist) - 1:
if langlist[i] == langlist[i+1]:
textlist[i] += textlist[i+1]
del textlist[i+1]
del langlist[i+1]
else:
i += 1
return textlist, langlist