Merge pull request #314 from KakaruHayate/patch-6

优化日英/中英混合推理分词逻辑
This commit is contained in:
RVC-Boss 2024-01-30 23:01:21 +08:00 committed by GitHub
commit 53817b983c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -200,7 +200,7 @@ dict_language = {
def splite_en_inf(sentence, language):
-pattern = re.compile(r'[a-zA-Z. ]+')
+pattern = re.compile(r'[a-zA-Z ]+')
textlist = []
langlist = []
pos = 0
@@ -215,6 +215,21 @@ def splite_en_inf(sentence, language):
if pos < len(sentence):
textlist.append(sentence[pos:])
langlist.append(language)
+# Merge punctuation into previous word
+for i in range(len(textlist)-1, 0, -1):
+if re.match(r'^[\W_]+$', textlist[i]):
+textlist[i-1] += textlist[i]
+del textlist[i]
+del langlist[i]
+# Merge consecutive words with the same language tag
+i = 0
+while i < len(langlist) - 1:
+if langlist[i] == langlist[i+1]:
+textlist[i] += textlist[i+1]
+del textlist[i+1]
+del langlist[i+1]
+else:
+i += 1
return textlist, langlist