mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-05 19:41:56 +08:00
优化日英/中英混合推理分词逻辑
示例:
【1pass】(现在版本的输出)
  textlist: ['我是一名粉刷匠,粉刷本领强,', 'I am a painter with strong painting skills', ',', 'A.B.C.D.A', '_', 'B', '—', 'C', '……', 'D.']
  langlist: ['zh', 'en', 'zh', 'en', 'zh', 'en', 'zh', 'en', 'zh', 'en']
【2pass】(将纯标点片段并入前一个片段)
  textlist: ['我是一名粉刷匠,粉刷本领强,', 'I am a painter with strong painting skills,', 'A.B.C.D.A_', 'B—', 'C……', 'D.']
  langlist: ['zh', 'en', 'en', 'en', 'en', 'en']
【result】(合并相邻且语言标签相同的片段)
  textlist: ['我是一名粉刷匠,粉刷本领强,', 'I am a painter with strong painting skills,A.B.C.D.A_B—C……D.']
  langlist: ['zh', 'en']
This commit is contained in:
parent
e8b5dc5961
commit
3e8c071e9a
@ -215,6 +215,21 @@ def splite_en_inf(sentence, language):
|
||||
if pos < len(sentence):
|
||||
textlist.append(sentence[pos:])
|
||||
langlist.append(language)
|
||||
# Merge punctuation into previous word
|
||||
for i in range(len(textlist)-1, 0, -1):
|
||||
if re.match(r'^[\W_]+$', textlist[i]):
|
||||
textlist[i-1] += textlist[i]
|
||||
del textlist[i]
|
||||
del langlist[i]
|
||||
# Merge consecutive words with the same language tag
|
||||
i = 0
|
||||
while i < len(langlist) - 1:
|
||||
if langlist[i] == langlist[i+1]:
|
||||
textlist[i] += textlist[i+1]
|
||||
del textlist[i+1]
|
||||
del langlist[i+1]
|
||||
else:
|
||||
i += 1
|
||||
|
||||
return textlist, langlist
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user