支持自定义读音(多音字)

This commit is contained in:
fangzhehao 2024-10-28 11:09:41 +08:00
parent 98cc47699c
commit 31b99d6173
2 changed files with 40 additions and 9 deletions

View File

@ -131,13 +131,32 @@ def _merge_erhua(initials: list[str],
new_finals.append(phn) new_finals.append(phn)
return new_initials, new_finals return new_initials, new_finals
def remove_brackets(s):
    """Strip one enclosing pair of ASCII parentheses from *s*.

    The string is returned unchanged unless it both begins with ``(`` and
    ends with ``)``; only the outermost pair is removed.
    """
    is_wrapped = s[:1] == '(' and s[-1:] == ')'
    return s[1:-1] if is_wrapped else s
def custom_pinyin(seg: str) -> tuple[str, list[str]]:
    """Extract inline pronunciation overrides such as ``角(jue2)`` from *seg*.

    An override is a parenthesised run of ASCII letters followed by a single
    digit, written directly after the character it applies to, e.g.
    ``这个字的读音是角(jue2)色,而不是角(jiao3)色``. This is the tone-number
    pinyin spelling (``zhong1 guo2``); note the pattern accepts any digit,
    it does not validate the tone range.

    Args:
        seg: Text that may contain ``(pinyin)`` annotations.

    Returns:
        A ``(text, overrides)`` pair. ``text`` is *seg* with every annotation
        removed. ``overrides`` has exactly one entry per character of
        ``text``: the custom pinyin for that character, or ``""`` when none
        was given. For the example above the result is
        ``("这个字的读音是角色,而不是角色",
        ["", "", "", "", "", "", "", "jue2", "", "", "", "", "", "jiao3", ""])``.

        An annotation with no preceding character (at the very start of the
        text) is removed from the text and otherwise ignored. When several
        annotations follow the same character, the last one wins.
    """
    # Raw string: "\(" and "\d" are regex escapes, not string escapes.
    annotation = re.compile(r'\(([a-zA-Z]+\d)\)')

    kept_parts = []
    overrides = []
    cursor = 0
    # Single left-to-right pass over the match positions. This avoids
    # re-searching the string by text, which is quadratic and can land on the
    # wrong occurrence when the same annotation text appears more than once.
    for match in annotation.finditer(seg):
        plain = seg[cursor:match.start()]
        kept_parts.append(plain)
        overrides.extend([""] * len(plain))
        # The annotation describes the character immediately before it; if
        # there is none (annotation at the start), there is nothing to tag.
        if overrides:
            overrides[-1] = match.group(1)
        cursor = match.end()

    tail = seg[cursor:]
    kept_parts.append(tail)
    overrides.extend([""] * len(tail))
    return "".join(kept_parts), overrides
def _g2p(segments): def _g2p(segments):
phones_list = [] phones_list = []
word2ph = [] word2ph = []
for seg in segments: for seg in segments:
pinyins = [] pinyins = []
seg, customer_pinyin = custom_pinyin(seg)
# Replace all English words in the sentence # Replace all English words in the sentence
seg = re.sub("[a-zA-Z]+", "", seg) seg = re.sub("[a-zA-Z]+", "", seg)
seg_cut = psg.lcut(seg) seg_cut = psg.lcut(seg)
@ -162,7 +181,8 @@ def _g2p(segments):
else: else:
# g2pw采用整句推理 # g2pw采用整句推理
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3) pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
pinyins = [customer_pinyin[index] if customer_pinyin[index] != "" else item for index,item in enumerate(pinyins) ]
print(f"g2pw seg:{seg} pinyins:{pinyins}")
pre_word_length = 0 pre_word_length = 0
for word, pos in seg_cut: for word, pos in seg_cut:
sub_initials = [] sub_initials = []
@ -176,7 +196,7 @@ def _g2p(segments):
word_pinyins = pinyins[pre_word_length:now_word_length] word_pinyins = pinyins[pre_word_length:now_word_length]
# 多音字消歧 # 多音字消歧
word_pinyins = correct_pronunciation(word,word_pinyins) # word_pinyins = correct_pronunciation(word,word_pinyins) # 拿缓存
for pinyin in word_pinyins: for pinyin in word_pinyins:
if pinyin[0].isalpha(): if pinyin[0].isalpha():
@ -272,6 +292,7 @@ def replace_consecutive_punctuation(text):
def text_normalize(text): def text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer() tx = TextNormalizer()
text, custom_py = custom_pinyin(text)
sentences = tx.normalize(text) sentences = tx.normalize(text)
dest_text = "" dest_text = ""
for sentence in sentences: for sentence in sentences:
@ -279,8 +300,19 @@ def text_normalize(text):
# 避免重复标点引起的参考泄露 # 避免重复标点引起的参考泄露
dest_text = replace_consecutive_punctuation(dest_text) dest_text = replace_consecutive_punctuation(dest_text)
if len(dest_text)==len(text):
result=""
for index,rune in enumerate(dest_text):
result=result+rune
if text[index]==dest_text[index] and custom_py[index]!="":
result=result+"("+custom_py[index]+")"
dest_text=result
else:
print("text_normalize 后长度不一致")
return dest_text return dest_text
def clean_custom_pinyin(text):
    """Return *text* with inline ``(pinyin)`` annotations removed.

    Delegates to ``custom_pinyin`` and keeps only the cleaned text,
    discarding the per-character override list.
    """
    cleaned = custom_pinyin(text)[0]
    return cleaned
# 不排除英文的文本格式化 # 不排除英文的文本格式化
def mix_text_normalize(text): def mix_text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
@ -296,11 +328,8 @@ def mix_text_normalize(text):
if __name__ == "__main__": if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" text = "这个字的读音是角(jue2)色,而不是角(jiao3)色"
text = "呣呣呣~就是…大人的鼹鼠党吧?" print(custom_pinyin(text))
text = "你好"
text = text_normalize(text)
print(g2p(text))
# # 示例用法 # # 示例用法

View File

@ -40,6 +40,8 @@ def clean_text(text, language, version=None):
norm_text=text norm_text=text
if language == "zh" or language=="yue":########## if language == "zh" or language=="yue":##########
phones, word2ph = language_module.g2p(norm_text) phones, word2ph = language_module.g2p(norm_text)
if hasattr(language_module,"clean_custom_pinyin"):
norm_text=language_module.clean_custom_pinyin(norm_text)
assert len(phones) == sum(word2ph) assert len(phones) == sum(word2ph)
assert len(norm_text) == len(word2ph) assert len(norm_text) == len(word2ph)
elif language == "en": elif language == "en":