Fix normalize (#1404)

This commit is contained in:
KamioRinn 2024-08-06 12:49:13 +08:00 committed by GitHub
parent 21e63c4c3e
commit 9f8f365b93
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 66 additions and 2 deletions

View File

@ -311,14 +311,14 @@ def get_phones_and_bert(text,language,version):
if language == "zh": if language == "zh":
if re.search(r'[A-Za-z]', formattext): if re.search(r'[A-Za-z]', formattext):
formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
formattext = chinese.text_normalize(formattext) formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext,"zh",version) return get_phones_and_bert(formattext,"zh",version)
else: else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version) phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
bert = get_bert_feature(norm_text, word2ph).to(device) bert = get_bert_feature(norm_text, word2ph).to(device)
elif language == "yue" and re.search(r'[A-Za-z]', formattext): elif language == "yue" and re.search(r'[A-Za-z]', formattext):
formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext) formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
formattext = chinese.text_normalize(formattext) formattext = chinese.mix_text_normalize(formattext)
return get_phones_and_bert(formattext,"yue",version) return get_phones_and_bert(formattext,"yue",version)
else: else:
phones, word2ph, norm_text = clean_text_inf(formattext, language, version) phones, word2ph, norm_text = clean_text_inf(formattext, language, version)

View File

@ -47,6 +47,19 @@ def replace_punctuation(text):
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
)
return replaced_text
def replace_punctuation_with_en(text):
text = text.replace("", "").replace("", "")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub( replaced_text = re.sub(
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
) )
@ -171,6 +184,20 @@ def text_normalize(text):
return dest_text return dest_text
# 不排除英文的文本格式化
def mix_text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation_with_en(sentence)
# 避免重复标点引起的参考泄露
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
if __name__ == "__main__": if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏" text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?" text = "呣呣呣~就是…大人的鼹鼠党吧?"

View File

@ -60,6 +60,26 @@ def replace_punctuation(text):
return replaced_text return replaced_text
def replace_punctuation_with_en(text):
text = text.replace("", "").replace("", "")
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
)
return replaced_text
def replace_consecutive_punctuation(text):
punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}])([{punctuations}])+'
result = re.sub(pattern, r'\1', text)
return result
def g2p(text): def g2p(text):
pattern = r"(?<=[{0}])\s*".format("".join(punctuation)) pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
sentences = [i for i in re.split(pattern, text) if i.strip() != ""] sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
@ -171,6 +191,23 @@ def text_normalize(text):
dest_text = "" dest_text = ""
for sentence in sentences: for sentence in sentences:
dest_text += replace_punctuation(sentence) dest_text += replace_punctuation(sentence)
# 避免重复标点引起的参考泄露
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
# 不排除英文的文本格式化
def mix_text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation_with_en(sentence)
# 避免重复标点引起的参考泄露
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text return dest_text