From 9f8f365b93766626b4e9c43937b87427d2e9cfa9 Mon Sep 17 00:00:00 2001
From: KamioRinn <63162909+KamioRinn@users.noreply.github.com>
Date: Tue, 6 Aug 2024 12:49:13 +0800
Subject: [PATCH] Fix normalize (#1404)

---
 GPT_SoVITS/inference_webui.py |  4 ++--
 GPT_SoVITS/text/chinese.py    | 27 +++++++++++++++++++++++++
 GPT_SoVITS/text/chinese2.py   | 37 +++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 32ce4a6..9218b0d 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -311,14 +311,14 @@ def get_phones_and_bert(text,language,version):
         if language == "zh":
             if re.search(r'[A-Za-z]', formattext):
                 formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
-                formattext = chinese.text_normalize(formattext)
+                formattext = chinese.mix_text_normalize(formattext)
                 return get_phones_and_bert(formattext,"zh",version)
             else:
                 phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
                 bert = get_bert_feature(norm_text, word2ph).to(device)
         elif language == "yue" and re.search(r'[A-Za-z]', formattext):
                 formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
-                formattext = chinese.text_normalize(formattext)
+                formattext = chinese.mix_text_normalize(formattext)
                 return get_phones_and_bert(formattext,"yue",version)
         else:
             phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
diff --git a/GPT_SoVITS/text/chinese.py b/GPT_SoVITS/text/chinese.py
index bebf3f0..2255c6e 100644
--- a/GPT_SoVITS/text/chinese.py
+++ b/GPT_SoVITS/text/chinese.py
@@ -47,6 +47,19 @@ def replace_punctuation(text):
     replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
 
+    replaced_text = re.sub(
+        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+    )
+
+    return replaced_text
+
+
+def replace_punctuation_with_en(text):
+    text = text.replace("嗯", "恩").replace("呣", "母")
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
     replaced_text = re.sub(
         r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
     )
     return replaced_text
@@ -171,6 +184,20 @@ def text_normalize(text):
     return dest_text
 
 
+# 不排除英文的文本格式化
+def mix_text_normalize(text):
+    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
+    tx = TextNormalizer()
+    sentences = tx.normalize(text)
+    dest_text = ""
+    for sentence in sentences:
+        dest_text += replace_punctuation_with_en(sentence)
+
+    # 避免重复标点引起的参考泄露
+    dest_text = replace_consecutive_punctuation(dest_text)
+    return dest_text
+
+
 if __name__ == "__main__":
     text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
     text = "呣呣呣~就是…大人的鼹鼠党吧?"
diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py
index a12e360..d8dce13 100644
--- a/GPT_SoVITS/text/chinese2.py
+++ b/GPT_SoVITS/text/chinese2.py
@@ -60,6 +60,26 @@ def replace_punctuation(text):
     return replaced_text
 
 
+def replace_punctuation_with_en(text):
+    text = text.replace("嗯", "恩").replace("呣", "母")
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+    replaced_text = re.sub(
+        r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
+    )
+
+    return replaced_text
+
+
+def replace_consecutive_punctuation(text):
+    punctuations = ''.join(re.escape(p) for p in punctuation)
+    pattern = f'([{punctuations}])([{punctuations}])+'
+    result = re.sub(pattern, r'\1', text)
+    return result
+
+
 def g2p(text):
     pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
     sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
@@ -171,6 +191,23 @@ def text_normalize(text):
     dest_text = ""
     for sentence in sentences:
         dest_text += replace_punctuation(sentence)
+
+    # 避免重复标点引起的参考泄露
+    dest_text = replace_consecutive_punctuation(dest_text)
+    return dest_text
+
+
+# 不排除英文的文本格式化
+def mix_text_normalize(text):
+    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
+    tx = TextNormalizer()
+    sentences = tx.normalize(text)
+    dest_text = ""
+    for sentence in sentences:
+        dest_text += replace_punctuation_with_en(sentence)
+
+    # 避免重复标点引起的参考泄露
+    dest_text = replace_consecutive_punctuation(dest_text)
     return dest_text
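
Note: the snippet below is not part of the patch. It is a minimal standalone sketch of the consecutive-punctuation collapse that the patch adds to both chinese.py and chinese2.py; the `punctuation` set here is an assumed stand-in for GPT_SoVITS.text.symbols.punctuation, so the exact character set may differ from the repository's.

import re

# Assumed stand-in for GPT_SoVITS.text.symbols.punctuation; the real set may differ.
punctuation = {"!", "?", "…", ",", ".", "-"}


def replace_consecutive_punctuation(text):
    # Collapse any run of punctuation marks down to its first character,
    # e.g. "吧??!!" -> "吧?", which is what the patch's comment
    # (避免重复标点引起的参考泄露) refers to.
    punctuations = "".join(re.escape(p) for p in punctuation)
    pattern = f"([{punctuations}])([{punctuations}])+"
    return re.sub(pattern, r"\1", text)


if __name__ == "__main__":
    print(replace_consecutive_punctuation("就是大人的鼹鼠党吧??!!"))  # 就是大人的鼹鼠党吧?

In the patched text_normalize and mix_text_normalize this collapse runs only after each sentence has been through the rep_map-based punctuation replacement, so the marks it sees are already in that normalized set.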