From a3c4e040c80136413aa9397a02fd5f2a45f10e73 Mon Sep 17 00:00:00 2001 From: KamioRinn <63162909+KamioRinn@users.noreply.github.com> Date: Fri, 12 Apr 2024 11:18:34 +0800 Subject: [PATCH] Normalize chinese arithmetic operations (#947) --- GPT_SoVITS/text/zh_normalization/num.py | 33 +++++++++++++++++-- .../zh_normalization/text_normlization.py | 9 ++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/GPT_SoVITS/text/zh_normalization/num.py b/GPT_SoVITS/text/zh_normalization/num.py index 8ef7f48..d38d5a6 100644 --- a/GPT_SoVITS/text/zh_normalization/num.py +++ b/GPT_SoVITS/text/zh_normalization/num.py @@ -106,6 +106,29 @@ def replace_default_num(match): return verbalize_digit(number, alt_one=True) +# 加减乘除 +RE_ASMD = re.compile( + r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))') +asmd_map = { + '+': '加', + '-': '减', + '×': '乘', + '÷': '除', + '=': '等于' +} + + +def replace_asmd(match) -> str: + """ + Args: + match (re.Match) + Returns: + str + """ + result = match.group(1) + asmd_map[match.group(8)] + match.group(9) + return result + + # 数字表达式 # 纯小数 RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') @@ -155,7 +178,13 @@ def replace_number(match) -> str: # match.group(1) and match.group(8) are copy from RE_NUMBER RE_RANGE = re.compile( - r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') + r""" + (? str: @@ -165,7 +194,7 @@ def replace_range(match) -> str: Returns: str """ - first, second = match.group(1), match.group(8) + first, second = match.group(1), match.group(6) first = RE_NUMBER.sub(replace_number, first) second = RE_NUMBER.sub(replace_number, second) result = f"{first}到{second}" diff --git a/GPT_SoVITS/text/zh_normalization/text_normlization.py b/GPT_SoVITS/text/zh_normalization/text_normlization.py index b4c1494..e852fe9 100644 --- a/GPT_SoVITS/text/zh_normalization/text_normlization.py +++ b/GPT_SoVITS/text/zh_normalization/text_normlization.py @@ -34,6 +34,7 @@ from .num import RE_PERCENTAGE from .num import RE_POSITIVE_QUANTIFIERS from .num import RE_RANGE from .num import RE_TO_RANGE +from .num import RE_ASMD from .num import replace_default_num from .num import replace_frac from .num import replace_negative_num @@ -42,6 +43,7 @@ from .num import replace_percentage from .num import replace_positive_quantifier from .num import replace_range from .num import replace_to_range +from .num import replace_asmd from .phonecode import RE_MOBILE_PHONE from .phonecode import RE_NATIONAL_UNIFORM_NUMBER from .phonecode import RE_TELEPHONE @@ -67,7 +69,7 @@ class TextNormalizer(): if lang == "zh": text = text.replace(" ", "") # 过滤掉特殊字符 - text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|\\]', '', text) + text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text) text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] @@ -142,6 +144,11 @@ class TextNormalizer(): sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) sentence = RE_RANGE.sub(replace_range, sentence) + + # 处理加减乘除 + while RE_ASMD.search(sentence): + sentence = RE_ASMD.sub(replace_asmd, sentence) + sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,