From 82b458625d83f8edaf9636166e29bf4056a6aae7 Mon Sep 17 00:00:00 2001 From: Ella Zhang <144317607+EllaZhangCA@users.noreply.github.com> Date: Tue, 23 Sep 2025 02:48:07 -0700 Subject: [PATCH 1/3] =?UTF-8?q?=E4=B8=BA=E4=B8=AD=E6=96=87=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E4=BA=86=E8=B4=A7=E5=B8=81=E8=AE=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/text/cleaner.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 7ba8f376..7c13023c 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -10,6 +10,7 @@ import os from text import symbols as symbols_v1 from text import symbols2 as symbols_v2 + special = [ # ("%", "zh", "SP"), ("¥", "zh", "SP2"), @@ -17,7 +18,6 @@ special = [ # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 ] - def clean_text(text, language, version=None): if version is None: version = os.environ.get("version", "v2") @@ -31,6 +31,14 @@ def clean_text(text, language, version=None): if language not in language_module_map: language = "en" text = " " + if language in ("zh"): #处理货币似乎只能这里截胡,不然货币符号会被吞 + from text.zh_normalization.num import ( + RE_CNY_PREFIX, RE_CNY_SUFFIX, replace_cny_prefix, replace_cny_suffix, + RE_USD_SYMBOL, RE_USD_SUFFIX, replace_usd_symbol, replace_usd_suffix,) + text = RE_CNY_PREFIX.sub(replace_cny_prefix, text) + text = RE_CNY_SUFFIX.sub(replace_cny_suffix, text) + text = RE_USD_SYMBOL.sub(replace_usd_symbol, text) + text = RE_USD_SUFFIX.sub(replace_usd_suffix, text) for special_s, special_l, target_symbol in special: if special_s in text and language == special_l: return clean_special(text, language, special_s, target_symbol, version) From 0c02ebf5aeb9d6d6465d17eaf65d4451a2ad91ec Mon Sep 17 00:00:00 2001 From: Ella Zhang <144317607+EllaZhangCA@users.noreply.github.com> Date: Tue, 23 Sep 2025 02:48:30 -0700 Subject: [PATCH 2/3] Update num.py --- GPT_SoVITS/text/zh_normalization/num.py | 113 ++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/GPT_SoVITS/text/zh_normalization/num.py b/GPT_SoVITS/text/zh_normalization/num.py index 14d602b0..3aed785f 100644 --- a/GPT_SoVITS/text/zh_normalization/num.py +++ b/GPT_SoVITS/text/zh_normalization/num.py @@ -337,3 +337,116 @@ def num2str(value_string: str) -> str: result = result if result else "零" result += "点" + verbalize_digit(decimal) return result + +RE_CNY_PREFIX = re.compile(r"(?:¥|¥)\s*(-?\d[\d,]*(?:\.\d+)?)") +RE_CNY_SUFFIX = re.compile(r"(-?\d[\d,]*(?:\.\d+)?)(?:\s*(?:人民币|元|CNY|cny|¥|¥))") + +def _strip_commas(s: str) -> str: + return s.replace(",", "") + +def _split_amount(amount: str): + neg = amount.startswith("-") + if neg: + amount = amount[1:] + amount = _strip_commas(amount) or "0" + + if "." in amount: + integer, frac = amount.split(".", 1) + had_frac = True + else: + integer, frac, had_frac = amount, "", False + + integer = integer or "0" + frac = (frac + "00")[:2] + return neg, integer, frac, had_frac + +#人民币和美元的处理都在cleaner那边,防吞 +def replace_cny_amount(amount: str, num2str) -> str: + neg, integer, frac, had_frac = _split_amount(amount) + + integer_cn = num2str(integer) if integer != "0" else "零" + + jiao, fen = frac[0], frac[1] + parts = [] + + if integer != "0": + parts.append(integer_cn + "元") + else: + parts.append("零元") + + if jiao != "0" or fen != "0": + if jiao != "0": + parts.append(num2str(jiao) + "角") + if fen != "0": + parts.append(num2str(fen) + "分") + elif had_frac: + parts.append("整") + + res = "".join(parts) + if neg and res and res[0] != "负": + res = "负" + res + return res + +def replace_cny_prefix(m, num2str=num2str): + return replace_cny_amount(m.group(1), num2str) + +def replace_cny_suffix(m, num2str=num2str): + return replace_cny_amount(m.group(1), num2str) + +#我知道美元符也可能是加拿大元什么的,但是就当它美元吧whatever +RE_USD_SYMBOL = re.compile(r"(?:\$|$)\s*(-?\d[\d,]*(?:\.\d+)?)") +RE_USD_SUFFIX = re.compile(r"(-?\d[\d,]*(?:\.\d+)?)(?:\s*(?:美元|USD|usd|\$|$))") + +def _strip_commas(s: str) -> str: + return s.replace(",", "") + +def _split_amount(amount: str): + neg = amount.startswith("-") + if neg: + amount = amount[1:] + amount = _strip_commas(amount) or "0" + + if "." in amount: + integer, frac = amount.split(".", 1) + had_frac = True + else: + integer, frac, had_frac = amount, "", False + + integer = integer or "0" + # 只保留两位小数用来读美分 + frac = (frac + "00")[:2] + return neg, integer, frac, had_frac + +def replace_usd_amount(amount: str, num2str) -> str: + neg, integer, frac, had_frac = _split_amount(amount) + + integer_cn = num2str(integer) if integer != "0" else "零" + + jiao, fen = frac[0], frac[1] + parts = [] + if integer != "0": + parts.append(integer_cn + "美元") + + if jiao != "0" or fen != "0": + cents = "" + if jiao != "0": + cents += num2str(jiao) + "十" + if fen != "0": + cents += num2str(fen) + cents = cents.replace("一十", "十") + parts.append(cents + "美分") + elif had_frac: + parts.append("整") + elif integer == "0": + parts = ["零美元"] + + res = "".join(parts) + if neg and res and res[0] != "负": + res = "负" + res + return res + +def replace_usd_symbol(m, num2str=num2str): + return replace_usd_amount(m.group(1), num2str) + +def replace_usd_suffix(m, num2str=num2str): + return replace_usd_amount(m.group(1), num2str) From 8c0cb0d691554d8311d8904972ae3efa8bfd1cc4 Mon Sep 17 00:00:00 2001 From: Ella Zhang <144317607+EllaZhangCA@users.noreply.github.com> Date: Tue, 23 Sep 2025 02:50:39 -0700 Subject: [PATCH 3/3] =?UTF-8?q?=E4=B8=BA=E4=B8=AD=E6=96=87=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E4=BA=86=E8=B4=A7=E5=B8=81=E8=AE=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- GPT_SoVITS/text/cleaner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py index 7c13023c..388b4e16 100644 --- a/GPT_SoVITS/text/cleaner.py +++ b/GPT_SoVITS/text/cleaner.py @@ -13,7 +13,7 @@ from text import symbols2 as symbols_v2 special = [ # ("%", "zh", "SP"), - ("¥", "zh", "SP2"), + # ("¥", "zh", "SP2"), #加了货币计数所以人民币符不是SP2了 ("^", "zh", "SP3"), # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 ] @@ -31,7 +31,7 @@ def clean_text(text, language, version=None): if language not in language_module_map: language = "en" text = " " - if language in ("zh"): #处理货币似乎只能这里截胡,不然货币符号会被吞 + if language in ("zh"): #处理货币似乎最佳方案是这里截胡,不然可能被吞... from text.zh_normalization.num import ( RE_CNY_PREFIX, RE_CNY_SUFFIX, replace_cny_prefix, replace_cny_suffix, RE_USD_SYMBOL, RE_USD_SUFFIX, replace_usd_symbol, replace_usd_suffix,)