diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py
index 7ba8f376..388b4e16 100644
--- a/GPT_SoVITS/text/cleaner.py
+++ b/GPT_SoVITS/text/cleaner.py
@@ -10,14 +10,14 @@ import os
 from text import symbols as symbols_v1
 from text import symbols2 as symbols_v2
 
+
 special = [
     # ("%", "zh", "SP"),
-    ("¥", "zh", "SP2"),
+    # ("¥", "zh", "SP2"),  # "¥" now marks a CNY amount, so it is no longer SP2
     ("^", "zh", "SP3"),
     # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
 ]
 
-
 def clean_text(text, language, version=None):
     if version is None:
         version = os.environ.get("version", "v2")
@@ -31,6 +31,14 @@ def clean_text(text, language, version=None):
     if language not in language_module_map:
         language = "en"
         text = " "
+    if language == "zh":  # intercept currency amounts here; later steps may otherwise swallow them
+        from text.zh_normalization.num import (
+            RE_CNY_PREFIX, RE_CNY_SUFFIX, replace_cny_prefix, replace_cny_suffix,
+            RE_USD_SYMBOL, RE_USD_SUFFIX, replace_usd_symbol, replace_usd_suffix,)
+        text = RE_CNY_PREFIX.sub(replace_cny_prefix, text)
+        text = RE_CNY_SUFFIX.sub(replace_cny_suffix, text)
+        text = RE_USD_SYMBOL.sub(replace_usd_symbol, text)
+        text = RE_USD_SUFFIX.sub(replace_usd_suffix, text)
     for special_s, special_l, target_symbol in special:
         if special_s in text and language == special_l:
             return clean_special(text, language, special_s, target_symbol, version)
diff --git a/GPT_SoVITS/text/zh_normalization/num.py b/GPT_SoVITS/text/zh_normalization/num.py
index 14d602b0..3aed785f 100644
--- a/GPT_SoVITS/text/zh_normalization/num.py
+++ b/GPT_SoVITS/text/zh_normalization/num.py
@@ -337,3 +337,114 @@ def num2str(value_string: str) -> str:
         result = result if result else "零"
         result += "点" + verbalize_digit(decimal)
     return result
+
+
+# Currency amounts are rewritten from text/cleaner.py (not from the normalizer
+# in this package) so that the currency symbols are not swallowed beforehand.
+# \uFFE5 and \uFF04 are the fullwidth yen and dollar signs, written as escapes
+# because a bare "$" in a pattern is the end-of-string anchor, which would turn
+# every number at the end of the text into a dollar amount.
+_RE_AMOUNT = r"(-?\d[\d,]*(?:\.\d+)?)"
+RE_CNY_PREFIX = re.compile(r"(?:¥|\uFFE5)\s*" + _RE_AMOUNT)
+RE_CNY_SUFFIX = re.compile(_RE_AMOUNT + r"\s*(?:人民币|元|CNY|cny|¥|\uFFE5)")
+# "$" is always read as US dollars, although other currencies use it too.
+RE_USD_SYMBOL = re.compile(r"(?:\$|\uFF04)\s*" + _RE_AMOUNT)
+RE_USD_SUFFIX = re.compile(_RE_AMOUNT + r"\s*(?:美元|USD|usd|\$|\uFF04)")
+
+
+def _strip_commas(s: str) -> str:
+    """Remove thousands separators from a numeric string."""
+    return s.replace(",", "")
+
+
+def _split_amount(amount: str):
+    """Split a matched amount into ``(neg, integer, frac, had_frac)``.
+
+    ``frac`` is always two digits (zero-padded or truncated, never rounded) so
+    that it can be read digit by digit; ``had_frac`` tells whether the input
+    contained a decimal point at all.
+    """
+    neg = amount.startswith("-")
+    if neg:
+        amount = amount[1:]
+    amount = _strip_commas(amount) or "0"
+
+    integer, dot, frac = amount.partition(".")
+    integer = integer or "0"
+    frac = (frac + "00")[:2]
+    return neg, integer, frac, bool(dot)
+
+
+def replace_cny_amount(amount: str, num2str) -> str:
+    """Read a renminbi amount such as ``"1,234.56"`` as yuan, jiao and fen.
+
+    ``num2str`` is the callable that turns a digit string into Chinese.
+    """
+    neg, integer, frac, had_frac = _split_amount(amount)
+    jiao, fen = frac
+
+    parts = [(num2str(integer) if integer != "0" else "零") + "元"]
+    if jiao != "0":
+        parts.append(num2str(jiao) + "角")
+    if fen != "0":
+        parts.append(num2str(fen) + "分")
+    if had_frac and jiao == "0" and fen == "0":
+        parts.append("整")  # "x.00" is read as a round amount
+
+    res = "".join(parts)
+    if neg and not res.startswith("负"):
+        res = "负" + res
+    return res
+
+
+def replace_cny_prefix(m, num2str=num2str):
+    """``re.sub`` callback for ``RE_CNY_PREFIX``."""
+    return replace_cny_amount(m.group(1), num2str)
+
+
+def replace_cny_suffix(m, num2str=num2str):
+    """``re.sub`` callback for ``RE_CNY_SUFFIX``."""
+    return replace_cny_amount(m.group(1), num2str)
+
+
+def replace_usd_amount(amount: str, num2str) -> str:
+    """Read a US dollar amount such as ``"1,234.56"`` as dollars and cents.
+
+    ``num2str`` is the callable that turns a digit string into Chinese.
+    """
+    neg, integer, frac, had_frac = _split_amount(amount)
+    tens, ones = frac
+    has_cents = tens != "0" or ones != "0"
+
+    parts = []
+    if integer != "0":
+        parts.append(num2str(integer) + "美元")
+    elif not has_cents:
+        # Say "zero dollars" for "0" and "0.00"; "0.05" is only "five cents".
+        parts.append("零美元")
+
+    if has_cents:
+        cents = ""
+        if tens != "0":
+            cents += num2str(tens) + "十"
+        if ones != "0":
+            cents += num2str(ones)
+        # 10-19 cents are read "十..." rather than "一十...".
+        parts.append(cents.replace("一十", "十") + "美分")
+    elif had_frac:
+        parts.append("整")  # "x.00" is read as a round amount
+
+    res = "".join(parts)
+    if neg and not res.startswith("负"):
+        res = "负" + res
+    return res
+
+
+def replace_usd_symbol(m, num2str=num2str):
+    """``re.sub`` callback for ``RE_USD_SYMBOL``."""
+    return replace_usd_amount(m.group(1), num2str)
+
+
+def replace_usd_suffix(m, num2str=num2str):
+    """``re.sub`` callback for ``RE_USD_SUFFIX``."""
+    return replace_usd_amount(m.group(1), num2str)