Merge 8c0cb0d691554d8311d8904972ae3efa8bfd1cc4 into 11aa78bd9bda8b53047cfcae03abf7ca94d27391

This commit is contained in:
Ella Zhang 2025-09-23 03:24:21 -07:00 committed by GitHub
commit 3d28749110
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 123 additions and 2 deletions

View File

@ -10,14 +10,14 @@ import os
from text import symbols as symbols_v1 from text import symbols as symbols_v1
from text import symbols2 as symbols_v2 from text import symbols2 as symbols_v2
special = [ special = [
# ("%", "zh", "SP"), # ("%", "zh", "SP"),
("", "zh", "SP2"), # ("¥", "zh", "SP2"), #加了货币计数所以人民币符不是SP2了
("^", "zh", "SP3"), ("^", "zh", "SP3"),
# ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧 # ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧
] ]
def clean_text(text, language, version=None): def clean_text(text, language, version=None):
if version is None: if version is None:
version = os.environ.get("version", "v2") version = os.environ.get("version", "v2")
@ -31,6 +31,14 @@ def clean_text(text, language, version=None):
if language not in language_module_map: if language not in language_module_map:
language = "en" language = "en"
text = " " text = " "
if language in ("zh"): #处理货币似乎最佳方案是这里截胡,不然可能被吞...
from text.zh_normalization.num import (
RE_CNY_PREFIX, RE_CNY_SUFFIX, replace_cny_prefix, replace_cny_suffix,
RE_USD_SYMBOL, RE_USD_SUFFIX, replace_usd_symbol, replace_usd_suffix,)
text = RE_CNY_PREFIX.sub(replace_cny_prefix, text)
text = RE_CNY_SUFFIX.sub(replace_cny_suffix, text)
text = RE_USD_SYMBOL.sub(replace_usd_symbol, text)
text = RE_USD_SUFFIX.sub(replace_usd_suffix, text)
for special_s, special_l, target_symbol in special: for special_s, special_l, target_symbol in special:
if special_s in text and language == special_l: if special_s in text and language == special_l:
return clean_special(text, language, special_s, target_symbol, version) return clean_special(text, language, special_s, target_symbol, version)

View File

@ -337,3 +337,116 @@ def num2str(value_string: str) -> str:
result = result if result else "" result = result if result else ""
result += "" + verbalize_digit(decimal) result += "" + verbalize_digit(decimal)
return result return result
# Renminbi amounts.  Group 1 captures the number: optional minus sign, digits
# with optional thousands separators, optional decimal part.
# The yen/yuan sign is written with escapes so that both the halfwidth form
# (U+00A5) and the fullwidth form (U+FFE5) are matched explicitly.
RE_CNY_PREFIX = re.compile(r"[\u00a5\uffe5]\s*(-?\d[\d,]*(?:\.\d+)?)")
RE_CNY_SUFFIX = re.compile(
    r"(-?\d[\d,]*(?:\.\d+)?)\s*(?:人民币|元|CNY|cny|[\u00a5\uffe5])"
)
def _strip_commas(s: str) -> str:
return s.replace(",", "")
def _split_amount(amount: str):
neg = amount.startswith("-")
if neg:
amount = amount[1:]
amount = _strip_commas(amount) or "0"
if "." in amount:
integer, frac = amount.split(".", 1)
had_frac = True
else:
integer, frac, had_frac = amount, "", False
integer = integer or "0"
frac = (frac + "00")[:2]
return neg, integer, frac, had_frac
#人民币和美元的处理都在cleaner那边防吞
def replace_cny_amount(amount: str, num2str) -> str:
neg, integer, frac, had_frac = _split_amount(amount)
integer_cn = num2str(integer) if integer != "0" else ""
jiao, fen = frac[0], frac[1]
parts = []
if integer != "0":
parts.append(integer_cn + "")
else:
parts.append("零元")
if jiao != "0" or fen != "0":
if jiao != "0":
parts.append(num2str(jiao) + "")
if fen != "0":
parts.append(num2str(fen) + "")
elif had_frac:
parts.append("")
res = "".join(parts)
if neg and res and res[0] != "":
res = "" + res
return res
def replace_cny_prefix(m, num2str=num2str):
    """Substitution callback: read the amount in group 1 of *m* as renminbi.

    Args:
        m: Regex match whose first group holds the numeric amount.
        num2str: Callable turning a digit string into its Chinese reading.

    Returns:
        The verbalized amount produced by ``replace_cny_amount``.
    """
    amount = m.group(1)
    return replace_cny_amount(amount, num2str)
def replace_cny_suffix(m, num2str=num2str):
    """Substitution callback: read the amount in group 1 of *m* as renminbi.

    Args:
        m: Regex match whose first group holds the numeric amount.
        num2str: Callable turning a digit string into its Chinese reading.

    Returns:
        The verbalized amount produced by ``replace_cny_amount``.
    """
    amount = m.group(1)
    return replace_cny_amount(amount, num2str)
# The dollar sign can also denote other dollars (e.g. Canadian), but it is
# always read as US dollars here.
# Group 1 captures the number.  The currency mark is mandatory: the character
# class lists the ASCII dollar sign and its fullwidth form (U+FF04), so a bare
# number without any mark never matches.
RE_USD_SYMBOL = re.compile(r"[$\uff04]\s*(-?\d[\d,]*(?:\.\d+)?)")
RE_USD_SUFFIX = re.compile(
    r"(-?\d[\d,]*(?:\.\d+)?)\s*(?:美元|USD|usd|[$\uff04])"
)
def _strip_commas(s: str) -> str:
return s.replace(",", "")
def _split_amount(amount: str):
neg = amount.startswith("-")
if neg:
amount = amount[1:]
amount = _strip_commas(amount) or "0"
if "." in amount:
integer, frac = amount.split(".", 1)
had_frac = True
else:
integer, frac, had_frac = amount, "", False
integer = integer or "0"
# 只保留两位小数用来读美分
frac = (frac + "00")[:2]
return neg, integer, frac, had_frac
def replace_usd_amount(amount: str, num2str) -> str:
neg, integer, frac, had_frac = _split_amount(amount)
integer_cn = num2str(integer) if integer != "0" else ""
jiao, fen = frac[0], frac[1]
parts = []
if integer != "0":
parts.append(integer_cn + "美元")
if jiao != "0" or fen != "0":
cents = ""
if jiao != "0":
cents += num2str(jiao) + ""
if fen != "0":
cents += num2str(fen)
cents = cents.replace("一十", "")
parts.append(cents + "美分")
elif had_frac:
parts.append("")
elif integer == "0":
parts = ["零美元"]
res = "".join(parts)
if neg and res and res[0] != "":
res = "" + res
return res
def replace_usd_symbol(m, num2str=num2str):
    """Substitution callback: read the amount in group 1 of *m* as US dollars.

    Args:
        m: Regex match whose first group holds the numeric amount.
        num2str: Callable turning a digit string into its Chinese reading.

    Returns:
        The verbalized amount produced by ``replace_usd_amount``.
    """
    amount = m.group(1)
    return replace_usd_amount(amount, num2str)
def replace_usd_suffix(m, num2str=num2str):
    """Substitution callback: read the amount in group 1 of *m* as US dollars.

    Args:
        m: Regex match whose first group holds the numeric amount.
        num2str: Callable turning a digit string into its Chinese reading.

    Returns:
        The verbalized amount produced by ``replace_usd_amount``.
    """
    amount = m.group(1)
    return replace_usd_amount(amount, num2str)