# NOTE: the lines below are residue from the code-hosting web view, not program text.
# 2025-09-23 02:48:30 -07:00
#
# 453 lines
# 13 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""
import re
from collections import OrderedDict
from typing import List
# Chinese numeral for each ASCII decimal digit, keyed by the digit character.
DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}

# Magnitude words keyed by their power of ten (10**1 .. 10**4 and 10**8).
# Keys are inserted in ascending order; _get_value iterates them in reverse
# to find the largest unit that fits a number.
UNITS = OrderedDict(
    {
        1: "十",
        2: "百",
        3: "千",
        4: "万",
        8: "亿",
    }
)
# Regex fragment: one capturing group holding an alternation of Chinese
# measure words / units (some alternatives contain nested capturing groups for
# magnitude or metric prefixes). It is concatenated onto the numeric prefix
# pattern to build RE_POSITIVE_QUANTIFIERS.
# NOTE(review): regex alternation takes the first alternative that matches, so
# the order of entries is significant; several entries are repeated.
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
# Fraction expressions, e.g. "1/2" or "-3/4".
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")


def replace_frac(match) -> str:
    """Verbalize a fraction matched by RE_FRAC.

    Chinese reads the denominator first: "<denominator>分之<numerator>".

    Args:
        match (re.Match): groups are (sign, numerator, denominator).
    Returns:
        str: the spoken form, prefixed with "负" when a minus sign was matched.
    """
    # The negative word had been lost, which silently dropped the minus sign.
    sign = "负" if match.group(1) else ""
    nominator = num2str(match.group(2))
    denominator = num2str(match.group(3))
    return f"{sign}{denominator}分之{nominator}"
# Percentage expressions, e.g. "12.5%" or "-3%".
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")


def replace_percentage(match) -> str:
    """Verbalize a percentage matched by RE_PERCENTAGE.

    Args:
        match (re.Match): group 1 is the optional minus sign, group 2 the number.
    Returns:
        str: "百分之<number>", prefixed with "负" when a minus sign was matched.
    """
    # The negative word had been lost, which silently dropped the minus sign.
    sign = "负" if match.group(1) else ""
    percent = num2str(match.group(2))
    return f"{sign}百分之{percent}"
# Integer expressions
# Integer with an explicit minus sign, e.g. -10
RE_INTEGER = re.compile(r"(-)" r"(\d+)")


def replace_negative_num(match) -> str:
    """Verbalize a negative integer matched by RE_INTEGER.

    Args:
        match (re.Match): group 1 is the minus sign, group 2 the digits.
    Returns:
        str: "负" followed by the verbalized number.
    """
    # The negative word had been lost, which silently dropped the minus sign.
    sign = "负" if match.group(1) else ""
    number = num2str(match.group(2))
    return f"{sign}{number}"
# Identifier-like unsigned digit strings (three or more digits), e.g. 00078
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")


def replace_default_num(match):
    """Read a matched digit string digit by digit.

    Args:
        match (re.Match): the whole match is the digit string.
    Returns:
        str: result of verbalize_digit called with alt_one=True.
    """
    return verbalize_digit(match.group(0), alt_one=True)
# Addition, subtraction, multiplication, division (and equality).
# Earlier digits-only pattern, kept for reference:
# RE_ASMD = re.compile(
# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
# Operands may be a number, a bare decimal or a single Latin letter, each
# optionally followed by superscript characters.
# Group 1 is the left operand, group 8 the operator, group 9 the right operand.
RE_ASMD = re.compile(
    r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)

# Spoken word for each operator. The four arithmetic words had been lost,
# which made the operators vanish from the output.
asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}


def replace_asmd(match) -> str:
    """Replace the operator of a binary expression with its spoken word.

    The operands are returned unchanged; only the operator is verbalized.

    Args:
        match (re.Match): a match of RE_ASMD.
    Returns:
        str: "<left operand><operator word><right operand>".
    """
    return match.group(1) + asmd_map[match.group(8)] + match.group(9)
# Exponents written as a run of superscript characters.
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")

# Plain-text equivalent of every superscript accepted by RE_POWER.
# Several keys had collapsed into duplicate empty strings, so any of
# ⁰ ⁴ ⁵ ⁶ ⁷ ⁸ ⁹ ⁿ raised KeyError in replace_power.
power_map = {
    "⁰": "0",
    "¹": "1",
    "²": "2",
    "³": "3",
    "⁴": "4",
    "⁵": "5",
    "⁶": "6",
    "⁷": "7",
    "⁸": "8",
    "⁹": "9",
    "ˣ": "x",
    "ʸ": "y",
    "ⁿ": "n",
}


def replace_power(match) -> str:
    """Rewrite a superscript run as "的<exponent>次方".

    The exponent is emitted as plain digits/letters (e.g. "²" -> "2"), not
    verbalized here.

    Args:
        match (re.Match): a match of RE_POWER.
    Returns:
        str: "的" + plain exponent + "次方".
    """
    power_num = "".join(power_map[ch] for ch in match.group(0))
    return "的" + power_num + "次方"
# Number expressions
# Decimal: digits on both sides of the point, or a bare ".5" form
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# Positive integer + optional approximation marker (多/余/几/+) + quantifier
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
# Signed integer or decimal, or a bare ".5" form (group 5)
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")


def replace_positive_quantifier(match) -> str:
    """Verbalize "<integer>[marker]<quantifier>" matched by RE_POSITIVE_QUANTIFIERS.

    Args:
        match (re.Match): group 1 is the integer, group 2 the optional
            approximation marker, group 3 the quantifier.
    Returns:
        str: verbalized number + marker + quantifier.
    """
    marker = match.group(2)
    if marker == "+":
        # A trailing "+" ("10+个") is spoken as 多 ("more than").
        marker = "多"
    marker = marker if marker else ""
    quantifiers: str = match.group(3)
    number: str = num2str(match.group(1))
    # A lone "two" before a measure word is read 两 rather than 二.
    number = "两" if number == "二" else number
    return f"{number}{marker}{quantifiers}"
def replace_number(match) -> str:
    """Verbalize a number matched by RE_NUMBER.

    Args:
        match (re.Match): group 1 is the optional minus sign, group 2 the
            integer/decimal text; group 5 is set instead for a bare ".5" form.
    Returns:
        str: the spoken number, prefixed with "负" when a minus sign was matched.
    """
    pure_decimal = match.group(5)
    if pure_decimal:
        # Bare ".5" form: no sign group takes part in this alternative.
        return num2str(pure_decimal)
    # The negative word had been lost, which silently dropped the minus sign.
    sign = "负" if match.group(1) else ""
    number = num2str(match.group(2))
    return f"{sign}{number}"
# Range expressions, e.g. "1-2" or "3~5".
# match.group(1) and match.group(6) are the two endpoints; each endpoint
# sub-pattern is copied from the first alternative of RE_NUMBER.
# The lookbehind/lookahead reject ranges that touch another digit or an
# arithmetic operator. The pattern text (including its in-pattern comments)
# is kept unchanged.
RE_RANGE = re.compile(
    r"""
(?<![\d\+\-\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
((-?)((\d+)(\.\d+)?)) # 匹配范围起始的负数或正数(整数或小数)
[-~] # 匹配范围分隔符
((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
(?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
""",
    re.VERBOSE,
)


def replace_range(match) -> str:
    """Verbalize a numeric range as "<first>到<second>".

    Args:
        match (re.Match): a match of RE_RANGE.
    Returns:
        str: both endpoints verbalized via RE_NUMBER/replace_number, joined
            by "到" ("to"). The joining word had been lost, which glued the two
            numbers together.
    """
    first = RE_NUMBER.sub(replace_number, match.group(1))
    second = RE_NUMBER.sub(replace_number, match.group(6))
    return f"{first}到{second}"
# "~" between two quantities that both carry a unit, e.g. "10cm~20cm";
# the tilde is read as 至 ("to").
RE_TO_RANGE = re.compile(
    r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)


def replace_to_range(match) -> str:
    """Replace the "~" of a unit-bearing range with "至".

    Numbers and units are left untouched for later rules.

    Args:
        match (re.Match): a match of RE_TO_RANGE.
    Returns:
        str: the matched text with "~" replaced by "至". The replacement word
            had been lost, which simply deleted the tilde.
    """
    return match.group(0).replace("~", "至")
# Dotted version numbers with at least three components, e.g. "1.2.3".
RE_VERSION_NUM = re.compile(r"((\d+)(\.\d+)(\.\d+)?(\.\d+)+)")


def replace_vrsion_num(match) -> str:
    """Read a version number character by character.

    Args:
        match (re.Match): a match of RE_VERSION_NUM; group 1 is the whole version.
    Returns:
        str: every digit verbalized on its own and every "." read as "点".
            The word for the point had been lost, which ran the digits together.
    """
    return "".join("点" if c == "." else num2str(c) for c in match.group(1))
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """Recursively split a digit string into numeral and unit symbols.

    Args:
        value_string: digit string, possibly with leading zeros.
        use_zero: when the value is a single digit preceded by zeros, emit a
            leading "zero" symbol before it.
    Returns:
        List[str]: symbols in reading order; empty when the string is all zeros.
    """
    significant = value_string.lstrip("0")
    n_digits = len(significant)
    if n_digits == 0:
        return []
    if n_digits == 1:
        had_leading_zero = n_digits < len(value_string)
        if use_zero and had_leading_zero:
            return [DIGITS["0"], DIGITS[significant]]
        return [DIGITS[significant]]
    # Largest unit whose power of ten is below the number of significant digits.
    largest_unit = next(power for power in reversed(UNITS.keys()) if power < n_digits)
    # Split on the unpadded input so leading zeros of the low part survive
    # and can produce the "zero" symbol in the recursive call.
    high_part = value_string[:-largest_unit]
    low_part = value_string[-largest_unit:]
    symbols = _get_value(high_part)
    symbols.append(UNITS[largest_unit])
    symbols.extend(_get_value(low_part))
    return symbols
def verbalize_cardinal(value_string: str) -> str:
    """Verbalize a non-negative integer digit string as a cardinal number.

    Args:
        value_string: digit string; may be empty or have leading zeros.
    Returns:
        str: "" for empty input, the zero numeral for all-zero input,
            otherwise the joined symbols from _get_value.
    """
    if not value_string:
        return ""
    # "000" and "0" are both read as a single zero.
    digits = value_string.lstrip("0")
    if not digits:
        return DIGITS["0"]
    symbols = _get_value(digits)
    # A leading "one" directly followed by the tens unit is dropped,
    # so the number starts with the tens unit itself.
    leading_one_ten = (
        len(symbols) >= 2 and symbols[0] == DIGITS["1"] and symbols[1] == UNITS[1]
    )
    if leading_one_ten:
        symbols = symbols[1:]
    return "".join(symbols)
def verbalize_digit(value_string: str, alt_one=False) -> str:
    """Read a digit string one digit at a time.

    Args:
        value_string: string consisting of decimal digits only (any other
            character raises KeyError from the DIGITS lookup).
        alt_one: when true, read "一" as "幺", the alternative spoken form used
            for identifiers.
    Returns:
        str: the concatenated numerals.
    """
    result = "".join(DIGITS[digit] for digit in value_string)
    if alt_one:
        # Both literals had been lost, turning this into replace("", ""), a no-op.
        result = result.replace("一", "幺")
    return result
def num2str(value_string: str) -> str:
    """Verbalize an unsigned integer or decimal string.

    Args:
        value_string: digits with at most one ".", e.g. "12", "3.20", ".22".
    Returns:
        str: the cardinal reading of the integer part, followed by "点" and a
            digit-by-digit reading of the fractional part when one remains.
    Raises:
        ValueError: if value_string contains more than one ".".
    """
    pieces = value_string.split(".")
    if len(pieces) > 2:
        # The message previously contained a stray "$" before the value.
        raise ValueError(f"The value string: '{value_string}' has more than one point in it.")
    integer = pieces[0]
    decimal = pieces[1] if len(pieces) == 2 else ""

    result = verbalize_cardinal(integer)

    # Collapse a run of trailing zeros to exactly one zero: "3.200" -> "3.20",
    # "3.0" stays "3.0". Fractions not ending in zero are left untouched.
    if decimal.endswith("0"):
        decimal = decimal.rstrip("0") + "0"

    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二零'
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result
# CNY amounts. The currency sign may be the half-width yen sign (U+00A5) or the
# full-width one (U+FFE5). The pattern previously listed the same sign twice;
# the second is written as an escape so it cannot be lost by tools that drop
# or normalize the full-width glyph.
# Group 1 is the amount: optional minus, digits with optional thousands
# commas, optional decimal part.
RE_CNY_PREFIX = re.compile(r"(?:¥|\uFFE5)\s*(-?\d[\d,]*(?:\.\d+)?)")
RE_CNY_SUFFIX = re.compile(r"(-?\d[\d,]*(?:\.\d+)?)(?:\s*(?:人民币|元|CNY|cny|¥|\uFFE5))")
def _strip_commas(s: str) -> str:
return s.replace(",", "")
def _split_amount(amount: str):
    """Split a money amount string into sign, integer part and two fraction digits.

    Args:
        amount: amount text such as "-1,234.5".
    Returns:
        tuple: (neg, integer, frac, had_frac) where neg is True for a leading
            "-", integer is the comma-free integer digits ("0" when empty),
            frac is exactly two digits (zero-padded or truncated), and had_frac
            tells whether the input contained a decimal point.
    """
    neg = amount.startswith("-")
    unsigned = amount[1:] if neg else amount
    unsigned = _strip_commas(unsigned) or "0"
    integer, point, frac = unsigned.partition(".")
    had_frac = bool(point)
    if not integer:
        integer = "0"
    # Pad to at least two digits, then keep only the first two.
    frac = f"{frac}00"[:2]
    return neg, integer, frac, had_frac
# NOTE(review): per the original author's note, CNY and USD amounts are
# protected on the cleaner side from being consumed by other rules
# — confirm against the caller.
def replace_cny_amount(amount: str, num2str) -> str:
    """Verbalize a CNY amount as yuan / jiao / fen.

    The unit words (元, 角, 分), the "exact" marker (整) and the negative word
    (负) had been lost, so units and sign were missing from the output.

    Args:
        amount: amount text, e.g. "1,234.50" or "-0.05".
        num2str: callable turning a digit string into its Chinese reading.
    Returns:
        str: e.g. "<n>元<j>角<f>分"; "整" is appended when a decimal part was
            written but is zero; "负" is prefixed for negative amounts.
    """
    neg, integer, frac, had_frac = _split_amount(amount)
    jiao, fen = frac[0], frac[1]

    # The yuan part is always spoken, including a zero amount.
    parts = [num2str(integer) + "元" if integer != "0" else "零元"]
    if jiao != "0":
        parts.append(num2str(jiao) + "角")
    if fen != "0":
        parts.append(num2str(fen) + "分")
    if jiao == "0" and fen == "0" and had_frac:
        # "5.00" -> explicit "exactly" marker.
        parts.append("整")

    res = "".join(parts)
    if neg and not res.startswith("负"):
        res = "负" + res
    return res
def replace_cny_prefix(m, num2str=num2str):
    """Verbalize the amount captured in group 1 of *m* as CNY.

    Presumably used as the substitution callback for RE_CNY_PREFIX.

    Args:
        m (re.Match): match whose group 1 is the amount text.
        num2str: number verbalizer passed through to replace_cny_amount.
    Returns:
        str: result of replace_cny_amount.
    """
    amount_text = m.group(1)
    return replace_cny_amount(amount_text, num2str)
def replace_cny_suffix(m, num2str=num2str):
    """Verbalize the amount captured in group 1 of *m* as CNY.

    Presumably used as the substitution callback for RE_CNY_SUFFIX.

    Args:
        m (re.Match): match whose group 1 is the amount text.
        num2str: number verbalizer passed through to replace_cny_amount.
    Returns:
        str: result of replace_cny_amount.
    """
    amount_text = m.group(1)
    return replace_cny_amount(amount_text, num2str)
# The dollar sign could also denote other dollar currencies (e.g. CAD), but it
# is simply treated as USD here.
# The second alternative had been lost, leaving an empty branch ("\$|") that
# made the currency marker optional, so every bare number matched as a USD
# amount. It is restored as the full-width dollar sign (U+FF04), written as an
# escape so it cannot be dropped again.
# Group 1 is the amount: optional minus, digits with optional thousands
# commas, optional decimal part.
RE_USD_SYMBOL = re.compile(r"(?:\$|\uFF04)\s*(-?\d[\d,]*(?:\.\d+)?)")
RE_USD_SUFFIX = re.compile(r"(-?\d[\d,]*(?:\.\d+)?)(?:\s*(?:美元|USD|usd|\$|\uFF04))")
def _strip_commas(s: str) -> str:
return s.replace(",", "")
# NOTE(review): this rebinds _split_amount with the same behavior as the
# definition earlier in this module; one of the two could be removed.
def _split_amount(amount: str):
    """Split a money amount string into sign, integer part and two fraction digits.

    Args:
        amount: amount text such as "-1,234.5".
    Returns:
        tuple: (neg, integer, frac, had_frac) where neg is True for a leading
            "-", integer is the comma-free integer digits ("0" when empty),
            frac is exactly two digits (zero-padded or truncated), and had_frac
            tells whether the input contained a decimal point.
    """
    neg = amount.startswith("-")
    unsigned = amount[1:] if neg else amount
    unsigned = _strip_commas(unsigned) or "0"
    integer, point, frac = unsigned.partition(".")
    had_frac = bool(point)
    if not integer:
        integer = "0"
    # Keep only two fraction digits; they are what gets read as cents.
    frac = f"{frac}00"[:2]
    return neg, integer, frac, had_frac
def replace_usd_amount(amount: str, num2str) -> str:
    """Verbalize a USD amount as dollars and cents.

    The tens word (十), the "exact" marker (整) and the negative word (负) had
    been lost. In addition, an amount such as "0.00" used to produce only the
    "exact" marker with no amount; it is now read as zero dollars.

    Args:
        amount: amount text, e.g. "1,234.50" or "-0.05".
        num2str: callable turning a digit string into its Chinese reading.
    Returns:
        str: "<n>美元" and/or "<c>美分"; "整" is appended when a decimal part
            was written but is zero; "负" is prefixed for negative amounts.
    """
    neg, integer, frac, had_frac = _split_amount(amount)
    tens, ones = frac[0], frac[1]
    has_cents = tens != "0" or ones != "0"

    parts = []
    if integer != "0":
        parts.append(num2str(integer) + "美元")
    elif not has_cents:
        # Nothing but zeros: still say "zero dollars".
        parts.append("零美元")

    if has_cents:
        cents = ""
        if tens != "0":
            cents += num2str(tens) + "十"
        if ones != "0":
            cents += num2str(ones)
        # 10-19 cents are read without the leading "one".
        cents = cents.replace("一十", "十")
        parts.append(cents + "美分")
    elif had_frac:
        # "5.00" -> explicit "exactly" marker.
        parts.append("整")

    res = "".join(parts)
    if neg and not res.startswith("负"):
        res = "负" + res
    return res
def replace_usd_symbol(m, num2str=num2str):
    """Verbalize the amount captured in group 1 of *m* as USD.

    Presumably used as the substitution callback for RE_USD_SYMBOL.

    Args:
        m (re.Match): match whose group 1 is the amount text.
        num2str: number verbalizer passed through to replace_usd_amount.
    Returns:
        str: result of replace_usd_amount.
    """
    amount_text = m.group(1)
    return replace_usd_amount(amount_text, num2str)
def replace_usd_suffix(m, num2str=num2str):
    """Verbalize the amount captured in group 1 of *m* as USD.

    Presumably used as the substitution callback for RE_USD_SUFFIX.

    Args:
        m (re.Match): match whose group 1 is the amount text.
        num2str: number verbalizer passed through to replace_usd_amount.
    Returns:
        str: result of replace_usd_amount.
    """
    amount_text = m.group(1)
    return replace_usd_amount(amount_text, num2str)