GPT-SoVITS/GPT_SoVITS/text/zh_normalization/num.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""

import re
from collections import OrderedDict
from typing import List

# Arabic digit character -> Chinese numeral character.
DIGITS = dict(zip("0123456789", "零一二三四五六七八九"))

# Power of ten -> Chinese unit character, listed in ascending order.
UNITS = OrderedDict(
    [
        (1, "十"),
        (2, "百"),
        (3, "千"),
        (4, "万"),
        (8, "亿"),
    ]
)

# Regex alternation of measure words / units that may follow a number.
# The outer parentheses make the whole alternation one capture group; it is
# appended to the pattern of RE_POSITIVE_QUANTIFIERS.
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"

# Fraction expressions such as "1/2" or "-3/4".
# Groups: 1 = optional minus sign, 2 = numerator, 3 = denominator.
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")


def replace_frac(match) -> str:
    """
    Read a matched fraction as "<denominator>分之<numerator>", prefixed with
    "负" when the sign group is non-empty.

    Args:
        match (re.Match): group 1 = sign, group 2 = numerator,
            group 3 = denominator.
    Returns:
        str
    """
    minus, top, bottom = match.group(1, 2, 3)
    prefix = "负" if minus else ""
    top_cn = num2str(top)
    bottom_cn = num2str(bottom)
    # Chinese names the denominator first.
    return f"{prefix}{bottom_cn}分之{top_cn}"


# Percentage expressions such as "5%", "-12.5%".
# Groups: 1 = optional minus sign, 2 = the number (integer or decimal).
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")


def replace_percentage(match) -> str:
    """
    Read a matched percentage as "百分之<number>", prefixed with "负" when
    the sign group is non-empty.

    Args:
        match (re.Match): group 1 = sign, group 2 = number.
    Returns:
        str
    """
    minus, value = match.group(1, 2)
    prefix = "负" if minus else ""
    spoken = num2str(value)
    return f"{prefix}百分之{spoken}"


# Integer expressions
# Integer with a leading minus sign, e.g. -10
# Groups: 1 = the minus sign, 2 = the digits.
RE_INTEGER = re.compile(r"(-)" r"(\d+)")


def replace_negative_num(match) -> str:
    """
    Read a signed integer: "负" (when the sign group is non-empty) followed
    by the verbalized digits.

    Args:
        match (re.Match): group 1 = sign, group 2 = digits.
    Returns:
        str
    """
    minus, digits = match.group(1, 2)
    prefix = "负" if minus else ""
    spoken = num2str(digits)
    return f"{prefix}{spoken}"


# Serial numbers - unsigned digit strings, e.g. 00078
# Matches any run of three or more digits.
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")


def replace_default_num(match):
    """
    Read the whole match digit by digit, with "一" spoken as "幺"
    (alt_one=True).

    Args:
        match (re.Match)
    Returns:
        str
    """
    digits = match[0]
    spoken = verbalize_digit(digits, alt_one=True)
    return spoken


# Arithmetic: addition, subtraction, multiplication, division (and "=").
# Commented-out variant without superscript / letter operands:
# RE_ASMD = re.compile(
#     r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
# Active pattern: each operand is a signed integer/decimal, a leading-dot
# decimal, or a single ASCII letter, each optionally followed by superscript
# characters. Group 1 = left operand, group 8 = operator, group 9 = right
# operand.
RE_ASMD = re.compile(
    r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)

# Operator symbol -> spoken Chinese word.
asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}


def replace_asmd(match) -> str:
    """
    Replace the operator between two operands with its Chinese word; the
    operands themselves are returned unchanged.

    Args:
        match (re.Match): group 1 = left operand, group 8 = operator,
            group 9 = right operand.
    Returns:
        str
    """
    left, symbol, right = match.group(1, 8, 9)
    return "".join((left, asmd_map[symbol], right))


# Powers: a run of one or more superscript characters.
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")

# Superscript character -> its plain ASCII counterpart.
power_map = dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ", "0123456789xyn"))


def replace_power(match) -> str:
    """
    Turn a run of superscript characters into "的<exponent>次方", where the
    exponent is the run mapped through power_map.

    Args:
        match (re.Match)
    Returns:
        str
    """
    exponent = "".join(power_map[ch] for ch in match.group(0))
    return "的" + exponent + "次方"


# Number expressions
# Decimals: optionally signed "<int>.<frac>", or a leading-dot decimal (".5")
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# Positive integer + optional approximation marker (多/余/几/+) + quantifier
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
# Any number: optionally signed integer/decimal (groups 1-4), or a
# leading-dot decimal (group 5)
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")


def replace_positive_quantifier(match) -> str:
    """
    Read "<number>[marker]<quantifier>": the number is verbalized (a lone
    "二" becomes "两"), a "+" marker is spoken as "多", and the quantifier is
    kept as matched.

    Args:
        match (re.Match): group 1 = digits, group 2 = optional marker,
            group 3 = quantifier.
    Returns:
        str
    """
    digits, marker, quantifiers = match.group(1, 2, 3)
    if marker == "+":
        marker = "多"
    elif not marker:
        marker = ""
    spoken = num2str(digits)
    if spoken == "二":
        spoken = "两"
    return f"{spoken}{marker}{quantifiers}"


def replace_number(match) -> str:
    """
    Read a number: a leading-dot decimal (group 5) is verbalized as is;
    otherwise the number in group 2 is verbalized and prefixed with "负"
    when the sign group is non-empty.

    Args:
        match (re.Match)
    Returns:
        str
    """
    leading_dot = match.group(5)
    if leading_dot:
        return num2str(leading_dot)
    minus, value = match.group(1, 2)
    prefix = "负" if minus else ""
    spoken = num2str(value)
    return f"{prefix}{spoken}"


# Range expressions, e.g. "1-10" or "2.5~3.5".
# The two endpoints are groups 1 and 6; each reuses the signed
# integer/decimal sub-pattern of RE_NUMBER.
# Pattern parts, in order: a negative lookbehind rejecting a preceding digit
# or operator, the start value, the separator ("-" or "~"), the end value,
# and a negative lookahead rejecting a following digit or operator.

RE_RANGE = re.compile(
    r"""
    (?<![\d\+\-\×÷=])      # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围起始的负数或正数（整数或小数）
    [-~]                   # 匹配范围分隔符
    ((-?)((\d+)(\.\d+)?))  # 匹配范围结束的负数或正数（整数或小数）
    (?![\d\+\-\×÷=])       # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
    """,
    re.VERBOSE,
)


def replace_range(match) -> str:
    """
    Read a numeric range as "<start>到<end>", verbalizing both endpoints
    through RE_NUMBER / replace_number.

    Args:
        match (re.Match): group 1 = start value, group 6 = end value.
    Returns:
        str
    """
    start, end = (RE_NUMBER.sub(replace_number, bound) for bound in match.group(1, 6))
    return f"{start}到{end}"


# "~" read as "至" (to): two numbers that each carry a unit, joined by "~",
# e.g. "5cm~10cm" or "10%~20%".
RE_TO_RANGE = re.compile(
    r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)


def replace_to_range(match) -> str:
    """
    Return the matched text with every "~" replaced by "至"; numbers and
    units are left untouched.

    Args:
        match (re.Match)
    Returns:
        str
    """
    pieces = match.group(0).split("~")
    return "至".join(pieces)


# Version-like numbers with at least two dots, e.g. "1.2.3" or "10.0.0.1".
RE_VERSION_NUM = re.compile(r"((\d+)(\.\d+)(\.\d+)?(\.\d+)+)")


def replace_vrsion_num(match) -> str:
    """
    Read a dotted version string character by character: every "." becomes
    "点" and every other character is verbalized on its own.

    Args:
        match (re.Match): group 1 = the whole version string.
    Returns:
        str
    """
    return "".join("点" if ch == "." else num2str(ch) for ch in match.group(1))


def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    """
    Recursively break a digit string into Chinese numeral and unit symbols.

    Args:
        value_string: digit string; leading zeros are significant because
            they decide whether a "零" is emitted before a single digit.
        use_zero: emit "零" before a single digit that had leading zeros.
    Returns:
        List of symbols; empty when the string contains only zeros.
    """
    significant = value_string.lstrip("0")
    if not significant:
        return []

    if len(significant) == 1:
        digit = DIGITS[significant]
        had_leading_zero = len(value_string) > 1
        if use_zero and had_leading_zero:
            return [DIGITS["0"], digit]
        return [digit]

    # Largest unit whose power is below the number of significant digits.
    split_at = next(power for power in reversed(UNITS) if power < len(significant))
    high, low = value_string[:-split_at], value_string[-split_at:]
    return [*_get_value(high), UNITS[split_at], *_get_value(low)]


def verbalize_cardinal(value_string: str) -> str:
    """
    Read a digit string as a Chinese cardinal number.

    An empty string yields "", an all-zero string yields "零", and a leading
    "一十" is shortened to "十".
    """
    if not value_string:
        return ""

    # "000" and "0" are both read as "零".
    significant = value_string.lstrip("0")
    if not significant:
        return DIGITS["0"]

    symbols = _get_value(significant)
    # A reading that starts with "一十" is abbreviated to "十".
    if symbols[:2] == [DIGITS["1"], UNITS[1]]:
        symbols = symbols[1:]
    return "".join(symbols)


def verbalize_digit(value_string: str, alt_one=False) -> str:
    """
    Read a digit string one digit at a time.

    Args:
        value_string: string of ASCII digits.
        alt_one: when true, every "一" in the result is replaced by "幺".
    Returns:
        str
    """
    spoken = "".join(map(DIGITS.__getitem__, value_string))
    return spoken.replace("一", "幺") if alt_one else spoken


def num2str(value_string: str) -> str:
    """
    Verbalize an unsigned integer or decimal digit string in Chinese.

    The integer part is read as a cardinal number; the fractional part is
    read digit by digit after "点". A run of trailing zeros in the fraction
    collapses to a single zero.

    Args:
        value_string: digits with at most one ".", e.g. "12", "3.14", ".5".
    Returns:
        str
    Raises:
        ValueError: if value_string contains more than one ".".
    """
    pieces = value_string.split(".")
    if len(pieces) > 2:
        raise ValueError(f"The value string: '{value_string}' has more than one point in it.")
    integer = pieces[0]
    decimal = pieces[1] if len(pieces) == 2 else ""

    result = verbalize_cardinal(integer)

    # Collapse trailing zeros to exactly one: "200" -> "20", "0" -> "0".
    if decimal.endswith("0"):
        decimal = decimal.rstrip("0") + "0"

    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' and '3.200' are both verbalized as '三点二零'
        result = result if result else "零"
        result += "点" + verbalize_digit(decimal)
    return result

# CNY amount introduced by a yuan sign (half- or full-width), e.g. "¥1,234.50".
# Group 1 is the amount: optional minus, digits with optional commas, optional decimals.
RE_CNY_PREFIX = re.compile(r"(?:¥|￥)\s*(-?\d[\d,]*(?:\.\d+)?)")
# CNY amount followed by a currency marker (人民币, 元, CNY, cny or a yuan sign).
# Group 1 is the amount, in the same shape as above.
RE_CNY_SUFFIX = re.compile(r"(-?\d[\d,]*(?:\.\d+)?)(?:\s*(?:人民币|元|CNY|cny|¥|￥))")

def _strip_commas(s: str) -> str:
    """Return *s* with every "," removed (e.g. "1,234" -> "1234")."""
    return "".join(s.split(","))

def _split_amount(amount: str):
    """
    Split an amount string into its parts.

    Returns:
        (neg, integer, frac, had_frac): whether the amount starts with "-",
        the integer digits ("0" when empty), exactly two fractional digits
        (padded with zeros or truncated), and whether a "." was present.
    """
    neg = amount[:1] == "-"
    unsigned = amount[1:] if neg else amount
    digits = _strip_commas(unsigned) or "0"

    integer, point, frac = digits.partition(".")
    had_frac = point == "."
    return neg, integer or "0", (frac + "00")[:2], had_frac

# NOTE (translated from the original comment): RMB and USD handling both
# live on the cleaner side, to keep the amounts from being swallowed.
def replace_cny_amount(amount: str, num2str) -> str:
    """
    Read a CNY amount as "<yuan>元[<jiao>角][<fen>分]".

    A zero integer part is read as "零元". When a decimal point was present
    but both fractional digits are zero, "整" is appended. Negative amounts
    are prefixed with "负".

    Args:
        amount: amount text such as "1,234.50" or "-3".
        num2str: callable that verbalizes a digit string.
    Returns:
        str
    """
    neg, integer, frac, had_frac = _split_amount(amount)
    jiao, fen = frac

    yuan_cn = "零" if integer == "0" else num2str(integer)
    pieces = [yuan_cn + "元"]

    if jiao != "0":
        pieces.append(num2str(jiao) + "角")
    if fen != "0":
        pieces.append(num2str(fen) + "分")
    if had_frac and jiao == "0" and fen == "0":
        pieces.append("整")

    res = "".join(pieces)
    if neg and not res.startswith("负"):
        res = "负" + res
    return res

def replace_cny_prefix(m, num2str=num2str):
    """Verbalize the CNY amount captured in group 1 of match *m*."""
    amount = m.group(1)
    return replace_cny_amount(amount, num2str)

def replace_cny_suffix(m, num2str=num2str):
    """Verbalize the CNY amount captured in group 1 of match *m*."""
    amount = m.group(1)
    return replace_cny_amount(amount, num2str)

# The dollar sign could also mean Canadian dollars and so on, but it is
# simply treated as US dollars here.
# Amount introduced by a dollar sign (half- or full-width); group 1 is the amount.
RE_USD_SYMBOL = re.compile(r"(?:\$|＄)\s*(-?\d[\d,]*(?:\.\d+)?)")
# Amount followed by a currency marker (美元, USD, usd or a dollar sign); group 1 is the amount.
RE_USD_SUFFIX = re.compile(r"(-?\d[\d,]*(?:\.\d+)?)(?:\s*(?:美元|USD|usd|\$|＄))")

# NOTE: _strip_commas and _split_amount are also defined earlier in this
# module; the definitions below behave identically and are the ones bound to
# these names once the module has finished loading.
def _strip_commas(s: str) -> str:
    """Return *s* with every "," removed (e.g. "1,234" -> "1234")."""
    return s.replace(",", "")


def _split_amount(amount: str):
    """
    Split an amount string into its parts.

    Returns:
        (neg, integer, frac, had_frac): whether the amount starts with "-",
        the integer digits ("0" when empty), exactly two fractional digits,
        and whether a "." was present.
    """
    neg = amount.startswith("-")
    if neg:
        amount = amount[1:]
    amount = _strip_commas(amount) or "0"

    integer, point, frac = amount.partition(".")
    had_frac = bool(point)

    integer = integer or "0"
    # Keep exactly two fractional digits (pad or truncate) to read the cents.
    frac = (frac + "00")[:2]
    return neg, integer, frac, had_frac


def replace_usd_amount(amount: str, num2str) -> str:
    """
    Read a USD amount as "<dollars>美元[<cents>美分]".

    A cents-only amount ("0.50") is read without a dollar part. A zero
    amount always keeps "零美元", so "0" gives "零美元" and "0.00" gives
    "零美元整" (previously "0.00" produced a bare "整"). When a decimal point
    was present but the cents are zero, "整" is appended. Negative amounts
    are prefixed with "负".

    Args:
        amount: amount text such as "1,234.50" or "-3".
        num2str: callable that verbalizes a digit string.
    Returns:
        str
    """
    neg, integer, frac, had_frac = _split_amount(amount)
    dimes, pennies = frac
    has_cents = frac != "00"

    parts = []
    # Say the dollar part unless the amount is cents-only; a zero amount
    # still names the currency.
    if integer != "0" or not has_cents:
        dollars = num2str(integer) if integer != "0" else "零"
        parts.append(dollars + "美元")

    if has_cents:
        cents = ""
        if dimes != "0":
            cents += num2str(dimes) + "十"
        if pennies != "0":
            cents += num2str(pennies)
        # "一十" is spoken as plain "十" (10 cents -> "十美分").
        parts.append(cents.replace("一十", "十") + "美分")
    elif had_frac:
        parts.append("整")

    res = "".join(parts)
    if neg and not res.startswith("负"):
        res = "负" + res
    return res

def replace_usd_symbol(m, num2str=num2str):
    """Verbalize the USD amount captured in group 1 of match *m*."""
    amount = m.group(1)
    return replace_usd_amount(amount, num2str)

def replace_usd_suffix(m, num2str=num2str):
    """Verbalize the USD amount captured in group 1 of match *m*."""
    amount = m.group(1)
    return replace_usd_amount(amount, num2str)