# by https://github.com/Cosmo-klara

from __future__ import print_function

import re
import inflect
import unicodedata

# Suffix measurement-unit replacement table: unit -> [singular, plural]
measurement_map = {
    "m": ["meter", "meters"],
    "km": ["kilometer", "kilometers"],
    "km/h": ["kilometer per hour", "kilometers per hour"],
    "ft": ["foot", "feet"],
    "L": ["liter", "liters"],
    "tbsp": ["tablespoon", "tablespoons"],
    "tsp": ["teaspoon", "teaspoons"],
    "h": ["hour", "hours"],
    "min": ["minute", "minutes"],
    "s": ["second", "seconds"],
    "°C": ["degree celsius", "degrees celsius"],
    "°F": ["degree fahrenheit", "degrees fahrenheit"],
}

_inflect = inflect.engine()

# Numeric list ordinals, e.g. "1. "
_ordinal_number_re = re.compile(r"\b([0-9]+)\. ")
# Numbers with thousands separators, e.g. "12,000" (\d would arguably be nicer than [0-9])
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
# Times of day
_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b")
# Numbers followed by a unit suffix (longer units first so "km/h" is not split as "km")
_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(km/h|km|m|ft|L|tbsp|tsp|h|min|s|°C|°F))\b")
# £ before or after the amount (both variants were written, but one side mysteriously fails)
_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)")
_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£")
# $ before or after the amount
_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_dollars_re_end = re.compile(r"([0-9\.\,]*[0-9]+)\$")
# Decimals
_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)")
# Fractions of the form "3/4"
_fraction_re = re.compile(r"([0-9]+/[0-9]+)")
# Ordinals such as "1st", "2nd"
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
# Plain numbers
_number_re = re.compile(r"[0-9]+")


def _convert_ordinal(m):
    """
    Normalize numeric list ordinals such as "1." "2." "3.".

    Examples:
        input: "1. "
        output: "1st, "

    The later _expand_ordinal pass then turns "1st" into "first" and so on.
    """
    ordinal = _inflect.ordinal(m.group(1))
    return ordinal + ", "


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_time(m):
    """
    Convert a 24-hour time to a spoken 12-hour form.

    Examples:
        input: "13:00 / 4:00 / 13:30"
        output: "one o'clock p.m. / four o'clock a.m. / one thirty p.m."
    """
    hours, minutes = map(int, m.group(1, 2))
    period = "a.m." if hours < 12 else "p.m."
    if hours > 12:
        hours -= 12
    hour_word = _inflect.number_to_words(hours)
    minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ""
    if minutes == 0:
        return f"{hour_word} o'clock {period}"
    else:
        return f"{hour_word} {minute_word} {period}"
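
# Quick sanity check of the rules above (illustrative only, not part of the
# original module); the expected outputs follow the docstring examples:
#
#   >>> _time_re.sub(_expand_time, "the call is at 13:30")
#   'the call is at one thirty p.m.'
#   >>> _comma_number_re.sub(_remove_commas, "12,000 people")
#   '12000 people'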

def _expand_measurement(m):
    """
    Expand common measurement-unit suffixes. Currently supported:
    m, km, km/h, ft, L, tbsp, tsp, h, min, s, °C, °F.
    To extend this, update _measurement_re and measurement_map.
    """
    sign = m.group(3)
    ptr = 1
    # No convenient way to grab just the number without touching the regex;
    # values like 1.2 read as plural anyway, so simply strip the "." before converting.
    num = int(m.group(1).replace(sign, "").replace(".", ""))
    decimal_part = m.group(2)
    # Closes the loophole above: only a bare "1" with no decimal part is singular,
    # so e.g. "0.1" stays plural even though its digits collapse to 1.
    if decimal_part is None and num == 1:
        ptr = 0
    return m.group(1).replace(sign, " " + measurement_map[sign][ptr])


def _expand_pounds(m):
    """
    No particularly authoritative convention was found, so this mirrors the
    dollar handling; the two could be merged into one helper (see the sketch
    after _expand_dollars below).
    """
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " pounds"  # Unexpected format
    pounds = int(parts[0]) if parts[0] else 0
    pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
    if pounds and pence:
        pound_unit = "pound" if pounds == 1 else "pounds"
        penny_unit = "penny" if pence == 1 else "pence"
        return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit)
    elif pounds:
        pound_unit = "pound" if pounds == 1 else "pounds"
        return "%s %s" % (pounds, pound_unit)
    elif pence:
        penny_unit = "penny" if pence == 1 else "pence"
        return "%s %s" % (pence, penny_unit)
    else:
        return "zero pounds"


def _expand_dollars(m):
    """
    change: cents are capped below 100, so the fractional part is zero-padded
    to two digits before conversion.

    Example:
        input: "32.3$ / $6.24"
        output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents"
    """
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"
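
# The docstrings above note that the pound and dollar handlers are identical
# apart from the unit names. A minimal sketch of that merge (illustrative only;
# `_expand_currency` is an assumed name and is not wired into the pipeline below):
def _expand_currency(m, major=("dollar", "dollars"), minor=("cent", "cents")):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " " + major[1]  # Unexpected format
    whole = int(parts[0]) if parts[0] else 0
    frac = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
    pieces = []
    if whole:
        pieces.append("%s %s" % (whole, major[0] if whole == 1 else major[1]))
    if frac:
        pieces.append("%s %s" % (frac, minor[0] if frac == 1 else minor[1]))
    return " and ".join(pieces) if pieces else "zero " + major[1]
# It would be used e.g. as
#   functools.partial(_expand_currency, major=("pound", "pounds"), minor=("penny", "pence"))
# wherever _expand_pounds is registered today.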

# Decimals
def _expand_decimal_number(m):
    """
    Example:
        input: "13.234"
        output: "thirteen point two three four"
    """
    match = m.group(1)
    parts = match.split(".")
    words = []
    # Walk over every character of the fractional part; the digits themselves
    # are spelled out later by the plain-number pass.
    for char in parts[1]:
        if char == ".":
            words.append("point")
        else:
            words.append(char)
    return parts[0] + " point " + " ".join(words)


# Fractions
def _expend_fraction(m):
    """
    Rule 1: the numerator is read as a cardinal, the denominator as an ordinal.
    Rule 2: if the numerator is greater than 1, the denominator ordinal is pluralized.
    Rule 3: a denominator of 2 is read as "half", and when the numerator is
            greater than 1 it is pluralized to "halves".

    Examples:

    | Written | Said |
    |:---:|:---:|
    | 1/3 | one third |
    | 3/4 | three fourths |
    | 5/6 | five sixths |
    | 1/2 | one half |
    | 3/2 | three halves |
    """
    match = m.group(0)
    numerator, denominator = map(int, match.split("/"))
    numerator_part = _inflect.number_to_words(numerator)
    if denominator == 2:
        if numerator == 1:
            denominator_part = "half"
        else:
            denominator_part = "halves"
    elif denominator == 1:
        return f"{numerator_part}"
    else:
        denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
        if numerator > 1:
            denominator_part += "s"
    return f"{numerator_part} {denominator_part}"


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")
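
# The branch above gives year-like numbers a spoken reading. Expected outputs,
# assuming inflect's usual pair grouping (illustrative, not part of the original module):
#
#   >>> _number_re.sub(_expand_number, "1999")
#   'nineteen ninety-nine'
#   >>> _number_re.sub(_expand_number, "2005")
#   'two thousand five'
#   >>> _number_re.sub(_expand_number, "1200")
#   'twelve hundred'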

class CharMapper:
    """
    Character-mapping tracker that records how character positions move during
    text normalization.

    Core idea: maintain a mapping from the original text to the current text;
    orig_to_curr[i] is the position in the current text of the i-th character
    of the original text.
    """

    def __init__(self, text):
        self.original_text = text
        self.text = text
        # Initially every original character maps to itself
        self.orig_to_curr = list(range(len(text)))

    def apply_sub(self, pattern, replacement_func):
        """
        Apply a regex substitution and update the mapping.
        Key point: the new mapping has to be derived through the old one.
        Capture groups get special handling (e.g. the capital-letter split).
        """
        new_text = ""
        # curr_to_new[i] is the position in the new text of the i-th character of the current text
        curr_to_new = [-1] * len(self.text)
        pos = 0
        for match in pattern.finditer(self.text):
            # Unchanged text before the match
            for i in range(pos, match.start()):
                curr_to_new[i] = len(new_text)
                new_text += self.text[i]
            # The matched part
            replacement = replacement_func(match)
            replacement_start_pos = len(new_text)
            # Special case: the replacement may contain characters of the original
            # text (e.g. "A" -> " A"); try to find the correspondence.
            match_text = match.group(0)
            if len(match.groups()) > 0 and match.group(1) in replacement:
                # There is a capture group; attempt an exact mapping,
                # e.g. the capital-letter split where "A" reappears in " A"
                captured = match.group(1)
                replacement_idx = replacement.find(captured)
                if replacement_idx >= 0:
                    # The captured characters appear in the replacement text
                    for i in range(match.start(), match.end()):
                        char = self.text[i]
                        if char in captured:
                            # This character belongs to the capture group; map it
                            # to its position inside the replacement text
                            char_idx_in_replacement = replacement.find(char, replacement_idx)
                            if char_idx_in_replacement >= 0:
                                curr_to_new[i] = replacement_start_pos + char_idx_in_replacement
                            else:
                                curr_to_new[i] = replacement_start_pos
                        else:
                            curr_to_new[i] = replacement_start_pos
                else:
                    # The captured characters are not in the replacement text;
                    # map everything to the start position
                    for i in range(match.start(), match.end()):
                        curr_to_new[i] = replacement_start_pos
            else:
                # No capture group, or it does not reappear in the replacement;
                # map every matched character to the start of the replacement text
                for i in range(match.start(), match.end()):
                    curr_to_new[i] = replacement_start_pos
            new_text += replacement
            pos = match.end()
        # Remaining text after the last match
        for i in range(pos, len(self.text)):
            curr_to_new[i] = len(new_text)
            new_text += self.text[i]
        # Update the original-to-current mapping: orig -> old_curr -> new_curr
        new_orig_to_curr = []
        for orig_idx in range(len(self.original_text)):
            old_curr_idx = self.orig_to_curr[orig_idx]
            if old_curr_idx >= 0 and old_curr_idx < len(curr_to_new):
                new_orig_to_curr.append(curr_to_new[old_curr_idx])
            else:
                new_orig_to_curr.append(-1)
        self.text = new_text
        self.orig_to_curr = new_orig_to_curr

    def apply_char_filter(self, keep_pattern):
        """
        Apply a character filter (keep only characters matching the pattern)
        and update the mapping.
        keep_pattern: a regex string such as "[ A-Za-z'.,?!-]"
        """
        new_text = ""
        curr_to_new = []
        for i, char in enumerate(self.text):
            if re.match(keep_pattern, char):
                curr_to_new.append(len(new_text))
                new_text += char
            else:
                # The character is dropped
                if new_text:
                    curr_to_new.append(len(new_text) - 1)
                else:
                    curr_to_new.append(-1)
        # Update the original mapping
        new_orig_to_curr = []
        for orig_idx in range(len(self.original_text)):
            old_curr_idx = self.orig_to_curr[orig_idx]
            if old_curr_idx >= 0 and old_curr_idx < len(curr_to_new):
                new_orig_to_curr.append(curr_to_new[old_curr_idx])
            else:
                new_orig_to_curr.append(-1)
        self.text = new_text
        self.orig_to_curr = new_orig_to_curr

    def get_norm_to_orig(self):
        """
        Build the reverse mapping from normalized text back to the original text.
        """
        if not self.text:
            return []
        norm_to_orig = [-1] * len(self.text)
        for orig_idx, norm_idx in enumerate(self.orig_to_curr):
            if 0 <= norm_idx < len(self.text):
                # If several original characters map to the same normalized
                # position, keep the first one
                if norm_to_orig[norm_idx] == -1:
                    norm_to_orig[norm_idx] = orig_idx
        return norm_to_orig


def normalize_with_map(text):
    """
    Normalization with character mapping.

    Returns:
        normalized_text: the normalized text
        char_mappings: a dict containing:
            - "orig_to_norm": list[int], position in the normalized text of each original character
            - "norm_to_orig": list[int], position in the original text of each normalized character
    """
    mapper = CharMapper(text)

    # Apply all transformations in the same order as normalize()
    mapper.apply_sub(_ordinal_number_re, _convert_ordinal)
    # Insert a space before a capital letter glued to a preceding lowercase
    # letter ("xA" -> "x A"); apply_sub's capture-group handling keeps "A" tracked
    mapper.apply_sub(re.compile(r"(?<=[a-z])([A-Z])"), lambda m: " " + m.group(1))
    mapper.apply_sub(_comma_number_re, _remove_commas)
    mapper.apply_sub(_pounds_re_start, _expand_pounds)
    mapper.apply_sub(_pounds_re_end, _expand_pounds)
    mapper.apply_sub(_dollars_re_start, _expand_dollars)
    mapper.apply_sub(_dollars_re_end, _expand_dollars)
    mapper.apply_sub(_time_re, _expand_time)
    mapper.apply_sub(_measurement_re, _expand_measurement)
    mapper.apply_sub(_decimal_number_re, _expand_decimal_number)
    mapper.apply_sub(_fraction_re, _expend_fraction)
    mapper.apply_sub(_ordinal_re, _expand_ordinal)
    mapper.apply_sub(_number_re, _expand_number)

    # Unicode NFD decomposition so accented letters survive the character
    # filter below as their base letter; track how positions shift.
    new_text = unicodedata.normalize("NFD", mapper.text)
    curr_to_new = []
    offset = 0
    for char in mapper.text:
        curr_to_new.append(offset)
        offset += len(unicodedata.normalize("NFD", char))
    if len(new_text) >= len(mapper.text):  # NFD expanded some characters
        new_orig_to_curr = []
        for orig_idx in range(len(mapper.original_text)):
            old_curr_idx = mapper.orig_to_curr[orig_idx]
            if old_curr_idx >= 0 and old_curr_idx < len(curr_to_new):
                new_orig_to_curr.append(curr_to_new[old_curr_idx])
            else:
                new_orig_to_curr.append(-1)
        mapper.orig_to_curr = new_orig_to_curr
        mapper.text = new_text

    # Continue with the remaining substitutions
    mapper.apply_sub(re.compile("%"), lambda m: " percent")
    # Strip disallowed characters via apply_char_filter
    mapper.apply_char_filter(r"[ A-Za-z'.,?!\-]")
    mapper.apply_sub(re.compile(r"(?i)i\.e\."), lambda m: "that is")
    mapper.apply_sub(re.compile(r"(?i)e\.g\."), lambda m: "for example")
    mapper.apply_sub(re.compile(r"(?