Use PaddleSpeech-NSW-Normalization process Chinese

This commit is contained in:
KamioRinn 2024-02-03 16:46:09 +08:00
parent 8d91183c4c
commit bace09951c
11 changed files with 799 additions and 5 deletions

View File

@ -7,6 +7,7 @@ from pypinyin import lazy_pinyin, Style
from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
from text.zh_normalization.text_normlization import TextNormalizer
normalizer = lambda x: cn2an.transform(x, "an2cn")
@ -149,10 +150,13 @@ def _g2p(segments):
def text_normalize(text):
dest_text=normalizer(text)
text = replace_punctuation(dest_text)
return text
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
sentences = tx.normalize(text)
dest_text = ""
for sentence in sentences:
dest_text += replace_punctuation(sentence)
return dest_text
if __name__ == "__main__":

View File

@ -2,7 +2,7 @@ from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
language_module_map = {"zh": chinese, "ja": japanese, "en": english}
special = [
("%", "zh", "SP"),
# ("%", "zh", "SP"),
("", "zh", "SP2"),
("^", "zh", "SP3"),
# ('@', 'zh', "SP4")#不搞鬼畜了,和第二版保持一致吧

View File

@ -0,0 +1,16 @@
## Supported NSW (Non-Standard-Word) Normalization
|NSW type|raw|normalized|
|:--|:-|:-|
|serial number|电影中梁朝伟扮演的陈永仁的编号27149|电影中梁朝伟扮演的陈永仁的编号二七一四九|
|cardinal|这块黄金重达324.75克<br>我们班的最高总分为583分|这块黄金重达三百二十四点七五克<br>我们班的最高总分为五百八十三分|
|numeric range |12\~23<br>-1.5\~2|十二到二十三<br>负一点五到二|
|date|她出生于86年8月18日她弟弟出生于1995年3月1日|她出生于八六年八月十八日, 她弟弟出生于一九九五年三月一日|
|time|等会请在12:05请通知我|等会请在十二点零五分请通知我
|temperature|今天的最低气温达到-10°C|今天的最低气温达到零下十度
|fraction|现场有7/12的观众投出了赞成票|现场有十二分之七的观众投出了赞成票|
|percentage|明天有62的概率降雨|明天有百分之六十二的概率降雨|
|money|随便来几个价格12块534.5元20.1万|随便来几个价格十二块五,三十四点五元,二十点一万|
|telephone|这是固话0421-33441122<br>这是手机+86 18544139121|这是固话零四二一三三四四一一二二<br>这是手机八六一八五四四一三九一二一|
## References
[Pull requests #658 of DeepSpeech](https://github.com/PaddlePaddle/DeepSpeech/pull/658/files)

View File

@ -0,0 +1,14 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from text.zh_normalization.text_normlization import *

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,134 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import DIGITS
from .num import num2str
from .num import verbalize_cardinal
from .num import verbalize_digit
def _time_num2str(num_string: str) -> str:
"""A special case for verbalizing number in time."""
result = num2str(num_string.lstrip('0'))
if num_string.startswith('0'):
result = DIGITS['0'] + result
return result
# 时刻表达式
RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
# 时间范围如8:30-12:30
RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?'
r'(~|-)'
r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
def replace_time(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
is_range = len(match.groups()) > 5
hour = match.group(1)
minute = match.group(2)
second = match.group(4)
if is_range:
hour_2 = match.group(6)
minute_2 = match.group(7)
second_2 = match.group(9)
result = f"{num2str(hour)}"
if minute.lstrip('0'):
if int(minute) == 30:
result += ""
else:
result += f"{_time_num2str(minute)}"
if second and second.lstrip('0'):
result += f"{_time_num2str(second)}"
if is_range:
result += ""
result += f"{num2str(hour_2)}"
if minute_2.lstrip('0'):
if int(minute) == 30:
result += ""
else:
result += f"{_time_num2str(minute_2)}"
if second_2 and second_2.lstrip('0'):
result += f"{_time_num2str(second_2)}"
return result
RE_DATE = re.compile(r'(\d{4}|\d{2})年'
r'((0?[1-9]|1[0-2])月)?'
r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
def replace_date(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
year = match.group(1)
month = match.group(3)
day = match.group(5)
result = ""
if year:
result += f"{verbalize_digit(year)}"
if month:
result += f"{verbalize_cardinal(month)}"
if day:
result += f"{verbalize_cardinal(day)}{match.group(9)}"
return result
# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
RE_DATE2 = re.compile(
r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
def replace_date2(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
year = match.group(1)
month = match.group(3)
day = match.group(4)
result = ""
if year:
result += f"{verbalize_digit(year)}"
if month:
result += f"{verbalize_cardinal(month)}"
if day:
result += f"{verbalize_cardinal(day)}"
return result

View File

@ -0,0 +1,62 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import string
from pypinyin.constants import SUPPORT_UCS4
# 全角半角转换
# 英文字符全角 -> 半角映射表 (num: 52)
F2H_ASCII_LETTERS = {
ord(char) + 65248: ord(char)
for char in string.ascii_letters
}
# 英文字符半角 -> 全角映射表
H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
# 数字字符全角 -> 半角映射表 (num: 10)
F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
# 数字字符半角 -> 全角映射表
H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}
# 标点符号全角 -> 半角映射表 (num: 32)
F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
# 标点符号半角 -> 全角映射表
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
# 空格 (num: 1)
F2H_SPACE = {'\u3000': ' '}
H2F_SPACE = {' ': '\u3000'}
# 非"有拼音的汉字"的字符串可用于NSW提取
if SUPPORT_UCS4:
RE_NSW = re.compile(r'(?:[^'
r'\u3007' #
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
r'])+')
else:
RE_NSW = re.compile( # pragma: no cover
r'(?:[^'
r'\u3007' #
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
r'])+')

View File

@ -0,0 +1,238 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""
import re
from collections import OrderedDict
from typing import List
DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
UNITS = OrderedDict({
1: '',
2: '',
3: '',
4: '',
8: '亿',
})
COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
# 分数表达式
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
def replace_frac(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
nominator = match.group(2)
denominator = match.group(3)
sign: str = "" if sign else ""
nominator: str = num2str(nominator)
denominator: str = num2str(denominator)
result = f"{sign}{denominator}分之{nominator}"
return result
# 百分数表达式
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
def replace_percentage(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
percent = match.group(2)
sign: str = "" if sign else ""
percent: str = num2str(percent)
result = f"{sign}百分之{percent}"
return result
# 整数表达式
# 带负号的整数 -10
RE_INTEGER = re.compile(r'(-)' r'(\d+)')
def replace_negative_num(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
number = match.group(2)
sign: str = "" if sign else ""
number: str = num2str(number)
result = f"{sign}{number}"
return result
# 编号-无符号整形
# 00078
RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
def replace_default_num(match):
"""
Args:
match (re.Match)
Returns:
str
"""
number = match.group(0)
return verbalize_digit(number, alt_one=True)
# 数字表达式
# 纯小数
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
# 正整数 + 量词
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
def replace_positive_quantifier(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
number = match.group(1)
match_2 = match.group(2)
if match_2 == "+":
match_2 = ""
match_2: str = match_2 if match_2 else ""
quantifiers: str = match.group(3)
number: str = num2str(number)
result = f"{number}{match_2}{quantifiers}"
return result
def replace_number(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
number = match.group(2)
pure_decimal = match.group(5)
if pure_decimal:
result = num2str(pure_decimal)
else:
sign: str = "" if sign else ""
number: str = num2str(number)
result = f"{sign}{number}"
return result
# 范围表达式
# match.group(1) and match.group(8) are copy from RE_NUMBER
RE_RANGE = re.compile(
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
def replace_range(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
first, second = match.group(1), match.group(8)
first = RE_NUMBER.sub(replace_number, first)
second = RE_NUMBER.sub(replace_number, second)
result = f"{first}{second}"
return result
def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
stripped = value_string.lstrip('0')
if len(stripped) == 0:
return []
elif len(stripped) == 1:
if use_zero and len(stripped) < len(value_string):
return [DIGITS['0'], DIGITS[stripped]]
else:
return [DIGITS[stripped]]
else:
largest_unit = next(
power for power in reversed(UNITS.keys()) if power < len(stripped))
first_part = value_string[:-largest_unit]
second_part = value_string[-largest_unit:]
return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
second_part)
def verbalize_cardinal(value_string: str) -> str:
if not value_string:
return ''
# 000 -> '零' , 0 -> '零'
value_string = value_string.lstrip('0')
if len(value_string) == 0:
return DIGITS['0']
result_symbols = _get_value(value_string)
# verbalized number starting with '一十*' is abbreviated as `十*`
if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[
'1'] and result_symbols[1] == UNITS[1]:
result_symbols = result_symbols[1:]
return ''.join(result_symbols)
def verbalize_digit(value_string: str, alt_one=False) -> str:
result_symbols = [DIGITS[digit] for digit in value_string]
result = ''.join(result_symbols)
if alt_one:
result = result.replace("", "")
return result
def num2str(value_string: str) -> str:
integer_decimal = value_string.split('.')
if len(integer_decimal) == 1:
integer = integer_decimal[0]
decimal = ''
elif len(integer_decimal) == 2:
integer, decimal = integer_decimal
else:
raise ValueError(
f"The value string: '${value_string}' has more than one point in it."
)
result = verbalize_cardinal(integer)
decimal = decimal.rstrip('0')
if decimal:
# '.22' is verbalized as '零点二二'
# '3.20' is verbalized as '三点二
result = result if result else ""
result += '' + verbalize_digit(decimal)
return result

View File

@ -0,0 +1,63 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import verbalize_digit
# 规范化固话/手机号码
# 手机
# http://www.jihaoba.com/news/show/13680
# 移动139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
# 联通130、131、132、156、155、186、185、176
# 电信133、153、189、180、181、177
RE_MOBILE_PHONE = re.compile(
r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(
r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
# 全国统一的号码400开头
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
def phone2str(phone_string: str, mobile=True) -> str:
if mobile:
sp_parts = phone_string.strip('+').split()
result = ''.join(
[verbalize_digit(part, alt_one=True) for part in sp_parts])
return result
else:
sil_parts = phone_string.split('-')
result = ''.join(
[verbalize_digit(part, alt_one=True) for part in sil_parts])
return result
def replace_phone(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
return phone2str(match.group(0), mobile=False)
def replace_mobile(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
return phone2str(match.group(0))

View File

@ -0,0 +1,63 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from .num import num2str
# 温度表达式,温度会影响负号的读法
# -3°C 零下三度
RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
measure_dict = {
"cm2": "平方厘米",
"cm²": "平方厘米",
"cm3": "立方厘米",
"cm³": "立方厘米",
"cm": "厘米",
"db": "分贝",
"ds": "毫秒",
"kg": "千克",
"km": "千米",
"m2": "平方米",
"": "平方米",
"": "立方米",
"m3": "立方米",
"ml": "毫升",
"m": "",
"mm": "毫米",
"s": ""
}
def replace_temperature(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
sign = match.group(1)
temperature = match.group(2)
unit = match.group(3)
sign: str = "零下" if sign else ""
temperature: str = num2str(temperature)
unit: str = "摄氏度" if unit == "摄氏度" else ""
result = f"{sign}{temperature}{unit}"
return result
def replace_measure(sentence) -> str:
for q_notation in measure_dict:
if q_notation in sentence:
sentence = sentence.replace(q_notation, measure_dict[q_notation])
return sentence

View File

@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import List
from .char_convert import tranditional_to_simplified
from .chronology import RE_DATE
from .chronology import RE_DATE2
from .chronology import RE_TIME
from .chronology import RE_TIME_RANGE
from .chronology import replace_date
from .chronology import replace_date2
from .chronology import replace_time
from .constants import F2H_ASCII_LETTERS
from .constants import F2H_DIGITS
from .constants import F2H_SPACE
from .num import RE_DECIMAL_NUM
from .num import RE_DEFAULT_NUM
from .num import RE_FRAC
from .num import RE_INTEGER
from .num import RE_NUMBER
from .num import RE_PERCENTAGE
from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
from .num import replace_number
from .num import replace_percentage
from .num import replace_positive_quantifier
from .num import replace_range
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
from .phonecode import RE_TELEPHONE
from .phonecode import replace_mobile
from .phonecode import replace_phone
from .quantifier import RE_TEMPERATURE
from .quantifier import replace_measure
from .quantifier import replace_temperature
class TextNormalizer():
def __init__(self):
self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')
def _split(self, text: str, lang="zh") -> List[str]:
"""Split long text into sentences with sentence-splitting punctuations.
Args:
text (str): The input text.
Returns:
List[str]: Sentences.
"""
# Only for pure Chinese here
if lang == "zh":
text = text.replace(" ", "")
# 过滤掉特殊字符
text = re.sub(r'[——《》【】<=>{}()#&@“”^_|…\\]', '', text)
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
return sentences
def _post_replace(self, sentence: str) -> str:
sentence = sentence.replace('/', '')
sentence = sentence.replace('~', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('α', '阿尔法')
sentence = sentence.replace('β', '贝塔')
sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
sentence = sentence.replace('ε', '艾普西龙')
sentence = sentence.replace('ζ', '捷塔')
sentence = sentence.replace('η', '依塔')
sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
sentence = sentence.replace('ι', '艾欧塔')
sentence = sentence.replace('κ', '喀帕')
sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
sentence = sentence.replace('μ', '')
sentence = sentence.replace('ν', '')
sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
sentence = sentence.replace('ο', '欧米克伦')
sentence = sentence.replace('π', '').replace('Π', '')
sentence = sentence.replace('ρ', '')
sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
'σ', '西格玛')
sentence = sentence.replace('τ', '')
sentence = sentence.replace('υ', '宇普西龙')
sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
sentence = sentence.replace('χ', '')
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
# re filter special characters, have one more character "-" than line 68
sentence = re.sub(r'[-——《》【】<=>{}()#&@“”^_|…\\]', '', sentence)
return sentence
def normalize_sentence(self, sentence: str) -> str:
# basic character conversions
sentence = tranditional_to_simplified(sentence)
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
F2H_DIGITS).translate(F2H_SPACE)
# number related NSW verbalization
sentence = RE_DATE.sub(replace_date, sentence)
sentence = RE_DATE2.sub(replace_date2, sentence)
# range first
sentence = RE_TIME_RANGE.sub(replace_time, sentence)
sentence = RE_TIME.sub(replace_time, sentence)
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
sentence = replace_measure(sentence)
sentence = RE_FRAC.sub(replace_frac, sentence)
sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
sentence = RE_TELEPHONE.sub(replace_phone, sentence)
sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence)
sentence = RE_RANGE.sub(replace_range, sentence)
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
sentence)
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
sentence = RE_NUMBER.sub(replace_number, sentence)
sentence = self._post_replace(sentence)
return sentence
def normalize(self, text: str) -> List[str]:
sentences = self._split(text)
sentences = [self.normalize_sentence(sent) for sent in sentences]
return sentences