调整 minus 识别:将 "-" 的替换提前到序数词转换之后,且仅当 "-" 未夹在两个数字之间时才转换为 " minus ",防止误识别转化

This commit is contained in:
Cosmo Clara 2024-12-11 02:50:20 +08:00 committed by GitHub
parent f6d0165843
commit 401c275c5a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -237,6 +237,7 @@ def normalize(text):
""" """
text = re.sub(_ordinal_number_re, _convert_ordinal, text) text = re.sub(_ordinal_number_re, _convert_ordinal, text)
text = re.sub(r'(?<!\d)-|-(?!\d)', ' minus ', text)
text = re.sub(_comma_number_re, _remove_commas, text) text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_time_re, _expand_time, text) text = re.sub(_time_re, _expand_time, text)
text = re.sub(_measurement_re, _expand_measurement, text) text = re.sub(_measurement_re, _expand_measurement, text)
@ -252,7 +253,6 @@ def normalize(text):
text = ''.join(char for char in unicodedata.normalize('NFD', text) text = ''.join(char for char in unicodedata.normalize('NFD', text)
if unicodedata.category(char) != 'Mn') # Strip accents if unicodedata.category(char) != 'Mn') # Strip accents
text = re.sub("-", "minus ", text)
text = re.sub("%", " percent", text) text = re.sub("%", " percent", text)
text = re.sub("[^ A-Za-z'.,?!\-]", "", text) text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
text = re.sub(r"(?i)i\.e\.", "that is", text) text = re.sub(r"(?i)i\.e\.", "that is", text)
@ -269,4 +269,4 @@ if __name__ == '__main__':
print(normalize("1st, 22nd")) print(normalize("1st, 22nd"))
print(normalize("a test 20h, 1.2s, 1L, 0.1km")) print(normalize("a test 20h, 1.2s, 1L, 0.1km"))
print(normalize("a test of time 4:00, 13:00, 13:30")) print(normalize("a test of time 4:00, 13:00, 13:30"))
print(normalize("a test of temperature 4°F, 23°C, -19°C")) print(normalize("a test of temperature 4°F, 23°C, -19°C"))