fix: 多音字修改 (#2791)

* fix: 固定唑的读音

* add: 增加md5检测以更新缓存
This commit is contained in:
SapphireLab 2026-06-16 21:46:53 +08:00 committed by GitHub
parent 08d627c333
commit b2cff0cd0a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 25 additions and 3 deletions

View File

@ -1,5 +1,6 @@
# This code is modified from https://github.com/mozillazg/pypinyin-g2pW
import hashlib
import pickle
import os
@ -14,6 +15,16 @@ current_file_path = os.path.dirname(__file__)
# Pickled polyphonic dictionary; get_dict() loads it and cache_dict() is
# called with this path when the dictionary is rebuilt.
CACHE_PATH = os.path.join(current_file_path, "polyphonic.pickle")
# Dictionary source files. get_dict() hashes both to decide whether the
# pickle cache is stale (presumably read_dict() parses them -- not visible
# here, confirm against read_dict()).
PP_DICT_PATH = os.path.join(current_file_path, "polyphonic.rep")
PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep")
# Text file holding the two source-file MD5 hex digests, concatenated, as
# recorded at the time the cache was last rebuilt.
MD5_PATH = os.path.join(current_file_path, "polyphonic.md5")
def get_file_md5(file_path):
    """Return the MD5 hex digest of the file at *file_path*.

    The file is read in binary mode in 4 KiB chunks so large files are not
    loaded into memory at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The 32-character lowercase hex digest, or ``""`` when the file does
        not exist.
    """
    hasher = hashlib.md5()
    try:
        # Open directly instead of checking os.path.exists() first: the
        # check-then-open sequence can still fail if the file disappears in
        # between the two calls.
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hasher.update(chunk)
    except (FileNotFoundError, NotADirectoryError):
        # A missing file (or a missing/non-directory path component) is
        # reported as an empty digest rather than an error.
        return ""
    return hasher.hexdigest()
class G2PWPinyin(Pinyin):
@ -115,13 +126,22 @@ def cache_dict(polyphonic_dict, file_path):
def get_dict():
    """Return the polyphonic dictionary, rebuilding the pickle cache if stale.

    The MD5 digests of ``PP_DICT_PATH`` and ``PP_FIX_DICT_PATH`` are
    concatenated and compared with the value stored in ``MD5_PATH``. The
    cached pickle at ``CACHE_PATH`` is used only when it exists and the
    digests match; otherwise the dictionary is rebuilt with ``read_dict()``,
    re-cached with ``cache_dict()``, and the new digest is written to
    ``MD5_PATH``.

    Returns:
        The polyphonic dictionary, either unpickled from the cache or
        freshly produced by ``read_dict()``.
    """
    new_md5 = get_file_md5(PP_DICT_PATH) + get_file_md5(PP_FIX_DICT_PATH)

    old_md5 = ""
    if os.path.exists(MD5_PATH):
        with open(MD5_PATH, "r", encoding="utf-8") as f:
            old_md5 = f.read().strip()

    if os.path.exists(CACHE_PATH) and new_md5 == old_md5:
        try:
            # NOTE: pickle.load executes arbitrary code from the file; the
            # cache is expected to be a local file written by cache_dict().
            with open(CACHE_PATH, "rb") as pickle_file:
                return pickle.load(pickle_file)
        except (
            pickle.UnpicklingError,
            EOFError,
            AttributeError,
            ImportError,
            IndexError,
        ) as err:
            # A truncated or otherwise unreadable cache must not block
            # start-up permanently: fall through and rebuild it.
            print(f"Polyphonic cache unreadable ({err!r}); rebuilding")

    print(f"Rebuilding Polyphonic Dictionary: {old_md5} -> {new_md5}")
    polyphonic_dict = read_dict()
    cache_dict(polyphonic_dict, CACHE_PATH)
    # Record the digest only after the cache was written, so a failed
    # rebuild is retried on the next call.
    with open(MD5_PATH, "w", encoding="utf-8") as f:
        f.write(new_md5)
    return polyphonic_dict

View File

@ -45023,4 +45023,5 @@
鼎铛玉石: ['ding3', 'cheng1', 'yu4', 'shi2']
齿豁头童: ['chi3', 'huo1', 'tou2', 'tong2']
牦牛: ['mao2', 'niu2']
牦: ['mao2']
牦: ['mao2']
唑: ['zuo4']

View File

@ -0,0 +1 @@
13b2211c317c75794123ffdf7c2aea021c75b0c606ad61d7c8b05bb00b64fa21