Patch summary (free-form text placed before the first "diff --git" header):
  * g2pw.py: adds MD5_PATH and get_file_md5(); get_dict() now loads the pickle
    cache only when the cache file exists and the concatenated MD5 digests of
    polyphonic.rep and polyphonic-fix.rep equal the text stored in
    polyphonic.md5. Otherwise it prints a message, rebuilds the cache via
    read_dict()/cache_dict(), and writes the new digest string to
    polyphonic.md5.
  * polyphonic-fix.rep: terminates the former last line and appends an entry
    for 唑 ('zuo4').
  * polyphonic.md5: new file holding the stored digest string.
  * polyphonic.pickle: binary cache changed (contents not carried by this patch).
NOTE(review): the line breaks below were restored so that each hunk matches its
header's line counts. Python indentation (4 spaces) was reconstructed from the
block structure; in particular the MD5_PATH write is assumed to sit inside the
rebuild branch. Confirm against the target tree before applying.

diff --git a/GPT_SoVITS/text/g2pw/g2pw.py b/GPT_SoVITS/text/g2pw/g2pw.py
index 08525e91..fbfcd096 100644
--- a/GPT_SoVITS/text/g2pw/g2pw.py
+++ b/GPT_SoVITS/text/g2pw/g2pw.py
@@ -1,5 +1,6 @@
 # This code is modified from https://github.com/mozillazg/pypinyin-g2pW
 
+import hashlib
 import pickle
 import os
 
@@ -14,6 +15,16 @@ current_file_path = os.path.dirname(__file__)
 CACHE_PATH = os.path.join(current_file_path, "polyphonic.pickle")
 PP_DICT_PATH = os.path.join(current_file_path, "polyphonic.rep")
 PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep")
+MD5_PATH = os.path.join(current_file_path, "polyphonic.md5")
+
+def get_file_md5(file_path):
+    if not os.path.exists(file_path):
+        return ""
+    hasher = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest()
 
 
 class G2PWPinyin(Pinyin):
@@ -115,13 +126,22 @@ def cache_dict(polyphonic_dict, file_path):
 
 
 def get_dict():
-    if os.path.exists(CACHE_PATH):
+    new_md5 = get_file_md5(PP_DICT_PATH) + get_file_md5(PP_FIX_DICT_PATH)
+    old_md5 = ""
+    if os.path.exists(MD5_PATH):
+        with open(MD5_PATH, "r", encoding="utf-8") as f:
+            old_md5 = f.read().strip()
+    need_rebuild = (not os.path.exists(CACHE_PATH)) or (new_md5 != old_md5)
+
+    if not need_rebuild:
         with open(CACHE_PATH, "rb") as pickle_file:
             polyphonic_dict = pickle.load(pickle_file)
     else:
+        print("Rebuilding Polyphonic Dictionary: " + f"{old_md5} -> {new_md5}")
         polyphonic_dict = read_dict()
         cache_dict(polyphonic_dict, CACHE_PATH)
-
+        with open(MD5_PATH, "w", encoding="utf-8") as f:
+            f.write(new_md5)
     return polyphonic_dict
 
 
diff --git a/GPT_SoVITS/text/g2pw/polyphonic-fix.rep b/GPT_SoVITS/text/g2pw/polyphonic-fix.rep
index 1800b886..51ebc145 100644
--- a/GPT_SoVITS/text/g2pw/polyphonic-fix.rep
+++ b/GPT_SoVITS/text/g2pw/polyphonic-fix.rep
@@ -45023,4 +45023,5 @@
 鼎铛玉石: ['ding3', 'cheng1', 'yu4', 'shi2']
 齿豁头童: ['chi3', 'huo1', 'tou2', 'tong2']
 牦牛: ['mao2', 'niu2']
-牦: ['mao2']
\ No newline at end of file
+牦: ['mao2']
+唑: ['zuo4']
\ No newline at end of file
diff --git a/GPT_SoVITS/text/g2pw/polyphonic.md5 b/GPT_SoVITS/text/g2pw/polyphonic.md5
new file mode 100644
index 00000000..24182cbb
--- /dev/null
+++ b/GPT_SoVITS/text/g2pw/polyphonic.md5
@@ -0,0 +1 @@
+13b2211c317c75794123ffdf7c2aea021c75b0c606ad61d7c8b05bb00b64fa21
\ No newline at end of file
diff --git a/GPT_SoVITS/text/g2pw/polyphonic.pickle b/GPT_SoVITS/text/g2pw/polyphonic.pickle
index fbe46ce1..b749b068 100644
Binary files a/GPT_SoVITS/text/g2pw/polyphonic.pickle and b/GPT_SoVITS/text/g2pw/polyphonic.pickle differ