From fa7b072066893966a5309d0863a5aae1eee54dac Mon Sep 17 00:00:00 2001
From: Akito-UzukiP <12011903@mail.sustech.edu.cn>
Date: Sat, 10 Aug 2024 20:09:26 +0800
Subject: [PATCH] add modifiable japanese dict

---
 GPT_SoVITS/text/ja_userdic/userdict.csv |  1 +
 GPT_SoVITS/text/japanese.py             | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 GPT_SoVITS/text/ja_userdic/userdict.csv

diff --git a/GPT_SoVITS/text/ja_userdic/userdict.csv b/GPT_SoVITS/text/ja_userdic/userdict.csv
new file mode 100644
index 00000000..b23e0d63
--- /dev/null
+++ b/GPT_SoVITS/text/ja_userdic/userdict.csv
@@ -0,0 +1 @@
+主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,*
\ No newline at end of file
diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py
index 4c10720e..e9fe7c1a 100644
--- a/GPT_SoVITS/text/japanese.py
+++ b/GPT_SoVITS/text/japanese.py
@@ -1,8 +1,38 @@
 # modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
 import re
-import sys
 
 import pyopenjtalk
+import os
+import hashlib
+current_file_path = os.path.dirname(__file__)
+def get_hash(fp: str) -> str:
+    """Return the hexadecimal MD5 digest of the file at ``fp``, read in 4096-byte chunks."""
+    hash_md5 = hashlib.md5()
+    with open(fp, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
+USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
+USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
+# If there is no generated user dictionary, generate one; if there is, compare
+# the CSV's md5 with the stored one and regenerate when they differ. A missing
+# md5 file counts as a mismatch instead of raising FileNotFoundError on import.
+if os.path.exists(USERDIC_CSV_PATH):
+    _csv_hash = get_hash(USERDIC_CSV_PATH)
+    _stored_hash = None
+    if os.path.exists(USERDIC_HASH_PATH):
+        with open(USERDIC_HASH_PATH, "r", encoding='utf-8') as f:
+            _stored_hash = f.read()
+    if not os.path.exists(USERDIC_BIN_PATH) or _csv_hash != _stored_hash:
+        pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
+        with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
+            f.write(_csv_hash)
+
+if os.path.exists(USERDIC_BIN_PATH):
+    pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
+
 
 from text.symbols import punctuation
 # Regular expression matching Japanese without punctuation marks: