add modifiable japanese dict (#1443)

This commit is contained in:
AkitoLiu 2024-08-12 10:43:36 +08:00 committed by GitHub
parent 62831dfcc7
commit d552c971bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 24 additions and 1 deletions

View File

@ -0,0 +1 @@
主殿,*,*,-32767,名詞,固有名詞,一般,*,*,*,アルジドノ,アルジドノ,アルジドノ,3/5,*
1 主殿 * * -32767 名詞 固有名詞 一般 * * * アルジドノ アルジドノ アルジドノ 3/5 *

View File

@ -1,8 +1,30 @@
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
import re
import sys
import pyopenjtalk
import os
import hashlib
current_file_path = os.path.dirname(__file__)
def get_hash(fp: str) -> str:
hash_md5 = hashlib.md5()
with open(fp, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
USERDIC_CSV_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.csv")
USERDIC_BIN_PATH = os.path.join(current_file_path, "ja_userdic", "user.dict")
USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
# 如果没有用户词典就生成一个如果有就检查md5如果不一样就重新生成
if os.path.exists(USERDIC_CSV_PATH):
if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r",encoding='utf-8').read():
pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
f.write(get_hash(USERDIC_CSV_PATH))
if os.path.exists(USERDIC_BIN_PATH):
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
from text.symbols import punctuation
# Regular expression matching Japanese without punctuation marks: