add LLM translated CMUDICT-KATAKANA dictionary (#1660)

This commit is contained in:
AkitoLiu 2024-09-29 05:29:43 -04:00 committed by GitHub
parent 5efb960898
commit 38cd881578
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 134207 additions and 4 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -82,8 +82,9 @@ def post_replace_ph(ph):
ph = rep_map[ph]
# if ph in symbols:
# return ph
# if ph not in symbols:
# ph = "UNK"
if ph not in symbols:
ph = "UNK"
# UNK may be useful as a pause token as it was trained in the model
return ph
@@ -103,6 +104,8 @@ def symbols_to_japanese(text):
def preprocess_jap(text, with_prosody=False):
"""Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
text = symbols_to_japanese(text)
# Lowercase any English words; this should have no effect on Japanese text.
text = text.lower()
sentences = re.split(_japanese_marks, text)
marks = re.findall(_japanese_marks, text)
text = []
@@ -219,5 +222,6 @@ def g2p(norm_text, with_prosody=True):
if __name__ == "__main__":
phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
from text.symbols2 import symbols
phones = g2p("Hello.こんにちは今日もNiCe天気ですねtokyotowerに行きましょう")
print(phones)