Replace Cantonese g2p with "ToJyutping" (#1697)

This commit is contained in:
Naozumi 2025-01-18 20:51:48 +08:00 committed by GitHub
parent a1fe2267af
commit b7a904a671
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 8 deletions

View File

@ -3,8 +3,8 @@
import sys import sys
import re import re
import cn2an import cn2an
import ToJyutping
from pyjyutping import jyutping
from text.symbols import punctuation from text.symbols import punctuation
from text.zh_normalization.text_normlization import TextNormalizer from text.zh_normalization.text_normlization import TextNormalizer
@ -173,12 +173,24 @@ def jyuping_to_initials_finals_tones(jyuping_syllables):
def get_jyutping(text): def get_jyutping(text):
jp = jyutping.convert(text) jyutping_array = []
# print(1111111,jp) punct_pattern = re.compile(r"^[{}]+$".format(re.escape("".join(punctuation))))
for symbol in punctuation:
jp = jp.replace(symbol, " " + symbol + " ") syllables = ToJyutping.get_jyutping_list(text)
jp_array = jp.split()
return jp_array for word, syllable in syllables:
if punct_pattern.match(word):
puncts = re.split(r"([{}])".format(re.escape("".join(punctuation))), word)
for punct in puncts:
if len(punct) > 0:
jyutping_array.append(punct)
else:
# match multiple jyutping, e.g. liu4 ge3, or a single jyutping, e.g. liu4
if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", syllable):
raise ValueError(f"Failed to convert {word} to jyutping: {syllable}")
jyutping_array.append(syllable)
return jyutping_array
def get_bert_feature(text, word2ph): def get_bert_feature(text, word2ph):

View File

@ -27,7 +27,7 @@ LangSegment>=0.2.0
Faster_Whisper Faster_Whisper
wordsegment wordsegment
rotary_embedding_torch rotary_embedding_torch
pyjyutping ToJyutping
g2pk2 g2pk2
ko_pron ko_pron
opencc; sys_platform != 'linux' opencc; sys_platform != 'linux'