diff --git a/GPT_SoVITS/text/japanese.py b/GPT_SoVITS/text/japanese.py
index 1cef2db..68112b9 100644
--- a/GPT_SoVITS/text/japanese.py
+++ b/GPT_SoVITS/text/japanese.py
@@ -4,8 +4,8 @@ import sys
 
 import pyopenjtalk
 
-from text import symbols
 
+from text import symbols
 # Regular expression matching Japanese without punctuation marks:
 _japanese_characters = re.compile(
     r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
@@ -71,7 +71,7 @@ def symbols_to_japanese(text):
     return text
 
 
-def preprocess_jap(text):
+def preprocess_jap(text, with_prosody=False):
     """Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
     text = symbols_to_japanese(text)
     sentences = re.split(_japanese_marks, text)
@@ -79,10 +79,15 @@ def preprocess_jap(text):
     text = []
     for i, sentence in enumerate(sentences):
         if re.match(_japanese_characters, sentence):
-            p = pyopenjtalk.g2p(sentence)
-            text += p.split(" ")
+            if with_prosody:
+                text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
+            else:
+                p = pyopenjtalk.g2p(sentence)
+                text += p.split(" ")
 
         if i < len(marks):
+            if marks[i] == " ":# skip a bare-space mark to prevent an unexpected UNK
+                continue
             text += [marks[i].replace(" ", "")]
     return text
 
@@ -91,16 +96,96 @@ def text_normalize(text):
     # todo: jap text normalize
     return text
 
+# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> list[str]:
+    """Extract phoneme + prosody symbol sequence from input full-context labels.
 
-def g2p(norm_text):
-    phones = preprocess_jap(norm_text)
+    The algorithm is based on `Prosodic features control by symbols as input of
+    sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
+
+    Args:
+        text (str): Input text.
+        drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
+
+    Returns:
+        List[str]: List of phoneme + prosody symbols.
+
+    Examples:
+        >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+        >>> pyopenjtalk_g2p_prosody("こんにちは。")
+        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+
+    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+
+    """
+    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+    N = len(labels)
+
+    phones = []
+    for n in range(N):
+        lab_curr = labels[n]
+
+        # current phoneme
+        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+        # treat unvoiced vowels as normal vowels
+        if drop_unvoiced_vowels and p3 in "AEIOU":
+            p3 = p3.lower()
+
+        # deal with sil at the beginning and the end of text
+        if p3 == "sil":
+            assert n == 0 or n == N - 1
+            if n == 0:
+                phones.append("^")
+            elif n == N - 1:
+                # check question form or not
+                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                if e3 == 0:
+                    phones.append("$")
+                elif e3 == 1:
+                    phones.append("?")
+            continue
+        elif p3 == "pau":
+            phones.append("_")
+            continue
+        else:
+            phones.append(p3)
+
+        # accent type and position info (forward or backward)
+        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+
+        # number of mora in accent phrase
+        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+
+        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+        # accent phrase border
+        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+            phones.append("#")
+        # pitch falling
+        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+            phones.append("]")
+        # pitch rising
+        elif a2 == 1 and a2_next == 2:
+            phones.append("[")
+
+    return phones
+
+# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
+def _numeric_feature_by_regex(regex, s):
+    match = re.search(regex, s)
+    if match is None:
+        return -50
+    return int(match.group(1))
+
+def g2p(norm_text, with_prosody=False):
+    phones = preprocess_jap(norm_text, with_prosody)
     phones = [post_replace_ph(i) for i in phones]
     # todo: implement tones and word2ph
     return phones
 
 
 if __name__ == "__main__":
-    for line in open("../../../Downloads/transcript_utf8.txt").readlines():
-        text = line.split(":")[1]
-        phones = g2p(text)
-        print(phones)
+    phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね!")
+    print(phones)
\ No newline at end of file
diff --git a/GPT_SoVITS/text/symbols.py b/GPT_SoVITS/text/symbols.py
index 97e3938..7049949 100644
--- a/GPT_SoVITS/text/symbols.py
+++ b/GPT_SoVITS/text/symbols.py
@@ -315,6 +315,10 @@ ja_symbols = [
     "w",
     "y",
     "z",
+    # "[", # rising pitch contour
+    # "]", # falling pitch contour
+    # "$", # end marker
+    # "^", # start marker
 ]
 
 arpa = {