diff --git a/GPT_SoVITS/text/__init__.py b/GPT_SoVITS/text/__init__.py
index a48b9a4..01afcf8 100644
--- a/GPT_SoVITS/text/__init__.py
+++ b/GPT_SoVITS/text/__init__.py
@@ -1,5 +1,8 @@
-from text.symbols import *
-
+import os
+
+if os.environ.get("version", "v1") == "v1":
+    from text.symbols import symbols
+else:
+    from text.symbols2 import symbols
 
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
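Note that the `version` switch above is resolved once, at import time, so the environment variable has to be set before the `text` package is first imported; flipping `os.environ["version"]` afterwards has no effect. A minimal, self-contained sketch of the selection logic (the two toy symbol tables are hypothetical stand-ins, not the real inventories):

import os

# Hypothetical stand-ins for text/symbols.py and text/symbols2.py.
V1_SYMBOLS = ["_", "AA", "a1"]
V2_SYMBOLS = ["_", "AA", "a1", "ㄱ", "Ya1"]  # v2 appends Korean and Y-prefixed Cantonese

# Mirrors the import-time switch: anything other than "v1" selects symbols2.
symbols = V1_SYMBOLS if os.environ.get("version", "v1") == "v1" else V2_SYMBOLS

# Same id table the package builds right after the conditional import.
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
print(_symbol_to_id["a1"])  # 2 under either table, since the shared prefix comes first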
diff --git a/GPT_SoVITS/text/cantonese.py b/GPT_SoVITS/text/cantonese.py
new file mode 100644
index 0000000..915be38
--- /dev/null
+++ b/GPT_SoVITS/text/cantonese.py
@@ -0,0 +1,204 @@
+import sys
+
+# sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert-kv_cache-new_text-emo")
+sys.path.append("/data/docker/liujing04/gpt-vits/mq-vits-s1bert_no_bert-kv_cache-new_text-emo2v2")
+import re
+import cn2an
+
+from pyjyutping import jyutping
+from text.symbols import punctuation
+
+normalizer = lambda x: cn2an.transform(x, "an2cn")
+
+# Deliberately mixes finals in with the initials: the startswith() scan in
+# jyuping_to_initials_finals_tones relies on this list to split vowel-initial
+# syllables as well.
+INITIALS = [
+    "aa", "aai", "aak", "aap", "aat", "aau", "ai", "au", "ap", "at", "ak", "a",
+    "p", "b", "e", "ts", "t", "dz", "d", "kw", "k", "gw", "g", "f", "h", "l",
+    "m", "ng", "n", "s", "y", "w", "c", "z", "j",
+    "ong", "on", "ou", "oi", "ok", "o", "uk", "ung",
+]
+INITIALS += ["sp", "spl", "spn", "sil"]
+
+
+rep_map = {
+    ":": ",",
+    ";": ",",
+    ",": ",",
+    "。": ".",
+    "!": "!",
+    "?": "?",
+    "\n": ".",
+    "·": ",",
+    "、": ",",
+    "...": "…",
+    "$": ".",
+    "“": "'",
+    "”": "'",
+    '"': "'",
+    "‘": "'",
+    "’": "'",
+    "(": "'",
+    ")": "'",
+    "(": "'",
+    ")": "'",
+    "《": "'",
+    "》": "'",
+    "【": "'",
+    "】": "'",
+    "[": "'",
+    "]": "'",
+    "—": "-",
+    "~": "-",
+    "~": "-",
+    "「": "'",
+    "」": "'",
+}
+
+
+def replace_punctuation(text):
+    # text = text.replace("嗯", "恩").replace("呣", "母")
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+    replaced_text = re.sub(
+        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+    )
+
+    return replaced_text
+
+
+def text_normalize(text):
+    text = normalizer(text)
+    text = replace_punctuation(text)
+    return text
+
+
+punctuation_set = set(punctuation)
+
+
+def jyuping_to_initials_finals_tones(jyuping_syllables):
+    initials_finals = []
+    tones = []
+    word2ph = []
+
+    for syllable in jyuping_syllables:
+        if syllable in punctuation:
+            initials_finals.append(syllable)
+            tones.append(0)
+            word2ph.append(1)  # Add 1 for punctuation
+        elif syllable == "_":
+            initials_finals.append(syllable)
+            tones.append(0)
+            word2ph.append(1)  # Add 1 for underscore
+        else:
+            try:
+                tone = int(syllable[-1])
+                syllable_without_tone = syllable[:-1]
+            except ValueError:
+                tone = 0
+                syllable_without_tone = syllable
+
+            for initial in INITIALS:
+                if syllable_without_tone.startswith(initial):
+                    if syllable_without_tone.startswith("nga"):
+                        initials_finals.extend(
+                            [
+                                syllable_without_tone[:2],
+                                syllable_without_tone[2:] or syllable_without_tone[-1],
+                            ]
+                        )
+                        # tones.extend([tone, tone])
+                        tones.extend([-1, tone])
+                        word2ph.append(2)
+                    else:
+                        final = syllable_without_tone[len(initial):] or initial[-1]
+                        initials_finals.extend([initial, final])
+                        # tones.extend([tone, tone])
+                        tones.extend([-1, tone])
+                        word2ph.append(2)
+                    break
+    assert len(initials_finals) == len(tones)
+
+    ### hacked into consonant + tone-carrying vowel
+    phones = []
+    for a, b in zip(initials_finals, tones):
+        if b not in [-1, 0]:  ### only finals carry the tone digit
+            todo = "%s%s" % (a, b)
+        else:
+            todo = a
+        ### prefix Y so Cantonese phones never collide with Mandarin ones; punctuation is left alone
+        if todo not in punctuation_set:
+            todo = "Y%s" % todo
+        phones.append(todo)
+
+    # return initials_finals, tones, word2ph
+    return phones, word2ph
+
+
+def get_jyutping(text):
+    jp = jyutping.convert(text)
+    # print(1111111, jp)
+    for symbol in punctuation:
+        jp = jp.replace(symbol, " " + symbol + " ")
+    jp_array = jp.split()
+    return jp_array
+
+
+def get_bert_feature(text, word2ph):
+    from text import chinese_bert
+
+    return chinese_bert.get_bert_feature(text, word2ph)
+
+
+def g2p(text):
+    # word2ph = []
+    jyuping = get_jyutping(text)
+    # print(jyuping)
+    # phones, tones, word2ph = jyuping_to_initials_finals_tones(jyuping)
+    phones, word2ph = jyuping_to_initials_finals_tones(jyuping)
+    # phones = ["_"] + phones + ["_"]
+    # tones = [0] + tones + [0]
+    # word2ph = [1] + word2ph + [1]
+    return phones, word2ph
+
+
+if __name__ == "__main__":
+    # text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
+    text = "佢個鋤頭太短啦。"
+    text = text_normalize(text)
+    # phones, tones, word2ph = g2p(text)
+    phones, word2ph = g2p(text)
+    # print(phones, tones, word2ph)
+    print(phones, word2ph)
\ No newline at end of file
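The phone-assembly step at the bottom of `jyuping_to_initials_finals_tones` is what keeps the Cantonese inventory disjoint from Mandarin. A self-contained sketch of just that rule, using a hand-split syllable instead of a real `pyjyutping` call (which the module itself depends on):

punctuation = ["!", "?", "…", ",", ".", "-"]
punctuation_set = set(punctuation)

def assemble_phones(initials_finals, tones):
    """Rebuild the 'consonant + toned vowel' phones with the Y prefix."""
    phones = []
    for a, b in zip(initials_finals, tones):
        # Initials carry tone -1 and punctuation tone 0; only finals keep the digit.
        todo = f"{a}{b}" if b not in (-1, 0) else a
        # Y prefix keeps Cantonese phones disjoint from Mandarin; punctuation is untouched.
        if todo not in punctuation_set:
            todo = f"Y{todo}"
        phones.append(todo)
    return phones

print(assemble_phones(["ng", "o", ","], [-1, 5, 0]))  # ['Yng', 'Yo5', ',']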
diff --git a/GPT_SoVITS/text/chinese2.py b/GPT_SoVITS/text/chinese2.py
new file mode 100644
index 0000000..a12e360
--- /dev/null
+++ b/GPT_SoVITS/text/chinese2.py
@@ -0,0 +1,187 @@
+import os
+import pdb
+import re
+
+import cn2an
+from pypinyin import lazy_pinyin, Style
+
+from text.symbols import punctuation
+from text.tone_sandhi import ToneSandhi
+from text.zh_normalization.text_normlization import TextNormalizer
+
+normalizer = lambda x: cn2an.transform(x, "an2cn")
+
+current_file_path = os.path.dirname(__file__)
+pinyin_to_symbol_map = {
+    line.split("\t")[0]: line.strip().split("\t")[1]
+    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
+}
+
+import jieba_fast.posseg as psg
+
+# is_g2pw_str = os.environ.get("is_g2pw", "True")  ## enabled by default
+# is_g2pw = False  # True if is_g2pw_str.lower() == 'true' else False
+is_g2pw = True  # True if is_g2pw_str.lower() == 'true' else False
+if is_g2pw:
+    print("Using g2pw for pinyin inference")
+    from text.g2pw import G2PWPinyin
+
+    parent_directory = os.path.dirname(current_file_path)
+    g2pw = G2PWPinyin(
+        model_dir="GPT_SoVITS/text/G2PWModel",
+        model_source="GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
+        v_to_u=False,
+        neutral_tone_with_five=True,
+    )
+
+rep_map = {
+    ":": ",",
+    ";": ",",
+    ",": ",",
+    "。": ".",
+    "!": "!",
+    "?": "?",
+    "\n": ".",
+    "·": ",",
+    "、": ",",
+    "...": "…",
+    "$": ".",
+    "/": ",",
+    "—": "-",
+}
+
+tone_modifier = ToneSandhi()
+
+
+def replace_punctuation(text):
+    text = text.replace("嗯", "恩").replace("呣", "母")
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+    replaced_text = re.sub(
+        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+    )
+
+    return replaced_text
+
+
+def g2p(text):
+    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
+    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
+    phones, word2ph = _g2p(sentences)
+    return phones, word2ph
+
+
+def _get_initials_finals(word):
+    initials = []
+    finals = []
+
+    if not is_g2pw:
+        orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
+        orig_finals = lazy_pinyin(
+            word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
+        )
+    else:
+        orig_initials = g2pw.lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
+        orig_finals = g2pw.lazy_pinyin(
+            word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
+        )
+
+    for c, v in zip(orig_initials, orig_finals):
+        initials.append(c)
+        finals.append(v)
+    return initials, finals
+
+
+def _g2p(segments):
+    phones_list = []
+    word2ph = []
+    for seg in segments:
+        pinyins = []
+        # Replace all English words in the sentence
+        seg = re.sub("[a-zA-Z]+", "", seg)
+        seg_cut = psg.lcut(seg)
+        initials = []
+        finals = []
+        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
+        for word, pos in seg_cut:
+            if pos == "eng":
+                continue
+            sub_initials, sub_finals = _get_initials_finals(word)
+            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
+            initials.append(sub_initials)
+            finals.append(sub_finals)
+            # assert len(sub_initials) == len(sub_finals) == len(word)
+        initials = sum(initials, [])
+        finals = sum(finals, [])
+        #
+        for c, v in zip(initials, finals):
+            raw_pinyin = c + v
+            # NOTE: post process for pypinyin outputs
+            # we discriminate i, ii and iii
+            if c == v:
+                assert c in punctuation
+                phone = [c]
+                word2ph.append(1)
+            else:
+                v_without_tone = v[:-1]
+                tone = v[-1]
+
+                pinyin = c + v_without_tone
+                assert tone in "12345"
+
+                if c:
+                    # syllable with an initial: undo pypinyin's contracted finals
+                    v_rep_map = {
+                        "uei": "ui",
+                        "iou": "iu",
+                        "uen": "un",
+                    }
+                    if v_without_tone in v_rep_map.keys():
+                        pinyin = c + v_rep_map[v_without_tone]
+                else:
+                    # bare syllable: rewrite to its standalone spelling
+                    pinyin_rep_map = {
+                        "ing": "ying",
+                        "i": "yi",
+                        "in": "yin",
+                        "u": "wu",
+                    }
+                    if pinyin in pinyin_rep_map.keys():
+                        pinyin = pinyin_rep_map[pinyin]
+                    else:
+                        single_rep_map = {
+                            "v": "yu",
+                            "e": "e",
+                            "i": "y",
+                            "u": "w",
+                        }
+                        if pinyin[0] in single_rep_map.keys():
+                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
+
+                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
+                new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
+                new_v = new_v + tone
+                phone = [new_c, new_v]
+                word2ph.append(len(phone))
+
+            phones_list += phone
+    return phones_list, word2ph
+
+
+def text_normalize(text):
+    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
+    tx = TextNormalizer()
+    sentences = tx.normalize(text)
+    dest_text = ""
+    for sentence in sentences:
+        dest_text += replace_punctuation(sentence)
+    return dest_text
+
+
+if __name__ == "__main__":
+    text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
+    text = "呣呣呣~就是…大人的鼹鼠党吧?"
+    text = "你好"
+    text = text_normalize(text)
+    print(g2p(text))
+
+
+# # Example usage
+# text = "这是一个示例文本:,你好!这是一个测试..."
+# print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试
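The pinyin rewrite chain inside `_g2p` (contracted finals like `uei` restored after an initial, bare finals rewritten to their standalone spellings) is the piece most likely to trip up a reviewer. A sketch of that chain in isolation, with no g2pw, jieba, or `opencpop-strict.txt` required; `normalize_pinyin` is a hypothetical helper, not part of the diff:

def normalize_pinyin(c, v):
    """Apply the same rewrite chain as _g2p for one (initial, toned final) pair."""
    v_without_tone, tone = v[:-1], v[-1]
    assert tone in "12345"
    pinyin = c + v_without_tone
    if c:
        # With an initial: undo pypinyin's contracted finals.
        v_rep_map = {"uei": "ui", "iou": "iu", "uen": "un"}
        pinyin = c + v_rep_map.get(v_without_tone, v_without_tone)
    else:
        # Bare finals: rewrite to the standalone-syllable spelling.
        pinyin_rep_map = {"ing": "ying", "i": "yi", "in": "yin", "u": "wu"}
        single_rep_map = {"v": "yu", "e": "e", "i": "y", "u": "w"}
        if pinyin in pinyin_rep_map:
            pinyin = pinyin_rep_map[pinyin]
        elif pinyin[0] in single_rep_map:
            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
    return pinyin, tone

print(normalize_pinyin("g", "uei4"))  # ('gui', '4')
print(normalize_pinyin("", "ing1"))   # ('ying', '1')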
diff --git a/GPT_SoVITS/text/cleaner.py b/GPT_SoVITS/text/cleaner.py
index c42264a..b1e5007 100644
--- a/GPT_SoVITS/text/cleaner.py
+++ b/GPT_SoVITS/text/cleaner.py
@@ -1,6 +1,13 @@
-from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
+from text import japanese, cleaned_text_to_sequence, english, korean, cantonese
+import os
+if os.environ.get("version", "v1") == "v1":
+    from text import chinese
+    from text.symbols import symbols
+else:
+    from text import chinese2 as chinese
+    from text.symbols2 import symbols
 
-language_module_map = {"zh": chinese, "ja": japanese, "en": english}
+language_module_map = {"zh": chinese, "ja": japanese, "en": english, "ko": korean, "yue": cantonese}
 special = [
     # ("%", "zh", "SP"),
     ("¥", "zh", "SP2"),
@@ -17,8 +24,11 @@ def clean_text(text, language):
     if special_s in text and language == special_l:
         return clean_special(text, language, special_s, target_symbol)
     language_module = language_module_map[language]
-    norm_text = language_module.text_normalize(text)
-    if language == "zh":
+    if hasattr(language_module, "text_normalize"):
+        norm_text = language_module.text_normalize(text)
+    else:
+        norm_text = text
+    if language == "zh" or language == "yue":
         phones, word2ph = language_module.g2p(norm_text)
         assert len(phones) == sum(word2ph)
         assert len(norm_text) == len(word2ph)
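The new `hasattr` guard is what lets korean.py (and any future frontend) omit `text_normalize` entirely. A toy reproduction of that guard, with `SimpleNamespace` stand-ins for the language modules rather than the real imports:

from types import SimpleNamespace

def normalize(language_module, text):
    # Same guard as the patched clean_text: modules without text_normalize
    # pass their input through untouched.
    if hasattr(language_module, "text_normalize"):
        return language_module.text_normalize(text)
    return text

zh_like = SimpleNamespace(text_normalize=str.strip)
ko_like = SimpleNamespace()  # no text_normalize attribute

print(normalize(zh_like, " 你好 "))  # '你好'
print(normalize(ko_like, " 안녕 "))  # ' 안녕 ' (unchanged)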
diff --git a/GPT_SoVITS/text/korean.py b/GPT_SoVITS/text/korean.py
new file mode 100644
index 0000000..6a4067e
--- /dev/null
+++ b/GPT_SoVITS/text/korean.py
@@ -0,0 +1,262 @@
+import re
+from jamo import h2j, j2hcj
+import ko_pron
+from g2pk2 import G2p
+from text import symbols
+
+# This is a list of Korean classifiers preceded by pure Korean numerals.
+_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
+
+# List of (hangul, hangul divided) pairs:
+_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
+    # ('ㄳ', 'ㄱㅅ'),  # g2pk2, A Syllable-ending Rule
+    # ('ㄵ', 'ㄴㅈ'),
+    # ('ㄶ', 'ㄴㅎ'),
+    # ('ㄺ', 'ㄹㄱ'),
+    # ('ㄻ', 'ㄹㅁ'),
+    # ('ㄼ', 'ㄹㅂ'),
+    # ('ㄽ', 'ㄹㅅ'),
+    # ('ㄾ', 'ㄹㅌ'),
+    # ('ㄿ', 'ㄹㅍ'),
+    # ('ㅀ', 'ㄹㅎ'),
+    # ('ㅄ', 'ㅂㅅ'),
+    ('ㅘ', 'ㅗㅏ'),
+    ('ㅙ', 'ㅗㅐ'),
+    ('ㅚ', 'ㅗㅣ'),
+    ('ㅝ', 'ㅜㅓ'),
+    ('ㅞ', 'ㅜㅔ'),
+    ('ㅟ', 'ㅜㅣ'),
+    ('ㅢ', 'ㅡㅣ'),
+    ('ㅑ', 'ㅣㅏ'),
+    ('ㅒ', 'ㅣㅐ'),
+    ('ㅕ', 'ㅣㅓ'),
+    ('ㅖ', 'ㅣㅔ'),
+    ('ㅛ', 'ㅣㅗ'),
+    ('ㅠ', 'ㅣㅜ')
+]]
+
+# List of (Latin alphabet, hangul) pairs:
+_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('a', '에이'), ('b', '비'), ('c', '시'), ('d', '디'), ('e', '이'),
+    ('f', '에프'), ('g', '지'), ('h', '에이치'), ('i', '아이'), ('j', '제이'),
+    ('k', '케이'), ('l', '엘'), ('m', '엠'), ('n', '엔'), ('o', '오'),
+    ('p', '피'), ('q', '큐'), ('r', '아르'), ('s', '에스'), ('t', '티'),
+    ('u', '유'), ('v', '브이'), ('w', '더블유'), ('x', '엑스'), ('y', '와이'),
+    ('z', '제트')
+]]
+
+# List of (ipa, lazy ipa) pairs:
+_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('t͡ɕ', 'ʧ'), ('d͡ʑ', 'ʥ'), ('ɲ', 'n^'), ('ɕ', 'ʃ'), ('ʷ', 'w'),
+    ('ɭ', 'l`'), ('ʎ', 'ɾ'), ('ɣ', 'ŋ'), ('ɰ', 'ɯ'), ('ʝ', 'j'),
+    ('ʌ', 'ə'), ('ɡ', 'g'), ('\u031a', '#'), ('\u0348', '='),
+    ('\u031e', ''), ('\u0320', ''), ('\u0339', '')
+]]
+
+
+def fix_g2pk2_error(text):
+    new_text = ""
+    i = 0
+    while i < len(text) - 4:
+        if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == 'ㄹ':
+            new_text += text[i:i+3] + ' ' + 'ㄴ'
+            i += 5
+        else:
+            new_text += text[i]
+            i += 1
+
+    new_text += text[i:]
+    return new_text
+
+
+def latin_to_hangul(text):
+    for regex, replacement in _latin_to_hangul:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def divide_hangul(text):
+    text = j2hcj(h2j(text))
+    for regex, replacement in _hangul_divided:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def hangul_number(num, sino=True):
+    '''Reference https://github.com/Kyubyong/g2pK'''
+    num = re.sub(',', '', num)
+
+    if num == '0':
+        return '영'
+    if not sino and num == '20':
+        return '스무'
+
+    digits = '123456789'
+    names = '일이삼사오육칠팔구'
+    digit2name = {d: n for d, n in zip(digits, names)}
+
+    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
+    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
+    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
+    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
+
+    spelledout = []
+    for i, digit in enumerate(num):
+        i = len(num) - i - 1
+        if sino:
+            if i == 0:
+                name = digit2name.get(digit, '')
+            elif i == 1:
+                name = digit2name.get(digit, '') + '십'
+                name = name.replace('일십', '십')
+        else:
+            if i == 0:
+                name = digit2mod.get(digit, '')
+            elif i == 1:
+                name = digit2dec.get(digit, '')
+        if digit == '0':
+            if i % 4 == 0:
+                last_three = spelledout[-min(3, len(spelledout)):]
+                if ''.join(last_three) == '':
+                    spelledout.append('')
+                    continue
+            else:
+                spelledout.append('')
+                continue
+        if i == 2:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 3:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 4:
+            name = digit2name.get(digit, '') + '만'
+            name = name.replace('일만', '만')
+        elif i == 5:
+            name = digit2name.get(digit, '') + '십'
+            name = name.replace('일십', '십')
+        elif i == 6:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 7:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 8:
+            name = digit2name.get(digit, '') + '억'
+        elif i == 9:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 10:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 11:
+            name = digit2name.get(digit, '') + '천'
+        elif i == 12:
+            name = digit2name.get(digit, '') + '조'
+        elif i == 13:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 14:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 15:
+            name = digit2name.get(digit, '') + '천'
+        spelledout.append(name)
+    return ''.join(elem for elem in spelledout)
+
+
+def number_to_hangul(text):
+    '''Reference https://github.com/Kyubyong/g2pK'''
+    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
+    for token in tokens:
+        num, classifier = token
+        if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
+            spelledout = hangul_number(num, sino=False)
+        else:
+            spelledout = hangul_number(num, sino=True)
+        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
+    # digit by digit for remaining digits
+    digits = '0123456789'
+    names = '영일이삼사오육칠팔구'
+    for d, n in zip(digits, names):
+        text = text.replace(d, n)
+    return text
+
+
+def korean_to_lazy_ipa(text):
+    text = latin_to_hangul(text)
+    text = number_to_hangul(text)
+    text = re.sub('[\uac00-\ud7af]+', lambda x: ko_pron.romanise(x.group(0), 'ipa').split('] ~ [')[0], text)
+    for regex, replacement in _ipa_to_lazy_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+_g2p = G2p()
+
+
+def korean_to_ipa(text):
+    text = latin_to_hangul(text)
+    text = number_to_hangul(text)
+    text = _g2p(text)
+    text = fix_g2pk2_error(text)
+    text = korean_to_lazy_ipa(text)
+    return text.replace('ʧ', 'tʃ').replace('ʥ', 'dʑ')
+
+
+def post_replace_ph(ph):
+    rep_map = {
+        ":": ",",
+        ";": ",",
+        ",": ",",
+        "。": ".",
+        "!": "!",
+        "?": "?",
+        "\n": ".",
+        "·": ",",
+        "、": ",",
+        "...": "…",
+        " ": "空",
+    }
+    if ph in rep_map.keys():
+        ph = rep_map[ph]
+    if ph in symbols:
+        return ph
+    if ph not in symbols:
+        ph = "停"
+    return ph
+
+
+def g2p(text):
+    text = latin_to_hangul(text)
+    text = _g2p(text)
+    text = divide_hangul(text)
+    text = fix_g2pk2_error(text)
+    # append a final period when the text ends in a bare jamo
+    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
+    # text = "".join([post_replace_ph(i) for i in text])
+    text = [post_replace_ph(i) for i in text]
+    return text
\ No newline at end of file
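Worth noting: `post_replace_ph` silently degrades anything outside the symbol table to the filler 停 and maps spaces to 空, so a mismatched `version` environment variable shows up as 停-heavy output rather than an error. An abridged sketch against a stand-in symbol set (the real one comes from `text/__init__.py`, and the full `rep_map` is in the diff above):

SYMBOLS = set("ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ") | {"空", "停", ",", "."}

def post_replace_ph(ph):
    rep_map = {",": ",", "。": ".", " ": "空"}  # abridged from the diff
    ph = rep_map.get(ph, ph)
    # Anything still outside the symbol table degrades to the filler 停.
    return ph if ph in SYMBOLS else "停"

print([post_replace_ph(p) for p in ["ㅏ", " ", "%"]])  # ['ㅏ', '空', '停']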
diff --git a/GPT_SoVITS/text/symbols2.py b/GPT_SoVITS/text/symbols2.py
new file mode 100644
index 0000000..a442350
--- /dev/null
+++ b/GPT_SoVITS/text/symbols2.py
@@ -0,0 +1,419 @@
+import os
+
+# punctuation = ['!', '?', '…', ",", ".", "@"]  # @ is the SP pause
+punctuation = ["!", "?", "…", ",", "."]  # @ is the SP pause
+punctuation.append("-")
+pu_symbols = punctuation + ["SP", "SP2", "SP3", "UNK"]
+# pu_symbols = punctuation + ["SP", 'SP2', 'SP3', 'SP4', "UNK"]
+pad = "_"
+
+# Mandarin initials (plus the AA/EE/OO placeholders).
+c = [
+    "AA", "EE", "OO", "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m",
+    "n", "p", "q", "r", "s", "sh", "t", "w", "x", "y", "z", "zh",
+]
+# Mandarin finals, one entry per tone 1-5.
+v = [
+    "E1", "En1", "a1", "ai1", "an1", "ang1", "ao1", "e1", "ei1", "en1",
+    "eng1", "er1", "i1", "i01", "ia1", "ian1", "iang1", "iao1", "ie1", "in1",
+    "ing1", "iong1", "ir1", "iu1", "o1", "ong1", "ou1", "u1", "ua1", "uai1",
+    "uan1", "uang1", "ui1", "un1", "uo1", "v1", "van1", "ve1", "vn1",
+    "E2", "En2", "a2", "ai2", "an2", "ang2", "ao2", "e2", "ei2", "en2",
+    "eng2", "er2", "i2", "i02", "ia2", "ian2", "iang2", "iao2", "ie2", "in2",
+    "ing2", "iong2", "ir2", "iu2", "o2", "ong2", "ou2", "u2", "ua2", "uai2",
+    "uan2", "uang2", "ui2", "un2", "uo2", "v2", "van2", "ve2", "vn2",
+    "E3", "En3", "a3", "ai3", "an3", "ang3", "ao3", "e3", "ei3", "en3",
+    "eng3", "er3", "i3", "i03", "ia3", "ian3", "iang3", "iao3", "ie3", "in3",
+    "ing3", "iong3", "ir3", "iu3", "o3", "ong3", "ou3", "u3", "ua3", "uai3",
+    "uan3", "uang3", "ui3", "un3", "uo3", "v3", "van3", "ve3", "vn3",
+    "E4", "En4", "a4", "ai4", "an4", "ang4", "ao4", "e4", "ei4", "en4",
+    "eng4", "er4", "i4", "i04", "ia4", "ian4", "iang4", "iao4", "ie4", "in4",
+    "ing4", "iong4", "ir4", "iu4", "o4", "ong4", "ou4", "u4", "ua4", "uai4",
+    "uan4", "uang4", "ui4", "un4", "uo4", "v4", "van4", "ve4", "vn4",
+    "E5", "En5", "a5", "ai5", "an5", "ang5", "ao5", "e5", "ei5", "en5",
+    "eng5", "er5", "i5", "i05", "ia5", "ian5", "iang5", "iao5", "ie5", "in5",
+    "ing5", "iong5", "ir5", "iu5", "o5", "ong5", "ou5", "u5", "ua5", "uai5",
+    "uan5", "uang5", "ui5", "un5", "uo5", "v5", "van5", "ve5", "vn5",
+]
+
+v_without_tone = [
+    "E", "En", "a", "ai", "an", "ang", "ao", "e", "ei", "en", "eng", "er",
+    "i", "i0", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "ir",
+    "iu", "o", "ong", "ou", "u", "ua", "uai", "uan", "uang", "ui", "un",
+    "uo", "v", "van", "ve", "vn",
+]
+
+# japanese
+ja_symbols = [
+    "I", "N", "U", "a", "b", "by", "ch", "cl", "d", "dy", "e", "f", "g",
+    "gy", "h", "hy", "i", "j", "k", "ky", "m", "my", "n", "ny", "o", "p",
+    "py", "r", "ry", "s", "sh", "t", "ts", "u", "v", "w", "y", "z",
+    ### the two below are reserved to be added later
+    # "[",  # rising intonation
+    # "]",  # falling intonation
+    # "$",  # end marker
+    # "^",  # start marker
+]
+
+arpa = {
+    "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B", "G",
+    "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2", "DH", "IY0",
+    "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1", "ER1", "EH2", "OY0",
+    "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2", "AA2", "ER", "AW0", "UW0",
+    "R", "OW1", "EH1", "ZH", "AE0", "IH2", "IH", "Y", "JH", "P", "AY1",
+    "EY0", "OY2", "TH", "HH", "D", "ER0", "CH", "AO1", "AE1", "AO2", "OY1",
+    "AY2", "IH1", "OW0", "L", "SH",
+}
+
+ko_symbols = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停'
+# ko_symbols = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
+
+yue_symbols = {
+    'Yeot3', 'Yip1', 'Yyu3', 'Yeng4', 'Yut5', 'Yaan5', 'Ym5', 'Yaan6', 'Yang1', 'Yun4',
+    'Yon2', 'Yui5', 'Yun2', 'Yat3', 'Ye', 'Yeot1', 'Yoeng5', 'Yoek2', 'Yam2', 'Yeon6',
+    'Yu6', 'Yiu3', 'Yaang6', 'Yp5', 'Yai4', 'Yoek4', 'Yit6', 'Yam5', 'Yoeng6', 'Yg1',
+    'Yk3', 'Yoe4', 'Yam3', 'Yc', 'Yyu4', 'Yyut1', 'Yiu4', 'Ying3', 'Yip3', 'Yaap3',
+    'Yau3', 'Yan4', 'Yau1', 'Yap4', 'Yk6', 'Yok3', 'Yai1', 'Yeot6', 'Yan2', 'Yoek6',
+    'Yt1', 'Yoi1', 'Yit5', 'Yn4', 'Yaau3', 'Yau4', 'Yuk6', 'Ys', 'Yuk', 'Yin6',
+    'Yung6', 'Ya', 'You', 'Yaai5', 'Yau5', 'Yoi3', 'Yaak3', 'Yaat3', 'Ying2', 'Yok5',
+    'Yeng2', 'Yyut3', 'Yam1', 'Yip5', 'You1', 'Yam6', 'Yaa5', 'Yi6', 'Yek4', 'Yyu2',
+    'Yuk5', 'Yaam1', 'Yang2', 'Yai', 'Yiu6', 'Yin4', 'Yok4', 'Yot3', 'Yui2', 'Yeoi5',
+    'Yyun6', 'Yyu5', 'Yoi5', 'Yeot2', 'Yim4', 'Yeoi2', 'Yaan1', 'Yang6', 'Yong1', 'Yaang4',
+    'Yung5', 'Yeon1', 'Yin2', 'Ya3', 'Yaang3', 'Yg', 'Yk2', 'Yaau5', 'Yut1', 'Yt5',
+    'Yip4', 'Yung4', 'Yj', 'Yong3', 'Ya1', 'Yg6', 'Yaau6', 'Yit3', 'Yun3', 'Ying1',
+    'Yn2', 'Yg4', 'Yl', 'Yp3', 'Yn3', 'Yak1', 'Yang5', 'Yoe6', 'You2', 'Yap2',
+    'Yak2', 'Yt3', 'Yot5', 'Yim2', 'Yi1', 'Yn6', 'Yaat5', 'Yaam3', 'Yoek5', 'Ye3',
+    'Yeon4', 'Yaa2', 'Yu3', 'Yim6', 'Ym', 'Yoe3', 'Yaai2', 'Ym2', 'Ya6', 'Yeng6',
+    'Yik4', 'Yot4', 'Yaai4', 'Yyun3', 'Yu1', 'Yoeng1', 'Yaap2', 'Yuk3', 'Yoek3', 'Yeng5',
+    'Yeoi1', 'Yiu2', 'Yok1', 'Yo1', 'Yoek1', 'Yoeng2', 'Yeon5', 'Yiu1', 'Yoeng4', 'Yuk2',
+    'Yat4', 'Yg5', 'Yut4', 'Yan6', 'Yin3', 'Yaa6', 'Yap1', 'Yg2', 'Yoe5', 'Yt4',
+    'Ya5', 'Yo4', 'Yyu1', 'Yak3', 'Yeon2', 'Yong4', 'Ym1', 'Ye2', 'Yaang5', 'Yoi2',
+    'Yeng3', 'Yn', 'Yyut4', 'Yau', 'Yaak2', 'Yaan4', 'Yek2', 'Yin1', 'Yi5', 'Yoe2',
+    'Yei5', 'Yaat6', 'Yak5', 'Yp6', 'Yok6', 'Yei2', 'Yaap1', 'Yyut5', 'Yi4', 'Yim1',
+    'Yk5', 'Ye4', 'Yok2', 'Yaam6', 'Yat2', 'Yon6', 'Yei3', 'Yyu6', 'Yeot5', 'Yk4',
+    'Yai6', 'Yd', 'Yg3', 'Yei6', 'Yau2', 'Yok', 'Yau6', 'Yung3', 'Yim5', 'Yut6',
+    'Yit1', 'Yon3', 'Yat1', 'Yaam2', 'Yyut2', 'Yui6', 'Yt2', 'Yek6', 'Yt', 'Ye6',
+    'Yang3', 'Ying6', 'Yaau1', 'Yeon3', 'Yng', 'Yh', 'Yang4', 'Ying5', 'Yaap6', 'Yoeng3',
+    'Yyun4', 'You3', 'Yan5', 'Yat5', 'Yot1', 'Yun1', 'Yi3', 'Yaa1', 'Yaap4', 'You6',
+    'Yaang2', 'Yaap5', 'Yaa3', 'Yaak6', 'Yeng1', 'Yaak1', 'Yo5', 'Yoi4', 'Yam4', 'Yik1',
+    'Ye1', 'Yai5', 'Yung1', 'Yp2', 'Yui4', 'Yaak4', 'Yung2', 'Yak4', 'Yaat4', 'Yeoi4',
+    'Yut2', 'Yin5', 'Yaau4', 'Yap6', 'Yb', 'Yaam4', 'Yw', 'Yut3', 'Yong2', 'Yt6',
+    'Yaai6', 'Yap5', 'Yik5', 'Yun6', 'Yaam5', 'Yun5', 'Yik3', 'Ya2', 'Yyut6', 'Yon4',
+    'Yk1', 'Yit4', 'Yak6', 'Yaan2', 'Yuk1', 'Yai2', 'Yik2', 'Yaat2', 'Yo3', 'Ykw',
+    'Yn5', 'Yaa', 'Ye5', 'Yu4', 'Yei1', 'Yai3', 'Yyun5', 'Yip2', 'Yaau2', 'Yiu5',
+    'Ym4', 'Yeoi6', 'Yk', 'Ym6', 'Yoe1', 'Yeoi3', 'Yon', 'Yuk4', 'Yaai3', 'Yaa4',
+    'Yot6', 'Yaang1', 'Yei4', 'Yek1', 'Yo', 'Yp', 'Yo6', 'Yp4', 'Yan3', 'Yoi',
+    'Yap3', 'Yek3', 'Yim3', 'Yz', 'Yot2', 'Yoi6', 'Yit2', 'Yu5', 'Yaan3', 'Yan1',
+    'Yon5', 'Yp1', 'Yong5', 'Ygw', 'Yak', 'Yat6', 'Ying4', 'Yu2', 'Yf', 'Ya4',
+    'Yon1', 'You4', 'Yik6', 'Yui1', 'Yaat1', 'Yeot4', 'Yi2', 'Yaai1', 'Yek5', 'Ym3',
+    'Yong6', 'You5', 'Yyun1', 'Yn1', 'Yo2', 'Yip6', 'Yui3', 'Yaak5', 'Yyun2',
+}
+
+# symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa) + list(ko_symbols)  # + list(yue_symbols)  ### appending yue in this one-liner scrambles the ordering
+symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
+symbols = sorted(set(symbols))
+# print(len(symbols))
+symbols += ["[", "]"]  ## newly added Japanese rising/falling intonation marks
+symbols += sorted(list(ko_symbols))
+symbols += sorted(list(yue_symbols))  ## the new yue symbols all go at the end; verified the Y prefix leaves no duplicates, and Korean obviously cannot collide
+# print(len(symbols))
+
+if __name__ == "__main__":
+    print(len(symbols))
+
+'''
+Cantonese:
+    732-353=379
+Korean + Cantonese:
+    732-322=410
+'''
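The append-only construction at the bottom of symbols2.py (sort the v1 core once, then tack the `[`/`]` marks, the Korean jamo, and the Cantonese set onto the end) presumably keeps every pre-existing symbol id stable, which is what lets v1 embedding rows stay valid; the commented-out one-liner would have re-sorted everything, which the author's note flags as scrambling the order. A toy check of that invariant (the inventories here are hypothetical, not the real tables):

v1_core = sorted(set(["_", "a1", "b", "AA"]))  # sorted once, ids frozen
v2 = v1_core + ["[", "]"] + sorted(["ㅏ", "ㄱ"]) + sorted(["Ya1", "Yo5"])

old_ids = {s: i for i, s in enumerate(v1_core)}
new_ids = {s: i for i, s in enumerate(v2)}
# Every v1 symbol keeps its id, so a v1 embedding matrix remains a valid prefix.
assert all(new_ids[s] == old_ids[s] for s in v1_core)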