diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py
index 1868a122..c49f86f4 100644
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@@ -252,13 +252,13 @@ def clean_text_inf(text, language):
 dtype=torch.float16 if is_half == True else torch.float32
 def get_bert_inf(phones, word2ph, norm_text, language):
     language=language.replace("all_","")
-    if language == "zh":
-        bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
-    else:
-        bert = torch.zeros(
-            (1024, len(phones)),
-            dtype=torch.float16 if is_half == True else torch.float32,
-        ).to(device)
+    # if language == "zh":
+    #     bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
+    # else:
+    bert = torch.zeros(
+        (1024, len(phones)),
+        dtype=torch.float16 if is_half == True else torch.float32,
+    ).to(device)
 
     return bert
 
@@ -269,9 +269,21 @@ def nonen_clean_text_inf(text, language):
     else:
         textlist=[]
         langlist=[]
+        mix = ""
         for tmp in LangSegment.getTexts(text):
-            langlist.append(tmp["lang"])
-            textlist.append(tmp["text"])
+            if tmp["lang"] in {"zh","en"}:
+                mix += tmp["text"]
+            else:
+                if mix:
+                    langlist.append("zh")
+                    textlist.append(mix)
+                    mix = ""
+                langlist.append(tmp["lang"])
+                textlist.append(tmp["text"])
+        if mix:
+            langlist.append("zh")
+            textlist.append(mix)
+            mix = ""
     print(textlist)
     print(langlist)
     phones_list = []
@@ -325,19 +337,19 @@ def get_first(text):
 
 def get_cleaned_text_fianl(text,language):
-    if language in {"en","all_zh","all_ja"}:
+    if language in {"en","all_zh","all_ja","zh"}:
         phones, word2ph, norm_text = clean_text_inf(text, language)
-    elif language in {"zh", "ja","auto"}:
+    elif language in {"ja","auto"}:
         phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
     return phones, word2ph, norm_text
 
 def get_bert_final(phones, word2ph, norm_text,language,device):
-    if text_language == "en":
+    if text_language in {"en","all_zh","zh"}:
         bert = get_bert_inf(phones, word2ph, norm_text, text_language)
-    elif text_language in {"zh", "ja","auto"}:
+    elif text_language in {"ja","auto"}:
         bert = nonen_get_bert_inf(text, text_language)
-    elif text_language == "all_zh":
-        bert = get_bert_feature(norm_text, word2ph).to(device)
+    # elif text_language == "all_zh":
+    #     bert = get_bert_feature(norm_text, word2ph).to(device)
     else:
         bert = torch.zeros((1024, len(phones))).to(device)
     return bert
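The inference_webui.py side of the patch does two things: "zh" now takes the direct clean_text_inf/get_bert_inf path (the BERT branch in get_bert_inf is commented out, so it always returns a zero tensor), and in the "auto" path, consecutive segments that LangSegment tags as "zh" or "en" are buffered and flushed as one "zh" segment, so mixed Chinese-English text reaches the new front end in a single pass. A minimal standalone sketch of that merging loop (merge_zh_en is a hypothetical name; segments stands in for the dicts yielded by LangSegment.getTexts, each with "lang" and "text" keys):

def merge_zh_en(segments):
    # Buffer consecutive zh/en runs and emit each run as one "zh" segment.
    textlist, langlist, mix = [], [], ""
    for seg in segments:
        if seg["lang"] in {"zh", "en"}:
            mix += seg["text"]
        else:
            if mix:  # flush the buffered zh/en run first
                langlist.append("zh")
                textlist.append(mix)
                mix = ""
            langlist.append(seg["lang"])
            textlist.append(seg["text"])
    if mix:  # flush a trailing zh/en run
        langlist.append("zh")
        textlist.append(mix)
    return textlist, langlist

For example, segments tagged [zh:"你好", en:"world", ja:"こんにちは"] come out as (["你好world", "こんにちは"], ["zh", "ja"]).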
"ver":"ve", + "vnr":"vn", + "iangr":"iang", + "vr":"v", + "iongr":"iong", +} +english_dict = { + "DH1":"DH", + "NG0":"NG", + "SH0":"SH", + "NG1":"NG", + "CH0":"CH", + "HH0":"HH", + "ZH0":"ZH", + "HH1":"HH", + "SH1":"SH", + "ZH1":"ZH", + "DH0":"DH", + "TH1":"TH", + "CH1":"CH", + "JH1":"JH", + "JH0":"JH", + "NG2":"NG", + "TH0":"TH", +} +japanese_dict = { + "nn":"N", + "ux":"U", + "ix":"I", +} + +resource_dir = "GPT_SoVITS/text/resource" +resources_zip_file = "GPT_SoVITS/text/resource.zip" +if not os.path.exists(resource_dir): + if not os.path.exists(resources_zip_file): + print("Downloading ttsfrd resources...") + modelscope_url = "https://www.modelscope.cn/api/v1/models/speech_tts/speech_kantts_ttsfrd/repo?Revision=v0.0.1&FilePath=resource.zip" + with requests.get(modelscope_url, stream=True) as r: + r.raise_for_status() + with open(resources_zip_file, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + print("Extracting ttsfrd resources...") + with zipfile.ZipFile(resources_zip_file, "r") as zip_ref: + zip_ref.extractall("GPT_SoVITS/text") + +fe = ttsfrd.TtsFrontendEngine() +assert fe.initialize(resource_dir),"Check ttsfrd resource" + +def clean_text(text, language): + if(language not in language_module_map): + language="en" + text=" " + if language == "zh": + phones = [] + word2ph = [] + count = 0 + fe.set_lang_type(ENG_LANG_MAPPING["PinYin"]) + res = fe.gen_tacotron_symbols(text) + matches = re.findall(r'\{(.*?)\}', res) + for match in matches: + elements = match.split("$") + if elements[2] == "s_none": + if elements[0] == "#4": + phone = "." + phones += [phone] + word2ph.append(1) + continue + if elements[0] == "#3": + phone = "," + phones += [phone] + word2ph.append(1) + continue + + # Chinese + if elements[0] == "ga": + phone = "AA" + phones += [phone] + count += 1 + continue + if elements[0] == "ge": + phone = "EE" + phones += [phone] + count += 1 + continue + if elements[0] == "go": + phone = "OO" + phones += [phone] + count += 1 + continue + if "_c" in elements[0]: + if elements[2] in ("s_begin","s_middle","s_both","s_end"): + phone = elements[0].replace("_c", "") + phone = chinese_dict.get(phone, phone) + phone = chinese_dict.get(phone, phone) + count += 1 + if elements[2] == "s_end": + phone += elements[1].replace("tone", "") + word2ph.append(count) + count = 0 + phones += [phone] + continue + + # English + else: + if elements[2] in ("s_begin","s_middle","s_both","s_end"): + phone = elements[0].upper() + if len(elements[0]) > 1 : + phone += elements[1].replace("tone", "") + phone = english_dict.get(phone, phone) + phones += [phone] + continue + elif language == "en": + phones = [] + word2ph = None + fe.set_lang_type(ENG_LANG_MAPPING["English"]) + res = fe.gen_tacotron_symbols(text) + matches = re.findall(r'\{(.*?)\}', res) + for match in matches: + elements = match.split("$") + if elements[2] == "s_none": + if elements[0] == "#4": + phone = "." + phones += [phone] + continue + + if elements[2] in ("s_begin","s_middle","s_both","s_end"): + phone = elements[0].upper() + if len(elements[0]) > 1 : + phone += elements[1].replace("tone", "") + phone = english_dict.get(phone, phone) + phones += [phone] + continue + elif language == "ja": + phones = [] + word2ph = None + fe.set_lang_type(ENG_LANG_MAPPING["Japanese"]) + res = fe.gen_tacotron_symbols(text) + matches = re.findall(r'\{(.*?)\}', res) + for match in matches: + elements = match.split("$") + if elements[2] == "s_none": + if elements[0] == "#4": + phone = "." 
@@ -9,7 +226,7 @@ special = [
 ]
 
 
-def clean_text(text, language):
+def clean_text_old(text, language):
     if(language not in language_module_map):
         language="en"
         text=" "
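The original implementation is kept as clean_text_old, and the commented-out print lines at the end of the new clean_text show the intended A/B comparison during bring-up. A hypothetical smoke test, assuming GPT_SoVITS is on sys.path, the working directory is the repo root (so the relative resource path resolves), and the resource.zip download has completed:

from text.cleaner import clean_text, clean_text_old

phones, word2ph, norm_text = clean_text("你好世界", "zh")
print("new:", phones, word2ph)   # ttsfrd-based pipeline; norm_text is the raw input
print("old:", clean_text_old("你好世界", "zh")[0])  # previous pipeline, for comparison

Note that the new clean_text returns the input text unmodified as its third value; since get_bert_inf now always returns a zero tensor, nothing downstream consumes a normalized text for BERT features.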