Use TTSFRD to process text

This commit is contained in:
KamioRinn 2024-02-03 14:07:59 +08:00
parent 8d91183c4c
commit efe803f589
2 changed files with 245 additions and 16 deletions

View File

@ -252,13 +252,13 @@ def clean_text_inf(text, language):
dtype=torch.float16 if is_half == True else torch.float32
def get_bert_inf(phones, word2ph, norm_text, language):
language=language.replace("all_","")
if language == "zh":
bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
else:
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
# if language == "zh":
# bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
# else:
bert = torch.zeros(
(1024, len(phones)),
dtype=torch.float16 if is_half == True else torch.float32,
).to(device)
return bert
@ -269,9 +269,21 @@ def nonen_clean_text_inf(text, language):
else:
textlist=[]
langlist=[]
mix = ""
for tmp in LangSegment.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
if tmp["lang"] in {"zh","en"}:
mix += tmp["text"]
else:
if mix:
langlist.append("zh")
textlist.append(mix)
mix = ""
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
if mix:
langlist.append("zh")
textlist.append(mix)
mix = ""
print(textlist)
print(langlist)
phones_list = []
@ -325,19 +337,19 @@ def get_first(text):
def get_cleaned_text_fianl(text,language):
if language in {"en","all_zh","all_ja"}:
if language in {"en","all_zh","all_ja","zh"}:
phones, word2ph, norm_text = clean_text_inf(text, language)
elif language in {"zh", "ja","auto"}:
elif language in {"ja","auto"}:
phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
return phones, word2ph, norm_text
def get_bert_final(phones, word2ph, norm_text,language,device):
if text_language == "en":
if text_language in {"en","all_zh","zh"}:
bert = get_bert_inf(phones, word2ph, norm_text, text_language)
elif text_language in {"zh", "ja","auto"}:
elif text_language in {"ja","auto"}:
bert = nonen_get_bert_inf(text, text_language)
elif text_language == "all_zh":
bert = get_bert_feature(norm_text, word2ph).to(device)
# elif text_language == "all_zh":
# bert = get_bert_feature(norm_text, word2ph).to(device)
else:
bert = torch.zeros((1024, len(phones))).to(device)
return bert

View File

@ -1,4 +1,221 @@
from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
import ttsfrd
import re,os,zipfile,requests
# Language names understood by the ttsfrd front end, mapped to the locale
# codes passed to TtsFrontendEngine.set_lang_type().
ENG_LANG_MAPPING = {
    "PinYin": "zh-cn",
    "English": "en-us",
    "British": "en-gb",
    "ZhHK": "hk_cantonese",
    "Sichuan": "sichuan",
    "Japanese": "japanese",
    "WuuShangHai": "shanghai",
    "Indonesian": "indonesian",
    "Malay": "malay",
    "Filipino": "filipino",
    "Vietnamese": "vietnamese",
    "Korean": "korean",
    "Russian": "russian",
}

# Pinyin-final rewrites from ttsfrd symbols to GPT-SoVITS phone names.
# NOTE: lookups are chained twice by the caller, so an erhua entry may feed
# into a spelling fix (e.g. "ueir" -> "uei" -> "ui").
chinese_dict = {
    # spelling normalizations
    "xx": "x",
    "uei": "ui",
    "ii": "i0",
    "ih": "ir",
    "uen": "un",
    "iou": "iu",
    # erhua ("r"-suffixed) finals collapsed to their plain form
    "ar": "a",
    "air": "ai",
    "angr": "ang",
    "anr": "an",
    "aor": "ao",
    "eir": "ei",
    "engr": "eng",
    "enr": "en",
    "ier": "ie",
    "iar": "ia",
    "iangr": "iang",
    "ianr": "ian",
    "iaor": "iao",
    "ihr": "ih",
    "iir": "ii",
    "ingr": "ing",
    "inr": "in",
    "iongr": "iong",
    "iour": "iou",
    "or": "o",
    "ongr": "ong",
    "our": "ou",
    "uair": "uai",
    "uar": "ua",
    "uangr": "uang",
    "uanr": "uan",
    "ueir": "uei",
    "uenr": "uen",
    "uor": "uo",
    "ur": "u",
    "vanr": "van",
    "ver": "ve",
    "vnr": "vn",
    "vr": "v",
}

# English phones whose ttsfrd form carries a spurious stress digit on a
# consonant (consonants have no stress in the target phone set).
english_dict = {
    "CH0": "CH",
    "CH1": "CH",
    "DH0": "DH",
    "DH1": "DH",
    "HH0": "HH",
    "HH1": "HH",
    "JH0": "JH",
    "JH1": "JH",
    "NG0": "NG",
    "NG1": "NG",
    "NG2": "NG",
    "SH0": "SH",
    "SH1": "SH",
    "TH0": "TH",
    "TH1": "TH",
    "ZH0": "ZH",
    "ZH1": "ZH",
}

# Japanese phone rewrites (moraic nasal and devoiced vowels).
japanese_dict = {
    "nn": "N",
    "ux": "U",
    "ix": "I",
}
# ttsfrd linguistic resources: downloaded from ModelScope on first run,
# then extracted next to the text module.
resource_dir = "GPT_SoVITS/text/resource"
resources_zip_file = "GPT_SoVITS/text/resource.zip"

if not os.path.exists(resource_dir):
    if not os.path.exists(resources_zip_file):
        print("Downloading ttsfrd resources...")
        modelscope_url = "https://www.modelscope.cn/api/v1/models/speech_tts/speech_kantts_ttsfrd/repo?Revision=v0.0.1&FilePath=resource.zip"
        # Stream to a temporary ".part" file and rename only on success, so
        # an interrupted download does not leave a truncated resource.zip
        # that would be mistaken for a complete archive on the next run.
        tmp_zip_file = resources_zip_file + ".part"
        with requests.get(modelscope_url, stream=True) as r:
            r.raise_for_status()
            with open(tmp_zip_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_zip_file, resources_zip_file)
    print("Extracting ttsfrd resources...")
    with zipfile.ZipFile(resources_zip_file, "r") as zip_ref:
        zip_ref.extractall("GPT_SoVITS/text")

# Module-level engine instance used by clean_text() below.
fe = ttsfrd.TtsFrontendEngine()
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# which would silently skip resource validation.
if not fe.initialize(resource_dir):
    raise RuntimeError("Check ttsfrd resource")
def clean_text(text, language):
    """Phonemize *text* with the ttsfrd front end.

    Runs ``fe.gen_tacotron_symbols`` and converts the emitted symbol stream
    into GPT-SoVITS phone names.

    Parameters:
        text: input text to phonemize.
        language: "zh", "en" or "ja"; any other value falls back to "en"
            with a single-space text.

    Returns:
        ``(phones, word2ph, norm_text)`` — ``phones`` is a list of phone
        strings; ``word2ph`` is the per-character phone-count list for
        "zh" and ``None`` for "en"/"ja"; ``norm_text`` is the (unmodified)
        input text.
    """
    # Unknown language: degrade to English with a blank utterance.
    if(language not in language_module_map):
        language="en"
        text=" "
    if language == "zh":
        phones = []
        word2ph = []
        count = 0  # phones accumulated for the current character (syllable)
        fe.set_lang_type(ENG_LANG_MAPPING["PinYin"])
        res = fe.gen_tacotron_symbols(text)
        # Symbols come wrapped in braces; each group is "$"-separated.
        # From usage below: elements[0] = symbol, elements[1] = "toneN",
        # elements[2] = position tag (s_begin/s_middle/s_both/s_end, or
        # s_none for prosodic break marks) — TODO confirm against ttsfrd docs.
        matches = re.findall(r'\{(.*?)\}', res)
        for match in matches:
            elements = match.split("$")
            if elements[2] == "s_none":
                # Break marks: "#4" -> sentence pause ".", "#3" -> phrase
                # pause ","; each counts as one phone for one character.
                if elements[0] == "#4":
                    phone = "."
                    phones += [phone]
                    word2ph.append(1)
                    continue
                if elements[0] == "#3":
                    phone = ","
                    phones += [phone]
                    word2ph.append(1)
                    continue
            # Chinese
            # Standalone vowel initials map to the GPT-SoVITS AA/EE/OO
            # phones; they extend the current syllable's phone count.
            if elements[0] == "ga":
                phone = "AA"
                phones += [phone]
                count += 1
                continue
            if elements[0] == "ge":
                phone = "EE"
                phones += [phone]
                count += 1
                continue
            if elements[0] == "go":
                phone = "OO"
                phones += [phone]
                count += 1
                continue
            if "_c" in elements[0]:
                if elements[2] in ("s_begin","s_middle","s_both","s_end"):
                    phone = elements[0].replace("_c", "")
                    # Looked up twice on purpose: erhua entries chain into
                    # the plain spelling fixes (e.g. "ueir" -> "uei" -> "ui").
                    phone = chinese_dict.get(phone, phone)
                    phone = chinese_dict.get(phone, phone)
                    count += 1
                    if elements[2] == "s_end":
                        # Syllable finals carry the tone digit; s_end also
                        # closes out this character's word2ph entry.
                        phone += elements[1].replace("tone", "")
                        word2ph.append(count)
                        count = 0
                    phones += [phone]
                    continue
            # English
            else:
                # Embedded English inside a "zh" utterance.
                if elements[2] in ("s_begin","s_middle","s_both","s_end"):
                    phone = elements[0].upper()
                    if len(elements[0]) > 1 :
                        # Multi-letter phones keep the stress digit; the
                        # english_dict lookup then strips it from consonants.
                        phone += elements[1].replace("tone", "")
                    phone = english_dict.get(phone, phone)
                    # NOTE(review): these phones never touch word2ph/count,
                    # so sum(word2ph) may not equal len(phones) for mixed
                    # zh/en input — confirm the caller tolerates this.
                    phones += [phone]
                    continue
    elif language == "en":
        phones = []
        word2ph = None  # English alignment is not character-based
        fe.set_lang_type(ENG_LANG_MAPPING["English"])
        res = fe.gen_tacotron_symbols(text)
        matches = re.findall(r'\{(.*?)\}', res)
        for match in matches:
            elements = match.split("$")
            if elements[2] == "s_none":
                # Only sentence-level pauses ("#4") are kept for English.
                if elements[0] == "#4":
                    phone = "."
                    phones += [phone]
                    continue
            if elements[2] in ("s_begin","s_middle","s_both","s_end"):
                phone = elements[0].upper()
                if len(elements[0]) > 1 :
                    phone += elements[1].replace("tone", "")
                phone = english_dict.get(phone, phone)
                phones += [phone]
                continue
    elif language == "ja":
        phones = []
        word2ph = None
        fe.set_lang_type(ENG_LANG_MAPPING["Japanese"])
        res = fe.gen_tacotron_symbols(text)
        matches = re.findall(r'\{(.*?)\}', res)
        for match in matches:
            elements = match.split("$")
            if elements[2] == "s_none":
                # Break marks: "#4" -> ".", "#3" -> ",".
                if elements[0] == "#4":
                    phone = "."
                    phones += [phone]
                    continue
                if elements[0] == "#3":
                    phone = ","
                    phones += [phone]
                    continue
            if elements[2] in ("s_begin","s_middle","s_both","s_end"):
                # Japanese symbols are kept as-is apart from the
                # japanese_dict rewrites (no upper-casing, no tone digit).
                phone = elements[0]
                phone = japanese_dict.get(phone, phone)
                phones += [phone]
                continue
    # print("new:",phones)
    # p,w,n = clean_text_old(text, language)
    # print("old:",p)
    return phones, word2ph, text
# Dispatch table for clean_text_old; its key set also serves as the list of
# languages clean_text accepts before falling back to English.
language_module_map = {"zh": chinese, "ja": japanese, "en": english}
special = [
@ -9,7 +226,7 @@ special = [
]
def clean_text(text, language):
def clean_text_old(text, language):
if(language not in language_module_map):
language="en"
text=" "