Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git
Use TTSFRD to process text
parent 8d91183c4c
commit efe803f589
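This commit swaps the text front end over to Alibaba's ttsfrd for zh/en/ja g2p. Everything hangs off a small engine API that the new cleaner calls; a minimal round-trip sketch using exactly the calls that appear in the diff below (the shape of the returned symbol string depends on the installed ttsfrd build):

# Minimal round-trip through the ttsfrd engine as the new cleaner uses it.
# TtsFrontendEngine, initialize, set_lang_type and gen_tacotron_symbols
# all appear verbatim in the diff; the resource path is the unpacked
# resource.zip from ModelScope.
import ttsfrd

fe = ttsfrd.TtsFrontendEngine()
assert fe.initialize("GPT_SoVITS/text/resource"), "Check ttsfrd resource"
fe.set_lang_type("zh-cn")  # ENG_LANG_MAPPING["PinYin"] in the diff
# Returns "{...}" symbol groups that the new clean_text() parses.
print(fe.gen_tacotron_symbols("你好，世界。"))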
@@ -252,13 +252,13 @@ def clean_text_inf(text, language):
 dtype=torch.float16 if is_half == True else torch.float32
 def get_bert_inf(phones, word2ph, norm_text, language):
     language=language.replace("all_","")
-    if language == "zh":
-        bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
-    else:
+    # if language == "zh":
+    #     bert = get_bert_feature(norm_text, word2ph).to(device)#.to(dtype)
+    # else:
     bert = torch.zeros(
         (1024, len(phones)),
         dtype=torch.float16 if is_half == True else torch.float32,
     ).to(device)

     return bert

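Net effect of this hunk: get_bert_inf no longer computes BERT features for Chinese; every language now falls through to the zero placeholder. A standalone sketch of the resulting behavior (device and is_half are module globals in the webui; the values here are assumptions for the sketch):

import torch

device = "cpu"   # assumption; the webui sets this globally
is_half = False  # assumption; the webui sets this from its config

def get_bert_inf(phones, word2ph, norm_text, language):
    language = language.replace("all_", "")
    # With the zh branch commented out, all languages get a zero
    # tensor of shape (1024, len(phones)).
    bert = torch.zeros(
        (1024, len(phones)),
        dtype=torch.float16 if is_half else torch.float32,
    ).to(device)
    return bert

print(get_bert_inf(["n", "i3"], [2], "你", "all_zh").shape)  # torch.Size([1024, 2])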
@@ -269,9 +269,21 @@ def nonen_clean_text_inf(text, language):
     else:
         textlist=[]
         langlist=[]
+        mix = ""
         for tmp in LangSegment.getTexts(text):
-            langlist.append(tmp["lang"])
-            textlist.append(tmp["text"])
+            if tmp["lang"] in {"zh","en"}:
+                mix += tmp["text"]
+            else:
+                if mix:
+                    langlist.append("zh")
+                    textlist.append(mix)
+                    mix = ""
+                langlist.append(tmp["lang"])
+                textlist.append(tmp["text"])
+        if mix:
+            langlist.append("zh")
+            textlist.append(mix)
+            mix = ""
     print(textlist)
     print(langlist)
     phones_list = []
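The new loop buffers consecutive zh/en segments from LangSegment into one chunk and flushes the chunk under the "zh" tag, so mixed Chinese-English runs reach the cleaner as a single segment. A standalone sketch with a hard-coded stand-in for LangSegment.getTexts (the real call yields dicts with "lang" and "text" keys):

# Stand-in for LangSegment.getTexts(text); hand-written for illustration.
segments = [
    {"lang": "zh", "text": "你好"},
    {"lang": "en", "text": "world"},
    {"lang": "ja", "text": "こんにちは"},
    {"lang": "zh", "text": "再见"},
]

textlist, langlist, mix = [], [], ""
for tmp in segments:
    if tmp["lang"] in {"zh", "en"}:
        # Consecutive zh/en runs are buffered into one chunk...
        mix += tmp["text"]
    else:
        if mix:
            # ...and flushed as a single "zh" segment before any other language.
            langlist.append("zh")
            textlist.append(mix)
            mix = ""
        langlist.append(tmp["lang"])
        textlist.append(tmp["text"])
if mix:  # flush a trailing zh/en run
    langlist.append("zh")
    textlist.append(mix)

print(langlist)  # ['zh', 'ja', 'zh']
print(textlist)  # ['你好world', 'こんにちは', '再见']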
@@ -325,19 +337,19 @@ def get_first(text):


 def get_cleaned_text_fianl(text,language):
-    if language in {"en","all_zh","all_ja"}:
+    if language in {"en","all_zh","all_ja","zh"}:
         phones, word2ph, norm_text = clean_text_inf(text, language)
-    elif language in {"zh", "ja","auto"}:
+    elif language in {"ja","auto"}:
         phones, word2ph, norm_text = nonen_clean_text_inf(text, language)
     return phones, word2ph, norm_text

 def get_bert_final(phones, word2ph, norm_text,language,device):
-    if text_language == "en":
+    if text_language in {"en","all_zh","zh"}:
         bert = get_bert_inf(phones, word2ph, norm_text, text_language)
-    elif text_language in {"zh", "ja","auto"}:
+    elif text_language in {"ja","auto"}:
         bert = nonen_get_bert_inf(text, text_language)
-    elif text_language == "all_zh":
-        bert = get_bert_feature(norm_text, word2ph).to(device)
+    # elif text_language == "all_zh":
+    #     bert = get_bert_feature(norm_text, word2ph).to(device)
     else:
         bert = torch.zeros((1024, len(phones))).to(device)
     return bert
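After this hunk, "zh" and "all_zh" are routed through the TTSFRD-backed clean_text_inf/get_bert_inf path, and only "ja"/"auto" still go through the nonen_* splitters. (Note that get_bert_final's body reads text_language and text rather than its own parameters, so as written it leans on the caller's globals.) A condensed view of the new dispatch, mirroring the set literals in the diff:

def route(language):
    # Mirrors get_cleaned_text_fianl after this commit: zh joins the
    # direct (TTSFRD) path; only ja/auto keep the per-segment splitter.
    if language in {"en", "all_zh", "all_ja", "zh"}:
        return "clean_text_inf"
    elif language in {"ja", "auto"}:
        return "nonen_clean_text_inf"

for lang in ("zh", "all_zh", "ja", "auto", "en"):
    print(lang, "->", route(lang))
# zh, all_zh, en -> clean_text_inf; ja, auto -> nonen_clean_text_inf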
@@ -1,4 +1,221 @@
 from text import chinese, japanese, cleaned_text_to_sequence, symbols, english
+import ttsfrd
+import re,os,zipfile,requests
+
+ENG_LANG_MAPPING = {
+    "PinYin": "zh-cn",
+    "English": "en-us",
+    "British": "en-gb",
+    "ZhHK": "hk_cantonese",
+    "Sichuan": "sichuan",
+    "Japanese": "japanese",
+    "WuuShangHai": "shanghai",
+    "Indonesian": "indonesian",
+    "Malay": "malay",
+    "Filipino": "filipino",
+    "Vietnamese": "vietnamese",
+    "Korean": "korean",
+    "Russian": "russian",
+}
+
+chinese_dict = {
+    "xx":"x",
+    "uei":"ui",
+    "ii":"i0",
+    "ih":"ir",
+    "uen":"un",
+    "iou":"iu",
+    "angr":"ang",
+    "anr":"an",
+    "aor":"ao",
+    "ar":"a",
+    "eir":"ei",
+    "engr":"eng",
+    "enr":"en",
+    "ianr":"ian",
+    "iaor":"iao",
+    "ingr":"ing",
+    "or":"o",
+    "ur":"u",
+    "ihr":"ih",
+    "ongr":"ong",
+    "our":"ou",
+    "uangr":"uang",
+    "uanr":"uan",
+    "ueir":"uei",
+    "uenr":"uen",
+    "uor":"uo",
+    "iir":"ii",
+    "air":"ai",
+    "ier":"ie",
+    "uair":"uai",
+    "uar":"ua",
+    "iar":"ia",
+    "inr":"in",
+    "iour":"iou",
+    "vanr":"van",
+    "ver":"ve",
+    "vnr":"vn",
+    "iangr":"iang",
+    "vr":"v",
+    "iongr":"iong",
+}
+english_dict = {
+    "DH1":"DH",
+    "NG0":"NG",
+    "SH0":"SH",
+    "NG1":"NG",
+    "CH0":"CH",
+    "HH0":"HH",
+    "ZH0":"ZH",
+    "HH1":"HH",
+    "SH1":"SH",
+    "ZH1":"ZH",
+    "DH0":"DH",
+    "TH1":"TH",
+    "CH1":"CH",
+    "JH1":"JH",
+    "JH0":"JH",
+    "NG2":"NG",
+    "TH0":"TH",
+}
+japanese_dict = {
+    "nn":"N",
+    "ux":"U",
+    "ix":"I",
+}
+
+resource_dir = "GPT_SoVITS/text/resource"
+resources_zip_file = "GPT_SoVITS/text/resource.zip"
+if not os.path.exists(resource_dir):
+    if not os.path.exists(resources_zip_file):
+        print("Downloading ttsfrd resources...")
+        modelscope_url = "https://www.modelscope.cn/api/v1/models/speech_tts/speech_kantts_ttsfrd/repo?Revision=v0.0.1&FilePath=resource.zip"
+        with requests.get(modelscope_url, stream=True) as r:
+            r.raise_for_status()
+            with open(resources_zip_file, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+
+    print("Extracting ttsfrd resources...")
+    with zipfile.ZipFile(resources_zip_file, "r") as zip_ref:
+        zip_ref.extractall("GPT_SoVITS/text")
+
+fe = ttsfrd.TtsFrontendEngine()
+assert fe.initialize(resource_dir),"Check ttsfrd resource"
+
+def clean_text(text, language):
+    if(language not in language_module_map):
+        language="en"
+        text=" "
+    if language == "zh":
+        phones = []
+        word2ph = []
+        count = 0
+        fe.set_lang_type(ENG_LANG_MAPPING["PinYin"])
+        res = fe.gen_tacotron_symbols(text)
+        matches = re.findall(r'\{(.*?)\}', res)
+        for match in matches:
+            elements = match.split("$")
+            if elements[2] == "s_none":
+                if elements[0] == "#4":
+                    phone = "."
+                    phones += [phone]
+                    word2ph.append(1)
+                    continue
+                if elements[0] == "#3":
+                    phone = ","
+                    phones += [phone]
+                    word2ph.append(1)
+                    continue
+
+            # Chinese
+            if elements[0] == "ga":
+                phone = "AA"
+                phones += [phone]
+                count += 1
+                continue
+            if elements[0] == "ge":
+                phone = "EE"
+                phones += [phone]
+                count += 1
+                continue
+            if elements[0] == "go":
+                phone = "OO"
+                phones += [phone]
+                count += 1
+                continue
+            if "_c" in elements[0]:
+                if elements[2] in ("s_begin","s_middle","s_both","s_end"):
+                    phone = elements[0].replace("_c", "")
+                    phone = chinese_dict.get(phone, phone)
+                    phone = chinese_dict.get(phone, phone)
+                    count += 1
+                    if elements[2] == "s_end":
+                        phone += elements[1].replace("tone", "")
+                        word2ph.append(count)
+                        count = 0
+                    phones += [phone]
+                    continue
+
+            # English
+            else:
+                if elements[2] in ("s_begin","s_middle","s_both","s_end"):
+                    phone = elements[0].upper()
+                    if len(elements[0]) > 1 :
+                        phone += elements[1].replace("tone", "")
+                    phone = english_dict.get(phone, phone)
+                    phones += [phone]
+                    continue
+    elif language == "en":
+        phones = []
+        word2ph = None
+        fe.set_lang_type(ENG_LANG_MAPPING["English"])
+        res = fe.gen_tacotron_symbols(text)
+        matches = re.findall(r'\{(.*?)\}', res)
+        for match in matches:
+            elements = match.split("$")
+            if elements[2] == "s_none":
+                if elements[0] == "#4":
+                    phone = "."
+                    phones += [phone]
+                    continue
+
+            if elements[2] in ("s_begin","s_middle","s_both","s_end"):
+                phone = elements[0].upper()
+                if len(elements[0]) > 1 :
+                    phone += elements[1].replace("tone", "")
+                phone = english_dict.get(phone, phone)
+                phones += [phone]
+                continue
+    elif language == "ja":
+        phones = []
+        word2ph = None
+        fe.set_lang_type(ENG_LANG_MAPPING["Japanese"])
+        res = fe.gen_tacotron_symbols(text)
+        matches = re.findall(r'\{(.*?)\}', res)
+        for match in matches:
+            elements = match.split("$")
+            if elements[2] == "s_none":
+                if elements[0] == "#4":
+                    phone = "."
+                    phones += [phone]
+                    continue
+                if elements[0] == "#3":
+                    phone = ","
+                    phones += [phone]
+                    continue
+
+            if elements[2] in ("s_begin","s_middle","s_both","s_end"):
+                phone = elements[0]
+                phone = japanese_dict.get(phone, phone)
+                phones += [phone]
+                continue
+    # print("new:",phones)
+    # p,w,n = clean_text_old(text, language)
+    # print("old:",p)
+    return phones, word2ph, text

 language_module_map = {"zh": chinese, "ja": japanese, "en": english}
 special = [
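The core of the new cleaner is parsing the "{...}" groups that gen_tacotron_symbols emits: each group splits on "$" into roughly (phone, tone tag, syllable position), "_c" marks a Chinese phone, and the double chinese_dict.get lets chained entries resolve (e.g. "iour" -> "iou" -> "iu"). A toy parse over a hand-written sample string (the real string comes from ttsfrd, so treat its exact shape as an assumption):

import re

chinese_dict = {"iour": "iou", "iou": "iu"}  # excerpt from the diff's table

# Hand-written sample; a real string comes from fe.gen_tacotron_symbols(text)
# and may carry more fields per group.
res = "{n_c$tone3$s_begin} {iour_c$tone3$s_end} {#3$none$s_none}"

for match in re.findall(r"\{(.*?)\}", res):
    elements = match.split("$")
    phone = elements[0]
    if "_c" in phone:  # Chinese phone: strip marker, map twice, tone at s_end
        phone = phone.replace("_c", "")
        phone = chinese_dict.get(phone, phone)
        phone = chinese_dict.get(phone, phone)
        if elements[2] == "s_end":
            phone += elements[1].replace("tone", "")
    print(phone)
# n    (no tone yet: s_begin)
# iu3  (chained mapping iour -> iou -> iu, plus tone 3)
# #3   (prosodic break; clean_text maps it to ",")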
@@ -9,7 +226,7 @@ special = [
 ]


-def clean_text(text, language):
+def clean_text_old(text, language):
     if(language not in language_module_map):
         language="en"
         text=" "
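With the rename, the module keeps the original rule-based pipeline as clean_text_old while clean_text is now the TTSFRD path. A hedged usage sketch (assuming the ttsfrd wheel and its resource pack are installed, since the engine is initialized at import time):

from text.cleaner import clean_text, clean_text_old

# TTSFRD path: for "zh", word2ph holds per-syllable phone counts;
# the new "en"/"ja" paths return word2ph = None.
phones, word2ph, norm_text = clean_text("你好，世界。", "zh")
print(phones, word2ph)

# The old rule-based pipeline is still available for comparison.
p_old, w_old, n_old = clean_text_old("你好，世界。", "zh")
print(p_old, w_old)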