mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-06 03:57:44 +08:00
Refactoring get phones and bert
This commit is contained in:
parent
1e8a8d03da
commit
bbef82fa86
@ -209,54 +209,8 @@ dict_language = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# Runs of ASCII letters and spaces are treated as English.
_EN_RUN_PATTERN = re.compile(r'[a-zA-Z ]+')
# A segment made only of non-word characters and/or underscores.
_PUNCT_ONLY_PATTERN = re.compile(r'^[\W_]+$')


def splite_en_inf(sentence, language):
    """Split ``sentence`` into English and non-English segments.

    Every maximal run of ASCII letters/spaces is tagged ``"en"``; the text
    between such runs is tagged with ``language``. Two clean-ups are then
    applied, in this order of precedence:

    * a segment consisting only of punctuation/whitespace/underscores is
      glued onto the segment before it (the very first segment is never
      moved, because there is nothing in front of it);
    * neighbouring segments that end up with the same tag are merged.

    Args:
        sentence: The text to split.
        language: Tag given to everything that is not an English run.

    Returns:
        A ``(textlist, langlist)`` tuple of two equally long lists: the
        segment texts and their language tags. Both are empty for an empty
        ``sentence``.
    """
    textlist = []
    langlist = []

    def _emit(segment, lang):
        # Fold the segment into the previous one when it is punctuation-only
        # or carries the same tag; otherwise start a new segment. Doing both
        # merges while emitting avoids the repeated in-place deletions.
        if textlist and (_PUNCT_ONLY_PATTERN.match(segment) or langlist[-1] == lang):
            textlist[-1] += segment
        else:
            textlist.append(segment)
            langlist.append(lang)

    pos = 0
    for match in _EN_RUN_PATTERN.finditer(sentence):
        start, end = match.span()
        if start > pos:
            # Non-English text sitting between two English runs.
            _emit(sentence[pos:start], language)
        _emit(sentence[start:end], "en")
        pos = end
    if pos < len(sentence):
        # Trailing non-English text after the last English run.
        _emit(sentence[pos:], language)

    return textlist, langlist
|
|
||||||
|
|
||||||
|
|
||||||
def clean_text_inf(text, language):
    """Clean ``text`` and convert its phonemes to a sequence.

    Runs ``clean_text`` on the input and passes the resulting phonemes
    through ``cleaned_text_to_sequence``. The text is used as given: no
    language filtering or whitespace normalisation is done in this function.

    Args:
        text: The text to clean.
        language: Language code forwarded unchanged to ``clean_text``.

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple, where ``phones`` is the
        output of ``cleaned_text_to_sequence`` and the other two values are
        returned by ``clean_text`` untouched.
    """
    phones, word2ph, norm_text = clean_text(text, language)
    phones = cleaned_text_to_sequence(phones)
    return phones, word2ph, norm_text
|
||||||
|
|
||||||
@ -274,55 +228,6 @@ def get_bert_inf(phones, word2ph, norm_text, language):
|
|||||||
return bert
|
return bert
|
||||||
|
|
||||||
|
|
||||||
def nonen_clean_text_inf(text, language):
    """Clean a possibly mixed-language text segment by segment.

    The text is split into per-language segments, each segment is cleaned
    with ``clean_text_inf``, and the per-segment results are merged.

    Args:
        text: The text to clean.
        language: ``"auto"`` to let ``LangSegment.getTexts`` decide both the
            segmentation and the tags; any other value is passed to
            ``splite_en_inf`` as the tag for non-English text.

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple: the concatenated phoneme
        sequence of all segments, the concatenated ``word2ph`` of the
        ``"zh"`` segments only, and the normalised segment texts joined by
        single spaces.
    """
    if language != "auto":
        textlist, langlist = splite_en_inf(text, language)
    else:
        segments = list(LangSegment.getTexts(text))
        textlist = [seg["text"] for seg in segments]
        langlist = [seg["lang"] for seg in segments]

    phones = []
    word2ph_list = []
    norm_text_list = []
    for segment, lang in zip(textlist, langlist):
        seg_phones, seg_word2ph, seg_norm_text = clean_text_inf(segment, lang)
        # Extend in place instead of sum(list_of_lists, []), which re-copies
        # the accumulated list for every segment.
        phones.extend(seg_phones)
        if lang == "zh":
            # word2ph is only collected for Chinese segments.
            word2ph_list.append(seg_word2ph)
        norm_text_list.append(seg_norm_text)

    # Debug output kept from the original implementation.
    print(word2ph_list)
    word2ph = [count for per_segment in word2ph_list for count in per_segment]
    norm_text = ' '.join(norm_text_list)

    return phones, word2ph, norm_text
|
|
||||||
|
|
||||||
|
|
||||||
def nonen_get_bert_inf(text, language):
    """Build the BERT feature tensor for a possibly mixed-language text.

    The text is split into per-language segments (via
    ``LangSegment.getTexts`` when ``language`` is ``"auto"``, otherwise via
    ``splite_en_inf``). Each segment is cleaned with ``clean_text_inf`` and
    turned into a feature tensor by ``get_bert_inf``.

    Args:
        text: The text to process.
        language: ``"auto"`` or the tag to use for non-English text.

    Returns:
        The per-segment tensors concatenated along dimension 1.
    """
    if language == "auto":
        segments = list(LangSegment.getTexts(text))
        textlist = [seg["text"] for seg in segments]
        langlist = [seg["lang"] for seg in segments]
    else:
        textlist, langlist = splite_en_inf(text, language)

    # Debug output of the segmentation result.
    print(textlist)
    print(langlist)

    bert_list = []
    for segment, lang in zip(textlist, langlist):
        phones, word2ph, norm_text = clean_text_inf(segment, lang)
        bert_list.append(get_bert_inf(phones, word2ph, norm_text, lang))

    return torch.cat(bert_list, dim=1)
|
|
||||||
|
|
||||||
|
|
||||||
splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
|
splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", }
|
||||||
|
|
||||||
|
|
||||||
@ -332,23 +237,59 @@ def get_first(text):
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def get_phones_and_bert(text,language):
    """Return the phoneme sequence, BERT features and normalised text.

    Two families of language codes are handled:

    * ``"en"``, ``"all_zh"``, ``"all_ja"`` - the whole text is treated as a
      single language (the ``all_`` prefix is stripped first).
    * ``"zh"``, ``"ja"``, ``"auto"`` - the text is split into segments with
      ``LangSegment.getTexts`` and every segment is processed on its own.

    Args:
        text: The text to process.
        language: One of the language codes listed above.

    Returns:
        A ``(phones, bert, norm_text)`` tuple. ``bert`` is cast to the
        module-level ``dtype`` before it is returned.

    Raises:
        ValueError: If ``language`` is not one of the supported codes.
    """
    if language in {"en","all_zh","all_ja"}:
        language = language.replace("all_","")
        if language == "en":
            LangSegment.setfilters(["en"])
            formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
        else:
            # Chinese and Japanese kanji cannot be told apart reliably, so
            # the language chosen by the user is trusted.
            formattext = text
        # Collapse runs of spaces into one. The test must look for TWO
        # spaces; testing for a single space would never terminate.
        while "  " in formattext:
            formattext = formattext.replace("  ", " ")
        phones, word2ph, norm_text = clean_text_inf(formattext, language)
        if language == "zh":
            bert = get_bert_feature(norm_text, word2ph).to(device)
        else:
            # Non-Chinese text gets an all-zero placeholder with one column
            # per phoneme.
            bert = torch.zeros(
                (1024, len(phones)),
                dtype=torch.float16 if is_half else torch.float32,
            ).to(device)
    elif language in {"zh", "ja","auto"}:
        textlist=[]
        langlist=[]
        LangSegment.setfilters(["zh","ja","en"])
        for tmp in LangSegment.getTexts(text):
            if language == "auto" or tmp["lang"] == "en":
                langlist.append(tmp["lang"])
            else:
                # Chinese and Japanese kanji cannot be told apart reliably,
                # so the language chosen by the user is trusted.
                langlist.append(language)
            textlist.append(tmp["text"])
        # Debug output of the segmentation result.
        print(textlist)
        print(langlist)
        phones = []
        bert_list = []
        norm_text_list = []
        for segment, lang in zip(textlist, langlist):
            seg_phones, word2ph, seg_norm_text = clean_text_inf(segment, lang)
            bert_list.append(get_bert_inf(seg_phones, word2ph, seg_norm_text, lang))
            # Extend in place instead of sum(list_of_lists, []), which
            # re-copies the accumulated list for every segment.
            phones.extend(seg_phones)
            norm_text_list.append(seg_norm_text)
        bert = torch.cat(bert_list, dim=1)
        norm_text = ' '.join(norm_text_list)
    else:
        # Fail with a clear message instead of hitting an unbound local at
        # the return statement.
        raise ValueError(f"unsupported language code: {language!r}")

    return phones,bert.to(dtype),norm_text
|
||||||
|
|
||||||
def get_bert_final(phones, word2ph, text,language,device):
    """Pick the BERT feature strategy that matches ``language``.

    Args:
        phones: Phoneme sequence of ``text``.
        word2ph: Word-to-phoneme counts of ``text``.
        text: The (normalised) text.
        language: Language code selecting the strategy.
        device: Device the resulting tensor is moved to in the ``all_zh``
            and fallback branches.

    Returns:
        The feature tensor: from ``get_bert_inf`` for ``"en"``, from
        ``nonen_get_bert_inf`` for ``"zh"``/``"ja"``/``"auto"``, from
        ``get_bert_feature`` for ``"all_zh"``, and an all-zero
        ``(1024, len(phones))`` tensor for anything else.
    """
    if language == "en":
        return get_bert_inf(phones, word2ph, text, language)
    if language in {"zh", "ja","auto"}:
        return nonen_get_bert_inf(text, language)
    if language == "all_zh":
        return get_bert_feature(text, word2ph).to(device)
    # Any other language gets a zero placeholder.
    return torch.zeros((1024, len(phones))).to(device)
|
|
||||||
|
|
||||||
def merge_short_text_in_array(texts, threshold):
|
def merge_short_text_in_array(texts, threshold):
|
||||||
if (len(texts)) < 2:
|
if (len(texts)) < 2:
|
||||||
@ -425,8 +366,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|||||||
texts = merge_short_text_in_array(texts, 5)
|
texts = merge_short_text_in_array(texts, 5)
|
||||||
audio_opt = []
|
audio_opt = []
|
||||||
if not ref_free:
|
if not ref_free:
|
||||||
phones1, word2ph1, norm_text1=get_cleaned_text_final(prompt_text, prompt_language)
|
phones1,bert1,norm_text1=get_phones_and_bert(prompt_text, prompt_language)
|
||||||
bert1=get_bert_final(phones1, word2ph1, norm_text1,prompt_language,device).to(dtype)
|
|
||||||
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
# 解决输入目标文本的空行导致报错的问题
|
# 解决输入目标文本的空行导致报错的问题
|
||||||
@ -434,8 +374,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
|
|||||||
continue
|
continue
|
||||||
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
if (text[-1] not in splits): text += "。" if text_language != "en" else "."
|
||||||
print(i18n("实际输入的目标文本(每句):"), text)
|
print(i18n("实际输入的目标文本(每句):"), text)
|
||||||
phones2, word2ph2, norm_text2 = get_cleaned_text_final(text, text_language)
|
phones2,bert2,norm_text2=get_phones_and_bert(text, text_language)
|
||||||
bert2 = get_bert_final(phones2, word2ph2, norm_text2, text_language, device).to(dtype)
|
|
||||||
if not ref_free:
|
if not ref_free:
|
||||||
bert = torch.cat([bert1, bert2], 1)
|
bert = torch.cat([bert1, bert2], 1)
|
||||||
all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
|
all_phoneme_ids = torch.LongTensor(phones1+phones2).to(device).unsqueeze(0)
|
||||||
|
@ -30,7 +30,7 @@ rep_map = {
|
|||||||
"\n": ".",
|
"\n": ".",
|
||||||
"·": ",",
|
"·": ",",
|
||||||
"、": ",",
|
"、": ",",
|
||||||
# "...": "…",
|
"...": "…",
|
||||||
"$": ".",
|
"$": ".",
|
||||||
"/": ",",
|
"/": ",",
|
||||||
"—": "-",
|
"—": "-",
|
||||||
|
@ -23,5 +23,5 @@ PyYAML
|
|||||||
psutil
|
psutil
|
||||||
jieba_fast
|
jieba_fast
|
||||||
jieba
|
jieba
|
||||||
LangSegment
|
LangSegment>=0.2.0
|
||||||
Faster_Whisper
|
Faster_Whisper
|
Loading…
x
Reference in New Issue
Block a user