mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-04-05 12:38:35 +08:00
Fix normalize (#1404)
This commit is contained in:
parent
21e63c4c3e
commit
9f8f365b93
@ -311,14 +311,14 @@ def get_phones_and_bert(text,language,version):
|
||||
if language == "zh":
|
||||
if re.search(r'[A-Za-z]', formattext):
|
||||
formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
|
||||
formattext = chinese.text_normalize(formattext)
|
||||
formattext = chinese.mix_text_normalize(formattext)
|
||||
return get_phones_and_bert(formattext,"zh",version)
|
||||
else:
|
||||
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
|
||||
bert = get_bert_feature(norm_text, word2ph).to(device)
|
||||
elif language == "yue" and re.search(r'[A-Za-z]', formattext):
|
||||
formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
|
||||
formattext = chinese.text_normalize(formattext)
|
||||
formattext = chinese.mix_text_normalize(formattext)
|
||||
return get_phones_and_bert(formattext,"yue",version)
|
||||
else:
|
||||
phones, word2ph, norm_text = clean_text_inf(formattext, language, version)
|
||||
|
@ -47,6 +47,19 @@ def replace_punctuation(text):
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
|
||||
return replaced_text
|
||||
|
||||
|
||||
def replace_punctuation_with_en(text):
|
||||
text = text.replace("嗯", "恩").replace("呣", "母")
|
||||
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
@ -171,6 +184,20 @@ def text_normalize(text):
|
||||
return dest_text
|
||||
|
||||
|
||||
# 不排除英文的文本格式化
|
||||
def mix_text_normalize(text):
|
||||
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
|
||||
tx = TextNormalizer()
|
||||
sentences = tx.normalize(text)
|
||||
dest_text = ""
|
||||
for sentence in sentences:
|
||||
dest_text += replace_punctuation_with_en(sentence)
|
||||
|
||||
# 避免重复标点引起的参考泄露
|
||||
dest_text = replace_consecutive_punctuation(dest_text)
|
||||
return dest_text
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
|
||||
text = "呣呣呣~就是…大人的鼹鼠党吧?"
|
||||
|
@ -60,6 +60,26 @@ def replace_punctuation(text):
|
||||
return replaced_text
|
||||
|
||||
|
||||
def replace_punctuation_with_en(text):
|
||||
text = text.replace("嗯", "恩").replace("呣", "母")
|
||||
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
|
||||
return replaced_text
|
||||
|
||||
|
||||
def replace_consecutive_punctuation(text):
|
||||
punctuations = ''.join(re.escape(p) for p in punctuation)
|
||||
pattern = f'([{punctuations}])([{punctuations}])+'
|
||||
result = re.sub(pattern, r'\1', text)
|
||||
return result
|
||||
|
||||
|
||||
def g2p(text):
|
||||
pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
|
||||
sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
|
||||
@ -171,6 +191,23 @@ def text_normalize(text):
|
||||
dest_text = ""
|
||||
for sentence in sentences:
|
||||
dest_text += replace_punctuation(sentence)
|
||||
|
||||
# 避免重复标点引起的参考泄露
|
||||
dest_text = replace_consecutive_punctuation(dest_text)
|
||||
return dest_text
|
||||
|
||||
|
||||
# 不排除英文的文本格式化
|
||||
def mix_text_normalize(text):
|
||||
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
|
||||
tx = TextNormalizer()
|
||||
sentences = tx.normalize(text)
|
||||
dest_text = ""
|
||||
for sentence in sentences:
|
||||
dest_text += replace_punctuation_with_en(sentence)
|
||||
|
||||
# 避免重复标点引起的参考泄露
|
||||
dest_text = replace_consecutive_punctuation(dest_text)
|
||||
return dest_text
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user