remove duplicate spaces

This commit is contained in:
KamioRinn 2025-06-27 11:15:34 +08:00
parent ed96ffd752
commit 05bcf9c859
3 changed files with 187 additions and 187 deletions

View File

@ -121,70 +121,70 @@ class TextPreprocessor:
def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False): def get_phones_and_bert(self, text: str, language: str, version: str, final: bool = False):
with self.bert_lock: with self.bert_lock:
if language in {"all_zh", "all_yue", "all_ja", "all_ko", "zh", "ja", "ko", "yue", "en", "auto", "auto_yue"}: text = re.sub(r' {2,}', ' ', text)
textlist = [] textlist = []
langlist = [] langlist = []
if language == "all_zh": if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"): for tmp in LangSegmenter.getTexts(text,"zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
formattext = text
while " " in formattext:
formattext = formattext.replace(" ", " ")
langlist.append("en")
textlist.append(formattext)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) else:
elif language == "all_yue": # 因无法区别中日韩文汉字,以用户输入为准
for tmp in LangSegmenter.getTexts(text,"zh"): langlist.append(language)
if tmp["lang"] == "zh": textlist.append(tmp["text"])
tmp["lang"] = "yue" # print(textlist)
langlist.append(tmp["lang"]) # print(langlist)
textlist.append(tmp["text"]) phones_list = []
elif language == "all_ja": bert_list = []
for tmp in LangSegmenter.getTexts(text,"ja"): norm_text_list = []
langlist.append(tmp["lang"]) for i in range(len(textlist)):
textlist.append(tmp["text"]) lang = langlist[i]
elif language == "all_ko": phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
for tmp in LangSegmenter.getTexts(text,"ko"): bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
langlist.append(tmp["lang"]) phones_list.append(phones)
textlist.append(tmp["text"]) norm_text_list.append(norm_text)
elif language == "en": bert_list.append(bert)
formattext = text bert = torch.cat(bert_list, dim=1)
while " " in formattext: phones = sum(phones_list, [])
formattext = formattext.replace(" ", " ") norm_text = "".join(norm_text_list)
langlist.append("en")
textlist.append(formattext)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"])
else:
# 因无法区别中日韩文汉字,以用户输入为准
langlist.append(language)
textlist.append(tmp["text"])
# print(textlist)
# print(langlist)
phones_list = []
bert_list = []
norm_text_list = []
for i in range(len(textlist)):
lang = langlist[i]
phones, word2ph, norm_text = self.clean_text_inf(textlist[i], lang, version)
bert = self.get_bert_inf(phones, word2ph, norm_text, lang)
phones_list.append(phones)
norm_text_list.append(norm_text)
bert_list.append(bert)
bert = torch.cat(bert_list, dim=1)
phones = sum(phones_list, [])
norm_text = "".join(norm_text_list)
if not final and len(phones) < 6: if not final and len(phones) < 6:
return self.get_phones_and_bert("." + text, language, version, final=True) return self.get_phones_and_bert("." + text, language, version, final=True)

View File

@ -586,70 +586,70 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False): def get_phones_and_bert(text, language, version, final=False):
if language in {"all_zh", "all_yue", "all_ja", "all_ko", "zh", "ja", "ko", "yue", "en", "auto", "auto_yue"}: text = re.sub(r' {2,}', ' ', text)
textlist = [] textlist = []
langlist = [] langlist = []
if language == "all_zh": if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"): for tmp in LangSegmenter.getTexts(text,"zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
formattext = text
while " " in formattext:
formattext = formattext.replace(" ", " ")
langlist.append("en")
textlist.append(formattext)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) else:
elif language == "all_yue": # 因无法区别中日韩文汉字,以用户输入为准
for tmp in LangSegmenter.getTexts(text,"zh"): langlist.append(language)
if tmp["lang"] == "zh": textlist.append(tmp["text"])
tmp["lang"] = "yue" print(textlist)
langlist.append(tmp["lang"]) print(langlist)
textlist.append(tmp["text"]) phones_list = []
elif language == "all_ja": bert_list = []
for tmp in LangSegmenter.getTexts(text,"ja"): norm_text_list = []
langlist.append(tmp["lang"]) for i in range(len(textlist)):
textlist.append(tmp["text"]) lang = langlist[i]
elif language == "all_ko": phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
for tmp in LangSegmenter.getTexts(text,"ko"): bert = get_bert_inf(phones, word2ph, norm_text, lang)
langlist.append(tmp["lang"]) phones_list.append(phones)
textlist.append(tmp["text"]) norm_text_list.append(norm_text)
elif language == "en": bert_list.append(bert)
formattext = text bert = torch.cat(bert_list, dim=1)
while " " in formattext: phones = sum(phones_list, [])
formattext = formattext.replace(" ", " ") norm_text = "".join(norm_text_list)
langlist.append("en")
textlist.append(formattext)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"])
else:
# 因无法区别中日韩文汉字,以用户输入为准
langlist.append(language)
textlist.append(tmp["text"])
print(textlist)
print(langlist)
phones_list = []
bert_list = []
norm_text_list = []
for i in range(len(textlist)):
lang = langlist[i]
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
bert = get_bert_inf(phones, word2ph, norm_text, lang)
phones_list.append(phones)
norm_text_list.append(norm_text)
bert_list.append(bert)
bert = torch.cat(bert_list, dim=1)
phones = sum(phones_list, [])
norm_text = "".join(norm_text_list)
if not final and len(phones) < 6: if not final and len(phones) < 6:
return get_phones_and_bert("." + text, language, version, final=True) return get_phones_and_bert("." + text, language, version, final=True)

122
api.py
View File

@ -532,68 +532,68 @@ from text import chinese
def get_phones_and_bert(text, language, version, final=False): def get_phones_and_bert(text, language, version, final=False):
if language in {"all_zh", "all_yue", "all_ja", "all_ko", "zh", "ja", "ko", "yue", "en", "auto", "auto_yue"}: text = re.sub(r' {2,}', ' ', text)
textlist = [] textlist = []
langlist = [] langlist = []
if language == "all_zh": if language == "all_zh":
for tmp in LangSegmenter.getTexts(text,"zh"): for tmp in LangSegmenter.getTexts(text,"zh"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_yue":
for tmp in LangSegmenter.getTexts(text,"zh"):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ja":
for tmp in LangSegmenter.getTexts(text,"ja"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "all_ko":
for tmp in LangSegmenter.getTexts(text,"ko"):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "en":
formattext = text
while " " in formattext:
formattext = formattext.replace(" ", " ")
langlist.append("en")
textlist.append(formattext)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"]) langlist.append(tmp["lang"])
textlist.append(tmp["text"]) else:
elif language == "all_yue": # 因无法区别中日韩文汉字,以用户输入为准
for tmp in LangSegmenter.getTexts(text,"zh"): langlist.append(language)
if tmp["lang"] == "zh": textlist.append(tmp["text"])
tmp["lang"] = "yue" phones_list = []
langlist.append(tmp["lang"]) bert_list = []
textlist.append(tmp["text"]) norm_text_list = []
elif language == "all_ja": for i in range(len(textlist)):
for tmp in LangSegmenter.getTexts(text,"ja"): lang = langlist[i]
langlist.append(tmp["lang"]) phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
textlist.append(tmp["text"]) bert = get_bert_inf(phones, word2ph, norm_text, lang)
elif language == "all_ko": phones_list.append(phones)
for tmp in LangSegmenter.getTexts(text,"ko"): norm_text_list.append(norm_text)
langlist.append(tmp["lang"]) bert_list.append(bert)
textlist.append(tmp["text"]) bert = torch.cat(bert_list, dim=1)
elif language == "en": phones = sum(phones_list, [])
formattext = text norm_text = "".join(norm_text_list)
while " " in formattext:
formattext = formattext.replace(" ", " ")
langlist.append("en")
textlist.append(formattext)
elif language == "auto":
for tmp in LangSegmenter.getTexts(text):
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
elif language == "auto_yue":
for tmp in LangSegmenter.getTexts(text):
if tmp["lang"] == "zh":
tmp["lang"] = "yue"
langlist.append(tmp["lang"])
textlist.append(tmp["text"])
else:
for tmp in LangSegmenter.getTexts(text):
if langlist:
if (tmp["lang"] == "en" and langlist[-1] == "en") or (tmp["lang"] != "en" and langlist[-1] != "en"):
textlist[-1] += tmp["text"]
continue
if tmp["lang"] == "en":
langlist.append(tmp["lang"])
else:
# 因无法区别中日韩文汉字,以用户输入为准
langlist.append(language)
textlist.append(tmp["text"])
phones_list = []
bert_list = []
norm_text_list = []
for i in range(len(textlist)):
lang = langlist[i]
phones, word2ph, norm_text = clean_text_inf(textlist[i], lang, version)
bert = get_bert_inf(phones, word2ph, norm_text, lang)
phones_list.append(phones)
norm_text_list.append(norm_text)
bert_list.append(bert)
bert = torch.cat(bert_list, dim=1)
phones = sum(phones_list, [])
norm_text = "".join(norm_text_list)
if not final and len(phones) < 6: if not final and len(phones) < 6:
return get_phones_and_bert("." + text, language, version, final=True) return get_phones_and_bert("." + text, language, version, final=True)