GPT-SoVITS/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py

199 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from typing import Callable
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
METHODS = dict()
def get_method(name:str)->Callable:
method = METHODS.get(name, None)
if method is None:
raise ValueError(f"Method {name} not found")
return method
def register_method(name):
def decorator(func):
METHODS[name] = func
return func
return decorator
splits = {"", "", "", "", ",", ".", "?", "!", "~", ":", "", "", "", }
# contributed by XTer
# 简单的按长度切分,不希望出现超长的句子
def split_big_text(text, max_length=510):
opts = []
sentences = text.split('\n')
for sentence in sentences:
while len(sentence) > max_length:
part = sentence[:max_length]
opts.append(part)
sentence = sentence[max_length:]
if sentence:
opts.append(sentence)
return "\n".join(opts)
def split(todo_text):
todo_text = todo_text.replace("……", "").replace("——", "")
if todo_text[-1] not in splits:
todo_text += ""
i_split_head = i_split_tail = 0
len_text = len(todo_text)
todo_texts = []
while 1:
if i_split_head >= len_text:
break # 结尾一定有标点,所以直接跳出即可,最后一段在上次已加入
if todo_text[i_split_head] in splits:
i_split_head += 1
todo_texts.append(todo_text[i_split_tail:i_split_head])
i_split_tail = i_split_head
else:
i_split_head += 1
return todo_texts
# 不切
@register_method("cut0")
def cut0(inp):
return inp
# 凑四句一切
@register_method("cut1")
def cut1(inp):
inp = split_big_text(inp).strip("\n")
inps = split(inp)
split_idx = list(range(0, len(inps), 4))
split_idx[-1] = None
if len(split_idx) > 1:
opts = []
for idx in range(len(split_idx) - 1):
opts.append("".join(inps[split_idx[idx]: split_idx[idx + 1]]))
else:
opts = [inp]
return "\n".join(opts)
# 凑50字一切
@register_method("cut2")
def cut2(inp, max_length=50):
inp = split_big_text(inp).strip("\n")
inps = split(inp)
if len(inps) < 2:
return inp
opts = []
summ = 0
tmp_str = ""
for i in range(len(inps)):
summ += len(inps[i])
tmp_str += inps[i]
if summ > max_length:
summ = 0
opts.append(tmp_str)
tmp_str = ""
if tmp_str != "":
opts.append(tmp_str)
# print(opts)
if len(opts) > 1 and len(opts[-1]) < 50: ##如果最后一个太短了,和前一个合一起
opts[-2] = opts[-2] + opts[-1]
opts = opts[:-1]
return "\n".join(opts)
# 按中文句号。切
@register_method("cut3")
def cut3(inp):
inp = split_big_text(inp).strip("\n")
return "\n".join(["%s" % item for item in inp.strip("").split("")])
# 按英文句号.切
@register_method("cut4")
def cut4(inp):
inp = split_big_text(inp).strip("\n")
return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
# 按标点符号切
# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
@register_method("cut5")
def cut5(inp):
# if not re.search(r'[^\w\s]', inp[-1]):
# inp += '。'
inp = split_big_text(inp).strip("\n")
punds = r'[,.;?!、,。?!;:…]'
items = re.split(f'({punds})', inp)
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
# 在句子不存在符号或句尾无符号的时候保证文本完整
if len(items)%2 == 1:
mergeitems.append(items[-1])
opt = "\n".join(mergeitems)
return opt
# contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@register_method("auto_cut")
def auto_cut(inp, max_length=60):
# if not re.search(r'[^\w\s]', inp[-1]):
# inp += '。'
inp = inp.strip("\n")
erase_punds = r'[“”"\'()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〛〞〟]'
inp = re.sub(erase_punds, '', inp)
split_punds = r'[?!。?!~]'
if inp[-1] not in split_punds:
inp+=""
items = re.split(f'({split_punds})', inp)
items = ["".join(group) for group in zip(items[::2], items[1::2])]
def process_commas(text, max_length):
# Define separators and the regular expression for splitting
separators = ['', ',', '', '——', '']
# 使用正则表达式的捕获组来保留分隔符,分隔符两边的括号就是所谓的捕获组
regex_pattern = '(' + '|'.join(map(re.escape, separators)) + ')'
# 使用re.split函数分割文本由于使用了捕获组分隔符也会作为分割结果的一部分返回
sentences = re.split(regex_pattern, text)
processed_text = ""
current_line = ""
final_sentences = []
for sentence in sentences:
if len(sentence)>max_length:
final_sentences+=split_big_text(sentence,max_length=max_length).split("\n")
else:
final_sentences.append(sentence)
for sentence in final_sentences:
# Add the length of the sentence plus one for the space or newline that will follow
if len(current_line) + len(sentence) <= max_length:
# If adding the next sentence does not exceed max length, add it to the current line
current_line += sentence
else:
# If the current line is too long, start a new line
processed_text += current_line.strip() + '\n'
current_line = sentence + " " # Start the new line with the current sentence
# Add any remaining text in current_line to processed_text
processed_text += current_line.strip()
return processed_text
final_items = []
for item in items:
final_items+=process_commas(item,max_length=max_length).split("\n")
final_items = [item for item in final_items if item.strip() and not (len(item.strip()) == 1 and item.strip() in "?!,。?!~")]
return "\n".join(final_items)
if __name__ == '__main__':
method = get_method("cut0")
str1="""一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十
"""
print("|\n|".join(method(str1).split("\n")))