修改了auto_cut

This commit is contained in:
XTer 2024-03-11 15:33:56 +08:00
parent 301b3c7f7c
commit 2531f6ff27
3 changed files with 74 additions and 36 deletions

View File

@ -1,7 +1,3 @@
import re
from typing import Callable
from tools.i18n.i18n import I18nAuto
@ -24,6 +20,8 @@ def register_method(name):
splits = {"", "", "", "", ",", ".", "?", "!", "~", ":", "", "", "", }
def split_big_text(text, max_len=510):
# 定义全角和半角标点符号
punctuation = "".join(splits)
@ -49,8 +47,6 @@ def split_big_text(text, max_len=510):
return result
def split(todo_text):
todo_text = todo_text.replace("……", "").replace("——", "")
if todo_text[-1] not in splits:
@ -69,6 +65,20 @@ def split(todo_text):
i_split_head += 1
return todo_texts
# contributed by XTer
# 简单的按长度切分,不希望出现超长的句子
def split_long_sentence(text, max_length=510):
opts = []
sentences = text.split('\n')
for sentence in sentences:
while len(sentence) > max_length:
part = sentence[:max_length]
opts.append(part)
sentence = sentence[max_length:]
if sentence:
opts.append(sentence)
return "\n".join(opts)
# 不切
@register_method("cut0")
@ -79,7 +89,7 @@ def cut0(inp):
# 凑四句一切
@register_method("cut1")
def cut1(inp):
inp = inp.strip("\n")
inp = split_long_sentence(inp).strip("\n")
inps = split(inp)
split_idx = list(range(0, len(inps), 4))
split_idx[-1] = None
@ -91,10 +101,11 @@ def cut1(inp):
opts = [inp]
return "\n".join(opts)
# 凑50字一切
@register_method("cut2")
def cut2(inp):
inp = inp.strip("\n")
def cut2(inp, max_length=50):
inp = split_long_sentence(inp).strip("\n")
inps = split(inp)
if len(inps) < 2:
return inp
@ -104,7 +115,7 @@ def cut2(inp):
for i in range(len(inps)):
summ += len(inps[i])
tmp_str += inps[i]
if summ > 50:
if summ > max_length:
summ = 0
opts.append(tmp_str)
tmp_str = ""
@ -116,16 +127,18 @@ def cut2(inp):
opts = opts[:-1]
return "\n".join(opts)
# 按中文句号。切
@register_method("cut3")
def cut3(inp):
inp = inp.strip("\n")
inp = split_long_sentence(inp).strip("\n")
return "\n".join(["%s" % item for item in inp.strip("").split("")])
#按英文句号.切
# 按英文句号.切
@register_method("cut4")
def cut4(inp):
inp = inp.strip("\n")
inp = split_long_sentence(inp).strip("\n")
return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
# 按标点符号切
@ -134,7 +147,7 @@ def cut4(inp):
def cut5(inp):
# if not re.search(r'[^\w\s]', inp[-1]):
# inp += '。'
inp = inp.strip("\n")
inp = split_long_sentence(inp).strip("\n")
punds = r'[,.;?!、,。?!;:…]'
items = re.split(f'({punds})', inp)
mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
@ -146,41 +159,65 @@ def cut5(inp):
# contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@register_method("auto_cut")
def auto_cut(inp):
def auto_cut(inp, max_length=60):
# if not re.search(r'[^\w\s]', inp[-1]):
# inp += '。'
inp = inp.strip("\n")
erase_punds = r'[“”"\'()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〛〞〟]'
inp = re.sub(erase_punds, '', inp)
split_punds = r'[?!。?!~]'
if inp[-1] not in split_punds:
inp+=""
items = re.split(f'({split_punds})', inp)
items = ["".join(group) for group in zip(items[::2], items[1::2])]
def process_commas(text):
def process_commas(text, max_length):
# Define separators and the regular expression for splitting
separators = ['', ',', '', '——', '']
count = 0
processed_text = ""
for char in text:
processed_text += char
if char in separators:
if count > 12:
processed_text += '\n'
count = 0
# 使用正则表达式的捕获组来保留分隔符,分隔符两边的括号就是所谓的捕获组
regex_pattern = '(' + '|'.join(map(re.escape, separators)) + ')'
# 使用re.split函数分割文本由于使用了捕获组分隔符也会作为分割结果的一部分返回
sentences = re.split(regex_pattern, text)
processed_text = ""
current_line = ""
final_sentences = []
for sentence in sentences:
if len(sentence)>max_length:
final_sentences+=split_long_sentence(sentence,max_length=max_length).split("\n")
else:
count += 1 # 对于非分隔符字符,增加计数
final_sentences.append(sentence)
for sentence in final_sentences:
# Add the length of the sentence plus one for the space or newline that will follow
if len(current_line) + len(sentence) <= max_length:
# If adding the next sentence does not exceed max length, add it to the current line
current_line += sentence
else:
# If the current line is too long, start a new line
processed_text += current_line.strip() + '\n'
current_line = sentence + " " # Start the new line with the current sentence
# Add any remaining text in current_line to processed_text
processed_text += current_line.strip()
return processed_text
final_items=[process_commas(item) for item in items]
final_items = [item for item in final_items if item.strip()]
final_items = []
for item in items:
final_items+=process_commas(item,max_length=max_length).split("\n")
final_items = [item for item in final_items if item.strip() and not (len(item.strip()) == 1 and item.strip() in "?!,。?!~")]
return "\n".join(final_items)
if __name__ == '__main__':
method = get_method("cut5")
print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。"))
method = get_method("cut0")
str1="""一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十
"""
print("|\n|".join(method(str1).split("\n")))

@ -1 +1 @@
Subproject commit 6c84aff6120967474229706a9d63e1a32e794552
Subproject commit ce9cfd552de45df058a14c6de4dc3c2cc2b62bd1

View File

@ -30,4 +30,5 @@ jieba_fast
jieba
LangSegment>=0.2.5
Faster_Whisper
pyaudio
pyaudio
flask_httpauth