diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py index 39fb04d0..9a92f659 100644 --- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py +++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py @@ -1,7 +1,3 @@ - - - - import re from typing import Callable from tools.i18n.i18n import I18nAuto @@ -24,6 +20,8 @@ def register_method(name): splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…", } + + def split_big_text(text, max_len=510): # 定义全角和半角标点符号 punctuation = "".join(splits) @@ -49,8 +47,6 @@ def split_big_text(text, max_len=510): return result - - def split(todo_text): todo_text = todo_text.replace("……", "。").replace("——", ",") if todo_text[-1] not in splits: @@ -69,6 +65,20 @@ def split(todo_text): i_split_head += 1 return todo_texts +# contributed by XTer +# 简单的按长度切分,不希望出现超长的句子 +def split_long_sentence(text, max_length=510): + + opts = [] + sentences = text.split('\n') + for sentence in sentences: + while len(sentence) > max_length: + part = sentence[:max_length] + opts.append(part) + sentence = sentence[max_length:] + if sentence: + opts.append(sentence) + return "\n".join(opts) # 不切 @register_method("cut0") @@ -79,7 +89,7 @@ def cut0(inp): # 凑四句一切 @register_method("cut1") def cut1(inp): - inp = inp.strip("\n") + inp = split_long_sentence(inp).strip("\n") inps = split(inp) split_idx = list(range(0, len(inps), 4)) split_idx[-1] = None @@ -91,10 +101,11 @@ def cut1(inp): opts = [inp] return "\n".join(opts) + # 凑50字一切 @register_method("cut2") -def cut2(inp): - inp = inp.strip("\n") +def cut2(inp, max_length=50): + inp = split_long_sentence(inp).strip("\n") inps = split(inp) if len(inps) < 2: return inp @@ -104,7 +115,7 @@ def cut2(inp): for i in range(len(inps)): summ += len(inps[i]) tmp_str += inps[i] - if summ > 50: + if summ > max_length: summ = 0 opts.append(tmp_str) tmp_str = "" @@ -116,16 +127,18 @@ def cut2(inp): opts = opts[:-1] return "\n".join(opts) + # 按中文句号。切 @register_method("cut3") def cut3(inp): - inp = inp.strip("\n") + inp = split_long_sentence(inp).strip("\n") return "\n".join(["%s" % item for item in inp.strip("。").split("。")]) -#按英文句号.切 + +# 按英文句号.切 @register_method("cut4") def cut4(inp): - inp = inp.strip("\n") + inp = split_long_sentence(inp).strip("\n") return "\n".join(["%s" % item for item in inp.strip(".").split(".")]) # 按标点符号切 @@ -134,7 +147,7 @@ def cut4(inp): def cut5(inp): # if not re.search(r'[^\w\s]', inp[-1]): # inp += '。' - inp = inp.strip("\n") + inp = split_long_sentence(inp).strip("\n") punds = r'[,.;?!、,。?!;:…]' items = re.split(f'({punds})', inp) mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])] @@ -146,41 +159,65 @@ def cut5(inp): # contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py @register_method("auto_cut") -def auto_cut(inp): +def auto_cut(inp, max_length=60): # if not re.search(r'[^\w\s]', inp[-1]): # inp += '。' inp = inp.strip("\n") - + erase_punds = r'[“”"‘’\'()()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〛〞〟]' + inp = re.sub(erase_punds, '', inp) split_punds = r'[?!。?!~:]' if inp[-1] not in split_punds: inp+="。" items = re.split(f'({split_punds})', inp) items = ["".join(group) for group in zip(items[::2], items[1::2])] - def process_commas(text): - + def process_commas(text, max_length): + + # Define separators and the regular expression for splitting separators = [',', ',', '、', '——', '…'] - count = 0 - processed_text = "" - for char in text: - processed_text += char - if char in separators: - if count > 12: - processed_text += '\n' - count = 0 + # 使用正则表达式的捕获组来保留分隔符,分隔符两边的括号就是所谓的捕获组 + regex_pattern = '(' + '|'.join(map(re.escape, separators)) + ')' + # 使用re.split函数分割文本,由于使用了捕获组,分隔符也会作为分割结果的一部分返回 + sentences = re.split(regex_pattern, text) + + processed_text = "" + current_line = "" + + final_sentences = [] + + for sentence in sentences: + if len(sentence)>max_length: + + final_sentences+=split_long_sentence(sentence,max_length=max_length).split("\n") else: - count += 1 # 对于非分隔符字符,增加计数 + final_sentences.append(sentence) + + for sentence in final_sentences: + # Add the length of the sentence plus one for the space or newline that will follow + if len(current_line) + len(sentence) <= max_length: + # If adding the next sentence does not exceed max length, add it to the current line + current_line += sentence + else: + # If the current line is too long, start a new line + processed_text += current_line.strip() + '\n' + current_line = sentence + " " # Start the new line with the current sentence + + # Add any remaining text in current_line to processed_text + processed_text += current_line.strip() + return processed_text - final_items=[process_commas(item) for item in items] - final_items = [item for item in final_items if item.strip()] + final_items = [] + for item in items: + final_items+=process_commas(item,max_length=max_length).split("\n") + + final_items = [item for item in final_items if item.strip() and not (len(item.strip()) == 1 and item.strip() in "?!,,。?!~:")] return "\n".join(final_items) - - if __name__ == '__main__': - method = get_method("cut5") - print(method("你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。")) - \ No newline at end of file + method = get_method("cut0") + str1="""一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十 + """ + print("|\n|".join(method(str1).split("\n"))) diff --git a/Inference b/Inference index 6c84aff6..ce9cfd55 160000 --- a/Inference +++ b/Inference @@ -1 +1 @@ -Subproject commit 6c84aff6120967474229706a9d63e1a32e794552 +Subproject commit ce9cfd552de45df058a14c6de4dc3c2cc2b62bd1 diff --git a/requirements.txt b/requirements.txt index 76c873c9..a158f2b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,4 +30,5 @@ jieba_fast jieba LangSegment>=0.2.5 Faster_Whisper -pyaudio \ No newline at end of file +pyaudio +flask_httpauth \ No newline at end of file