Improved the word counting and splitting method: it now counts English words plus individual characters for other languages

XTer 2024-03-15 03:33:04 +08:00
parent 8ec543f00f
commit 3158199caa
5 changed files with 90 additions and 20 deletions
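The counting rule this commit introduces treats every whitespace-delimited run of ASCII characters as one English word and every non-ASCII character as one unit of its own. A minimal standalone sketch of that rule, for orientation only (the real implementation, count_words_multilang, appears in the diff below):

def demo_count(text):
    count, in_word = 0, False
    for ch in text:
        if ch.isspace():
            in_word = False
        elif ch.isascii() and not in_word:
            count += 1          # start of a new English word
            in_word = True
        elif not ch.isascii():
            count += 1          # each non-ASCII character is one unit
    return count

assert demo_count("Hello world") == 2   # two English words
assert demo_count("你好") == 2           # two Chinese characters
assert demo_count("Hello 世界") == 3     # one word + two characters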


@@ -70,14 +70,27 @@ class TextPreprocessor:
        return result

    def pre_seg_text(self, text:str, lang:str, text_split_method:str):
        text = text.strip("\n")
        if (text[0] not in splits and len(get_first(text)) < 4):
            text = "。" + text if lang != "en" else "." + text
        print(i18n("实际输入的目标文本:"))
        print(text)
        seg_method = get_seg_method(text_split_method)
        text = seg_method(text)
        if text_split_method.startswith("auto_cut"):
            try:
                max_word_count = int(text_split_method.split("_")[-1])
            except:
                max_word_count = 20
            if max_word_count < 5 or max_word_count > 1000:
                max_word_count = 20
            text_split_method = "auto_cut"
            seg_method = get_seg_method(text_split_method)
            text = seg_method(text, max_word_count)
        else:
            seg_method = get_seg_method(text_split_method)
            text = seg_method(text)
        while "\n\n" in text:
            text = text.replace("\n\n", "\n")
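For reference, the suffix parsing above lets the method name carry its own limit: "auto_cut_50" yields max_word_count=50, while a missing or out-of-range suffix falls back to 20. A small illustrative trace (not part of the commit):

for name in ("auto_cut_50", "auto_cut", "auto_cut_9999"):
    try:
        n = int(name.split("_")[-1])
    except ValueError:
        n = 20
    if n < 5 or n > 1000:
        n = 20
    print(name, "->", n)  # auto_cut_50 -> 50, auto_cut -> 20, auto_cut_9999 -> 20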


@@ -65,6 +65,25 @@ def split(todo_text):
            i_split_head += 1
    return todo_texts

def cut_sentence_multilang(text, max_length=30):
    # Initialize the counters
    word_count = 0
    in_word = False

    for index, char in enumerate(text):
        if char.isspace():  # the current character is whitespace
            in_word = False
        elif char.isascii() and not in_word:  # an ASCII (English) character, and not currently inside a word
            word_count += 1  # a new English word begins
            in_word = True
        elif not char.isascii():  # a non-English character
            word_count += 1  # each non-English character counts as one unit
        if word_count > max_length:
            return text[:index], text[index:]

    return text, ""
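As a usage illustration (behavior traced from the code above, not output shipped with the commit): the cut lands on the first character that pushes the running count past max_length, and the remainder is returned for further cutting.

head, rest = cut_sentence_multilang("一二三四五 six seven", max_length=4)
print(head)  # 一二三四
print(rest)  # 五 six seven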
# contributed by XTer
# Simple length-based splitting, to avoid overly long sentences
def split_long_sentence(text, max_length=510):
@@ -72,12 +91,11 @@ def split_long_sentence(text, max_length=510):
    opts = []
    sentences = text.split('\n')
    for sentence in sentences:
        while len(sentence) > max_length:
            part = sentence[:max_length]
            opts.append(part)
            sentence = sentence[max_length:]
        if sentence:
            opts.append(sentence)
        prev_text, sentence = cut_sentence_multilang(sentence, max_length)
        while sentence.strip() != "":
            opts.append(prev_text)
            prev_text, sentence = cut_sentence_multilang(sentence, max_length)
        opts.append(prev_text)
    return "\n".join(opts)
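The rewritten loop above keeps feeding the remainder back into cut_sentence_multilang until nothing is left, so chunks are bounded by the mixed-language unit count rather than by raw string length. An illustrative trace (assumed, not from the commit):

print(split_long_sentence("一二三四五六七", max_length=3))
# 一二三
# 四五六
# 七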
# No splitting
@@ -89,7 +107,7 @@ def cut0(inp):
# Cut once every four sentences
@register_method("cut1")
def cut1(inp):
    inp = split_long_sentence(inp).strip("\n")
    inp = inp.strip("\n")
    inps = split(inp)
    split_idx = list(range(0, len(inps), 4))
    split_idx[-1] = None
@@ -138,7 +156,7 @@ def cut3(inp):
# Cut on the English period "."
@register_method("cut4")
def cut4(inp):
    inp = split_long_sentence(inp).strip("\n")
    inp = inp.strip("\n")
    return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
# Cut on punctuation marks
@@ -147,7 +165,7 @@ def cut4(inp):
def cut5(inp):
    # if not re.search(r'[^\w\s]', inp[-1]):
    #     inp += '。'
    inp = split_long_sentence(inp).strip("\n")
    inp = inp.strip("\n")
    punds = r'[,.;?!、,。?!;:…]'
    items = re.split(f'({punds})', inp)
    mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
@@ -157,12 +175,30 @@ def cut5(inp):
    opt = "\n".join(mergeitems)
    return opt

def count_words_multilang(text):
    # Initialize the counters
    word_count = 0
    in_word = False

    for char in text:
        if char.isspace():  # the current character is whitespace
            in_word = False
        elif char.isascii() and not in_word:  # an ASCII (English) character, and not currently inside a word
            word_count += 1  # a new English word begins
            in_word = True
        elif not char.isascii():  # a non-English character
            word_count += 1  # each non-English character counts as one unit

    return word_count
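This counter is what the hunks below substitute for len(): a CJK character and a whole English word each count as one unit, instead of comparing raw character lengths. For example (illustrative):

s = "Hello 世界"
print(len(s))                    # 8 raw characters
print(count_words_multilang(s))  # 3: one English word + two CJK characters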
# contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@register_method("auto_cut")
def auto_cut(inp, max_length=30):
    # if not re.search(r'[^\w\s]', inp[-1]):
    #     inp += '。'
    inp = inp.strip("\n")
    inp = inp.replace(". ", "")
    erase_punds = r'[“”"\'()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〝〞〟]'
    inp = re.sub(erase_punds, '', inp)
    split_punds = r'[?!。?!~]'
@@ -186,7 +222,7 @@ def auto_cut(inp, max_length=30):
    final_sentences = []
    for sentence in sentences:
        if len(sentence)>max_length:
        if count_words_multilang(sentence)>max_length:
            final_sentences+=split_long_sentence(sentence,max_length=max_length).split("\n")
        else:
@@ -194,7 +230,7 @@ def auto_cut(inp, max_length=30):
    for sentence in final_sentences:
        # Add the length of the sentence plus one for the space or newline that will follow
        if len(current_line) + len(sentence) <= max_length:
        if count_words_multilang(current_line + sentence) <= max_length:
            # If adding the next sentence does not exceed max length, add it to the current line
            current_line += sentence
        else:
@@ -217,7 +253,6 @@ def auto_cut(inp, max_length=30):
if __name__ == '__main__':
    method = get_method("cut1")
str1="""一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十
"""
    print("|\n|".join(method(str1).split("\n")))

    str1 = """我 有i一个j k 1"""
    print(count_words_multilang(str1))
    print(cut_sentence_multilang(str1, 20))

@@ -1 +1 @@
Subproject commit 7f64553a0be374798c827b7b429f8dc935bda2ab
Subproject commit 7b7d0d8ee066b3664cd8a32fc0f10974058364ff


@@ -31,7 +31,7 @@ for lang_file in languages:
    # Add any missing keys to the language file
    for key in diff:
        lang_data[key] = key
        lang_data[key] = standard_data[key]
    # Delete any extra keys from the language file
    for key in miss:


@@ -14,6 +14,23 @@ locale_path = "./Inference/i18n/locale"
scan_list = ["./Inference/"] # The path to the directory you want to scan, you can change it to your own path
scan_subfolders = True

special_words_to_keep = {
    "auto": "自动判断",
    "zh": "中文",
    "en": "英文",
    "ja": "日文",
    "all_zh": "只有中文",
    "all_ja": "只有日文",
    "auto_cut": "智能切分",
    "cut0": "仅凭换行切分",
    "cut1": "凑四句一切",
    "cut2": "凑50字一切",
    "cut3": "按中文句号。切",
    "cut4": "按英文句号.切",
    "cut5": "按标点符号切",
}
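These entries pin the Chinese display strings for option keys such as the cut-method names, so the scanner does not overwrite them with the bare key. Roughly (illustrative, mirroring the loop further down):

for s in ("auto_cut", "some other UI string"):
    print(s, "->", special_words_to_keep.get(s, s))
# auto_cut -> 智能切分
# some other UI string -> some other UI string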
def extract_i18n_strings(node):
    i18n_strings = []
@@ -87,10 +104,15 @@ missing_keys = code_keys - standard_keys
print("Missing keys:", len(missing_keys))
for missing_key in missing_keys:
    print("\t", missing_key)

code_keys_dict = OrderedDict()
for s in strings:
    code_keys_dict[s] = s
    if s in special_words_to_keep:
        code_keys_dict[s] = special_words_to_keep[s]
    else:
        code_keys_dict[s] = s

# write back
with open(standard_file, "w", encoding="utf-8") as f: