From 3158199caaab3adeab4597907b14913e91e3f46b Mon Sep 17 00:00:00 2001
From: XTer
Date: Fri, 15 Mar 2024 03:33:04 +0800
Subject: [PATCH] Improve word counting and splitting: now counts English
 words plus one count per character for other languages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Note for reviewers: standalone sketches of the new counting, splitting, and
i18n behaviour are appended after the patch.

 GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 17 ++++-
 .../text_segmentation_method.py               | 65 ++++++++++++++-----
 Inference                                     |  2 +-
 tools/i18n/locale_diff.py                     |  2 +-
 tools/i18n/scan_i18n.py                       | 24 ++++++-
 5 files changed, 90 insertions(+), 20 deletions(-)

diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
index 58b2678c..cebbda09 100644
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -70,14 +70,27 @@ class TextPreprocessor:
         return result
 
     def pre_seg_text(self, text:str, lang:str, text_split_method:str):
+
+        text = text.strip("\n")
         if (text[0] not in splits and len(get_first(text)) < 4):
             text = "。" + text if lang != "en" else "." + text
         print(i18n("实际输入的目标文本:"))
         print(text)
 
-        seg_method = get_seg_method(text_split_method)
-        text = seg_method(text)
+        if text_split_method.startswith("auto_cut"):
+            try:
+                max_word_count = int(text_split_method.split("_")[-1])
+            except ValueError:
+                max_word_count = 20
+            if max_word_count < 5 or max_word_count > 1000:
+                max_word_count = 20
+            text_split_method = "auto_cut"
+            seg_method = get_seg_method(text_split_method)
+            text = seg_method(text, max_word_count)
+        else:
+            seg_method = get_seg_method(text_split_method)
+            text = seg_method(text)
 
         while "\n\n" in text:
             text = text.replace("\n\n", "\n")
diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
index f8211663..f6b871cb 100644
--- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
+++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@@ -65,6 +65,25 @@ def split(todo_text):
         i_split_head += 1
     return todo_texts
 
+def cut_sentence_multilang(text, max_length=30):
+    # Initialize the counters
+    word_count = 0
+    in_word = False
+
+
+    for index, char in enumerate(text):
+        if char.isspace():  # whitespace ends the current word
+            in_word = False
+        elif char.isascii() and not in_word:  # an ASCII (English) character outside a word
+            word_count += 1  # a new English word begins
+            in_word = True
+        elif not char.isascii():  # a non-English character
+            word_count += 1  # each non-English character counts as one word
+        if word_count > max_length:
+            return text[:index], text[index:]
+
+    return text, ""
+
 # contributed by XTer
 # Simple length-based split, to avoid overly long sentences
 def split_long_sentence(text, max_length=510):
@@ -72,12 +91,11 @@ def split_long_sentence(text, max_length=510):
     opts = []
     sentences = text.split('\n')
     for sentence in sentences:
-        while len(sentence) > max_length:
-            part = sentence[:max_length]
-            opts.append(part)
-            sentence = sentence[max_length:]
-        if sentence:
-            opts.append(sentence)
+        prev_text, sentence = cut_sentence_multilang(sentence, max_length)
+        while sentence.strip() != "":
+            opts.append(prev_text)
+            prev_text, sentence = cut_sentence_multilang(sentence, max_length)
+        opts.append(prev_text)
     return "\n".join(opts)
 
 # No split
@@ -89,7 +107,7 @@ def cut0(inp):
 # Batch every four sentences per split
 @register_method("cut1")
 def cut1(inp):
-    inp = split_long_sentence(inp).strip("\n")
+    inp = inp.strip("\n")
     inps = split(inp)
     split_idx = list(range(0, len(inps), 4))
     split_idx[-1] = None
@@ -138,7 +156,7 @@ def cut3(inp):
 
 # Split on the English period "."
 @register_method("cut4")
 def cut4(inp):
-    inp = split_long_sentence(inp).strip("\n")
+    inp = inp.strip("\n")
     return "\n".join(["%s" % item for item in inp.strip(".").split(".")])
 
 # Split on punctuation marks
@@ -147,7 +165,7 @@ def cut4(inp):
 def cut5(inp):
     # if not re.search(r'[^\w\s]', inp[-1]):
     #     inp += '。'
-    inp = split_long_sentence(inp).strip("\n")
+    inp = inp.strip("\n")
     punds = r'[,.;?!、,。?!;:…]'
     items = re.split(f'({punds})', inp)
     mergeitems = ["".join(group) for group in zip(items[::2], items[1::2])]
@@ -157,12 +175,30 @@ def cut5(inp):
     opt = "\n".join(mergeitems)
     return opt
 
+def count_words_multilang(text):
+    # Initialize the counters
+    word_count = 0
+    in_word = False
+
+    for char in text:
+        if char.isspace():  # whitespace ends the current word
+            in_word = False
+        elif char.isascii() and not in_word:  # an ASCII (English) character outside a word
+            word_count += 1  # a new English word begins
+            in_word = True
+        elif not char.isascii():  # a non-English character
+            word_count += 1  # each non-English character counts as one word
+
+    return word_count
+
+
 # contributed by https://github.com/X-T-E-R/GPT-SoVITS-Inference/blob/main/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
 @register_method("auto_cut")
 def auto_cut(inp, max_length=30):
     # if not re.search(r'[^\w\s]', inp[-1]):
     #     inp += '。'
     inp = inp.strip("\n")
+    inp = inp.replace(". ", "。")
     erase_punds = r'[“”"‘’\'()()【】[\]{}<>《》〈〉〔〕〖〗〘〙〚〛〛〞〟]'
     inp = re.sub(erase_punds, '', inp)
     split_punds = r'[?!。?!~:]'
@@ -186,7 +222,7 @@ def auto_cut(inp, max_length=30):
     final_sentences = []
 
     for sentence in sentences:
-        if len(sentence)>max_length:
+        if count_words_multilang(sentence) > max_length:
             final_sentences+=split_long_sentence(sentence,max_length=max_length).split("\n")
 
         else:
@@ -194,7 +230,7 @@ def auto_cut(inp, max_length=30):
 
     for sentence in final_sentences:
         # Add the length of the sentence plus one for the space or newline that will follow
-        if len(current_line) + len(sentence) <= max_length:
+        if count_words_multilang(current_line + sentence) <= max_length:
             # If adding the next sentence does not exceed max length, add it to the current line
             current_line += sentence
         else:
@@ -217,7 +253,6 @@ def auto_cut(inp, max_length=30):
 
 
 if __name__ == '__main__':
-    method = get_method("cut1")
-    str1="""一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十
-    """
-    print("|\n|".join(method(str1).split("\n")))
+    str1 = """我 有i一个j k 1"""
+    print(count_words_multilang(str1))
+    print(cut_sentence_multilang(str1, 20))
diff --git a/Inference b/Inference
index 7f64553a..7b7d0d8e 160000
--- a/Inference
+++ b/Inference
@@ -1 +1 @@
-Subproject commit 7f64553a0be374798c827b7b429f8dc935bda2ab
+Subproject commit 7b7d0d8ee066b3664cd8a32fc0f10974058364ff
diff --git a/tools/i18n/locale_diff.py b/tools/i18n/locale_diff.py
index a9fd073f..5fb505d8 100644
--- a/tools/i18n/locale_diff.py
+++ b/tools/i18n/locale_diff.py
@@ -31,7 +31,7 @@ for lang_file in languages:
     # Add any missing keys to the language file
     for key in diff:
-        lang_data[key] = key
+        lang_data[key] = standard_data[key]
 
     # Delete any extra keys from the language file
     for key in miss:
diff --git a/tools/i18n/scan_i18n.py b/tools/i18n/scan_i18n.py
index 4f3a396a..e14e8541 100644
--- a/tools/i18n/scan_i18n.py
+++ b/tools/i18n/scan_i18n.py
@@ -14,6 +14,23 @@
 locale_path = "./Inference/i18n/locale"
 scan_list = ["./Inference/"]  # The path to the directory you want to scan, you can change it to your own path
 scan_subfolders = True
 
+special_words_to_keep = {
+    "auto": "自动判断",
+    "zh": "中文",
+    "en": "英文",
+    "ja": "日文",
+    "all_zh": "只有中文",
+    "all_ja": "只有日文",
+    "auto_cut": "智能切分",
+    "cut0": "仅凭换行切分",
+    "cut1": "凑四句一切",
+    "cut2": "凑50字一切",
+    "cut3": "按中文句号。切",
+    "cut4": "按英文句号.切",
+    "cut5": "按标点符号切",
+
+}
+
 def extract_i18n_strings(node):
     i18n_strings = []
@@ -87,10 +104,15 @@
 missing_keys = code_keys - standard_keys
 print("Missing keys:", len(missing_keys))
 for missing_key in missing_keys:
     print("\t", missing_key)
+
+
 code_keys_dict = OrderedDict()
 for s in strings:
-    code_keys_dict[s] = s
+    if s in special_words_to_keep:
+        code_keys_dict[s] = special_words_to_keep[s]
+    else:
+        code_keys_dict[s] = s
 
 # write back
 with open(standard_file, "w", encoding="utf-8") as f:
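
Appendix 1 (reviewer sketch, not part of the patch): the two rules the patch
adds are easiest to check in isolation. The sketch below reproduces the
mixed-language counter from text_segmentation_method.py (one count per
English word, one count per non-ASCII character) and the "auto_cut_<n>"
suffix handling from pre_seg_text (fall back to 20, clamp to 5..1000).
parse_max_word_count is a name invented here for illustration; in the patch
this logic sits inline in TextPreprocessor.pre_seg_text.

def count_words_multilang(text):
    # English words count once each; every non-ASCII character counts once.
    word_count = 0
    in_word = False
    for char in text:
        if char.isspace():
            in_word = False
        elif char.isascii() and not in_word:
            word_count += 1
            in_word = True
        elif not char.isascii():
            word_count += 1
    return word_count

def parse_max_word_count(text_split_method, default=20):
    # "auto_cut_<n>" carries the word budget in its suffix; plain "auto_cut"
    # (whose suffix "cut" fails int()) or an out-of-range value falls back.
    try:
        max_word_count = int(text_split_method.split("_")[-1])
    except ValueError:
        max_word_count = default
    if max_word_count < 5 or max_word_count > 1000:
        max_word_count = default
    return max_word_count

print(count_words_multilang("我 有i一个j k 1"))   # 7, the patch's own test string
print(parse_max_word_count("auto_cut_25"))        # 25
print(parse_max_word_count("auto_cut"))           # 20 (fallback)
print(parse_max_word_count("auto_cut_3"))         # 20 (below the 5..1000 clamp)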
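
Appendix 2 (reviewer sketch): how split_long_sentence now drains a long line
through cut_sentence_multilang instead of slicing a fixed number of
characters, so a chunk boundary is measured in counted words rather than raw
length. The function bodies mirror the patch; the sample strings are made up.

def cut_sentence_multilang(text, max_length=30):
    # Return (head, remainder): head holds at most max_length counted words.
    word_count = 0
    in_word = False
    for index, char in enumerate(text):
        if char.isspace():                      # whitespace ends an English word
            in_word = False
        elif char.isascii() and not in_word:    # a new English word starts
            word_count += 1
            in_word = True
        elif not char.isascii():                # every non-ASCII char counts as one
            word_count += 1
        if word_count > max_length:
            return text[:index], text[index:]
    return text, ""

def split_long_sentence(text, max_length=510):
    # Drain each line through cut_sentence_multilang until nothing remains,
    # mirroring the patched loop.
    opts = []
    for sentence in text.split("\n"):
        head, sentence = cut_sentence_multilang(sentence, max_length)
        while sentence.strip() != "":
            opts.append(head)
            head, sentence = cut_sentence_multilang(sentence, max_length)
        opts.append(head)
    return "\n".join(opts)

print(split_long_sentence("这是一个很长很长需要被切开的句子", max_length=5))
# -> 这是一个很 / 长很长需要 / 被切开的句 / 子   (one five-character chunk per line)
print(split_long_sentence("one two three four five six seven", max_length=3))
# -> "one two three " / "four five six " / "seven"   (three words per chunk)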
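
Appendix 3 (reviewer sketch): the two i18n fixes. scan_i18n.py now pins fixed
Chinese display text for the keys in special_words_to_keep instead of echoing
the key itself, and locale_diff.py copies the standard translation for missing
keys rather than the raw key. The sample data below is hypothetical except for
the "auto_cut" entry, which appears in the patch; dict.get is used as a
compact equivalent of the patch's if/else.

from collections import OrderedDict

special_words_to_keep = {"auto_cut": "智能切分"}   # subset of the patch's table
strings = ["auto_cut", "Some new message"]         # keys scanned out of the code

code_keys_dict = OrderedDict()
for s in strings:
    # Known keys keep their fixed Chinese display text; unknown keys fall
    # back to the key itself as a placeholder translation.
    code_keys_dict[s] = special_words_to_keep.get(s, s)
print(code_keys_dict)
# OrderedDict([('auto_cut', '智能切分'), ('Some new message', 'Some new message')])

# locale_diff.py fix: a key missing from a language file now receives the
# standard translation instead of the raw key.
standard_data = dict(code_keys_dict)
lang_data = {}
for key in standard_data.keys() - lang_data.keys():
    lang_data[key] = standard_data[key]   # was: lang_data[key] = key
print(lang_data)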