# Chinese numeral tables used by num_to_chinese.
_CN_DIGITS = "零一二三四五六七八九"
_CN_SMALL_UNITS = ("", "十", "百", "千")
_CN_SECTION_UNITS = ("", "万", "亿", "万亿")

# Character classes recognised by mixed_cut, in priority order.  Kanji inside
# Japanese text are classified as "hans"; the caller's short-segment merging is
# expected to fold such pieces back into the neighbouring Japanese group.
_MIXED_CUT_UNKNOWN = "unknown"
_MIXED_CUT_CLASSES = (
    ("jps", re.compile('[\u3040-\u30FF\uFF66-\uFF9D]')),
    ("hans", re.compile('[\u4e00-\u9fa5]')),
    ("alpha", re.compile('[a-zA-Z]')),
    ("digit", re.compile('[0-9]')),
)
_MIXED_CUT_DIGIT_RUN = re.compile('[0-9]+')
# A group is closed at the first punctuation mark once it is longer than this.
_MIXED_CUT_SOFT_LIMIT = 10


def _section_to_chinese(section):
    """Spell an integer in the range 1..9999 with Chinese numerals."""
    pieces = []
    zero_pending = False
    for power in (3, 2, 1, 0):
        digit = section // 10 ** power % 10
        if digit == 0:
            # Interior zeros collapse into a single "零"; leading and trailing
            # zeros are silent.
            zero_pending = bool(pieces)
            continue
        if zero_pending:
            pieces.append(_CN_DIGITS[0])
            zero_pending = False
        pieces.append(_CN_DIGITS[digit] + _CN_SMALL_UNITS[power])
    return "".join(pieces)


def num_to_chinese(num):
    """Return the Chinese reading of a non-negative integer.

    Args:
        num: An ``int`` or a string of ASCII digits.  Surrounding whitespace
            in a string is ignored.

    Returns:
        The number spelled with Chinese numerals, e.g. ``20`` -> ``"二十"``,
        ``101`` -> ``"一百零一"``, ``10000`` -> ``"一万"``, ``0`` -> ``"零"``.
        Digit strings with a leading zero or more than 16 digits are read
        digit by digit (``"007"`` -> ``"零零七"``).

    Raises:
        ValueError: If ``num`` is empty or contains anything but ASCII digits.
    """
    text = str(num).strip()
    if not text or any(ch not in "0123456789" for ch in text):
        raise ValueError(
            "num_to_chinese expects a non-negative integer, got {!r}".format(num)
        )

    # Codes, phone numbers and values beyond the unit table have no sensible
    # positional reading, so spell them out one digit at a time.
    if (len(text) > 1 and text[0] == "0") or len(text) > 16:
        return "".join(_CN_DIGITS[int(ch)] for ch in text)

    value = int(text)
    if value == 0:
        return _CN_DIGITS[0]

    # Split into 4-digit sections, least significant first.
    sections = []
    while value:
        value, section = divmod(value, 10000)
        sections.append(section)

    pieces = []
    gap = False
    for index in reversed(range(len(sections))):
        section = sections[index]
        if section == 0:
            gap = True
            continue
        # A "零" is read when a whole section was skipped or when the section
        # does not fill its thousands place.
        if pieces and (gap or section < 1000):
            pieces.append(_CN_DIGITS[0])
        pieces.append(_section_to_chinese(section) + _CN_SECTION_UNITS[index])
        gap = False

    spoken = "".join(pieces)
    # 10..19 (and 10万.. etc.) are read "十…" rather than "一十…".
    if spoken.startswith("一十"):
        spoken = spoken[1:]
    return spoken


def _mixed_cut_type(text):
    """Return the highest-priority character class found in ``text``."""
    for name, pattern in _MIXED_CUT_CLASSES:
        if pattern.search(text):
            return name
    return _MIXED_CUT_UNKNOWN


def _mixed_cut_group(text):
    """Split ``text`` into runs of one character class.

    Spaces and unclassified characters (punctuation) stay attached to the
    current run; a run is additionally closed right after a punctuation mark
    once it exceeds the soft length limit.
    """
    groups = []
    current = ""
    current_type = ""
    for char in text:
        if char == " ":
            current += char
            continue
        char_type = _mixed_cut_type(char)
        if char_type == _MIXED_CUT_UNKNOWN:
            current += char
            if len(current) > _MIXED_CUT_SOFT_LIMIT:
                groups.append(current)
                current = ""
            continue
        # Close the run on a class change, but never emit an empty or
        # whitespace-only group.
        if current_type and char_type != current_type and current.strip():
            groups.append(current)
            current = ""
        current += char
        current_type = char_type

    if current.strip():
        groups.append(current)
    elif current and groups:
        # Trailing spaces belong to the last real group.
        groups[-1] += current
    return groups


def _mixed_cut_merge_digits(groups):
    """Fold digit groups into the preceding group, reading them in Chinese
    when they sit next to Chinese text."""
    merged = []
    index = 0
    while index < len(groups):
        current = groups[index]
        if _mixed_cut_type(current) != "digit":
            merged.append(current)
            index += 1
            continue

        has_next = index + 1 < len(groups)
        prev_type = (
            _mixed_cut_type(groups[index - 1]) if index > 0 else _MIXED_CUT_UNKNOWN
        )
        next_type = (
            _mixed_cut_type(groups[index + 1]) if has_next else _MIXED_CUT_UNKNOWN
        )

        if "hans" in (prev_type, next_type):
            # Convert each digit run on its own so spaces and punctuation kept
            # inside the group survive untouched.
            spoken = _MIXED_CUT_DIGIT_RUN.sub(
                lambda match: num_to_chinese(match.group()), current
            )
        else:
            spoken = current

        # A number at the very start, or one sandwiched between two groups of
        # the same class, also pulls the following group in.
        if has_next and (index == 0 or prev_type == next_type):
            spoken += groups[index + 1]
            index += 1

        if merged:
            merged[-1] += spoken
        else:
            merged.append(spoken)
        index += 1
    return merged


# Mixed-language cut: groups of roughly 10 characters per language, producing
# more (shorter) segments so that batch-parallel inference has more to work on.
@register_method("mixed_cut")
def mixed_cut(inp):
    """Split mixed Chinese/Japanese/English/number text into short segments.

    Args:
        inp: The text to segment.

    Returns:
        The segments joined by ``"\\n"``.  Digit groups are merged into a
        neighbouring segment and spelled with Chinese numerals when adjacent
        to Chinese text.
    """
    # Ellipses, tildes and dashes are treated as sentence ends.
    for marker in ("...", "~", "——", "……"):
        inp = inp.replace(marker, "。")
    return "\n".join(_mixed_cut_merge_digits(_mixed_cut_group(inp)))


if __name__ == '__main__':
    method = get_method("mixed_cut")
    samples = [
        "你好,我是小明。你好,我是小红。你好,我是小刚。你好,我是小张。",
        "你好,我是小明",
        "12345",
        "123,不许动",
        "你好,我是小明。我今年20岁了",
        "你好,我是Maxwell, nice to meet you",
        "你好,我是Maxwell。我今年20岁了",
        "你好,我是小明。こんにちは、シャオミンです。",
        "こんにちは、シャオミンです。 今年で20周年",
        "こんにちは、シャオミンです。 今年で20周年, nice to meet you",
        "こんにちは、シャオミンです。nice to meet you",
        "Hello, I am Maxwell. 20 years old,中文名叫小明",
    ]
    for sample in samples:
        print(method(sample) + "\n===\n")

# NOTE: the accompanying change in GPT_SoVITS/inference_webui.py exposes this
# method in the web UI by adding the entry
#     i18n("语言混合切字"): "mixed_cut",
# to the `cut_method` dict.