# Mirror of https://github.com/RVC-Boss/GPT-SoVITS.git (synced 2025-10-07 23:48:48 +08:00)
import ast
import json
import os
from collections import OrderedDict
# Example configuration for scanning the repository root instead:
# locale_path = "./i18n/locale"  # the i18n locale directory
# scan_list = ["./", "GPT_SoVITS/", "tools/"]  # directories to scan
# scan_subfolders = False  # whether to scan subfolders

# Directory holding the locale JSON files; zh_CN.json inside it is the
# reference file that gets regenerated at the bottom of this script.
locale_path = "./Inference/i18n/locale"
# Directories whose .py files are scanned for i18n("...") calls.
scan_list = ["./Inference/"]
# Recurse into subdirectories of each entry in scan_list.
scan_subfolders = True

# Keys whose zh_CN value is a curated Chinese string rather than the key
# itself; every other key maps to itself when the reference file is rebuilt.
special_words_to_keep = {
    "auto": "自动判断",
    "zh": "中文",
    "en": "英文",
    "ja": "日文",
    "all_zh": "只有中文",
    "all_ja": "只有日文",
    "auto_cut": "智能切分",
    "cut0": "仅凭换行切分",
    "cut1": "凑四句一切",
    "cut2": "凑50字一切",
    "cut3": "按中文句号。切",
    "cut4": "按英文句号.切",
    "cut5": "按标点符号切",
}
def extract_i18n_strings(node):
    """Recursively collect string literals passed to ``i18n(...)`` calls.

    Walks the AST rooted at *node* and gathers, in source order, every
    string-literal positional argument of a call whose callee is the bare
    name ``i18n``.

    Args:
        node: any ``ast.AST`` node, typically the tree from ``ast.parse``.

    Returns:
        list[str]: the extracted i18n keys (duplicates preserved).
    """
    i18n_strings = []

    # A direct call to the bare name i18n(...): harvest its literal args.
    if (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == "i18n"
    ):
        for arg in node.args:
            # ast.Str was deprecated in 3.8 and removed in 3.12; string
            # literals are ast.Constant nodes whose .value is a str.
            if isinstance(arg, ast.Constant) and isinstance(arg.value, str):
                i18n_strings.append(arg.value)

    # Recurse into children so calls nested anywhere in the tree are found.
    for child_node in ast.iter_child_nodes(node):
        i18n_strings.extend(extract_i18n_strings(child_node))

    return i18n_strings
# Accumulates every i18n key found across all scanned files (duplicates kept).
strings = []


# For each file: parse the code into an AST, then pull out the i18n keys.
def scan_i18n_strings(filename):
    """Scan one .py file and append its i18n keys to the global ``strings``.

    Files that never mention ``I18nAuto`` are skipped entirely, since they
    cannot be using the i18n helper. Prints the per-file hit count.
    """
    with open(filename, "r", encoding="utf-8") as f:
        code = f.read()
        if "I18nAuto" not in code:
            return
        found = extract_i18n_strings(ast.parse(code))
        print(filename, len(found))
        strings.extend(found)
# Walk every configured folder (recursively only when scan_subfolders is set)
# and scan each .py file found.
if scan_subfolders:
    for folder in scan_list:
        for dirpath, _dirnames, filenames in os.walk(folder):
            candidates = (name for name in filenames if name.endswith(".py"))
            for name in candidates:
                scan_i18n_strings(os.path.join(dirpath, name))
else:
    for folder in scan_list:
        for name in os.listdir(folder):
            if not name.endswith(".py"):
                continue
            scan_i18n_strings(os.path.join(folder, name))
# Unique i18n keys discovered in the scanned sources.
code_keys = set(strings)

# Per-file hit counts from a historical run, kept for reference:
"""
n_i18n.py
gui_v1.py 26
app.py 16
infer-web.py 147
scan_i18n.py 0
i18n.py 0
lib/train/process_ckpt.py 1
"""

print()
print(f"Total unique: {len(code_keys)}")
# zh_CN.json is the reference locale; compare its keys against the code.
standard_file = os.path.join(locale_path, "zh_CN.json")
with open(standard_file, "r", encoding="utf-8") as f:
    standard_data = json.load(f, object_pairs_hook=OrderedDict)
standard_keys = set(standard_data)

# Keys present in the locale file but never referenced by the code.
unused_keys = standard_keys - code_keys
print("Unused keys:", len(unused_keys))
for key in unused_keys:
    print("\t", key)

# Keys referenced by the code but absent from the locale file.
missing_keys = code_keys - standard_keys
print("Missing keys:", len(missing_keys))
for key in missing_keys:
    print("\t", key)
# Regenerate the reference mapping: each key maps to itself unless it has a
# curated Chinese value in special_words_to_keep.
code_keys_dict = OrderedDict(
    (key, special_words_to_keep.get(key, key)) for key in strings
)

# Rewrite zh_CN.json (sorted keys, UTF-8 kept verbatim, trailing newline).
with open(standard_file, "w", encoding="utf-8") as f:
    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
    f.write("\n")