GPT-SoVITS/Ref_Audio_Selector/tool/text_check.py

import os
import Ref_Audio_Selector.common.common as common
import Ref_Audio_Selector.tool.audio_check as audio_check
from Ref_Audio_Selector.config_param.log_config import logger


def parse_text_similarity_result_txt(file_path):
    """
    解析指定格式的txt文件，每行格式：f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}"

    :param file_path: txt文件的路径
    :return: 包含解析后数据的字典列表
    """
    data_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # 使用'|'作为分隔符分割每行数据
            parts = line.strip().split('|')
            if len(parts) == 3:
                # 将分割后的字符串转换为浮点数、整数和字符串
                try:
                    item = {
                        'average_similarity_score': float(parts[0]),
                        'count': int(parts[1]),
                        'emotion': parts[2]
                    }
                    data_list.append(item)
                except ValueError as e:
                    # 如果转换失败，打印错误信息并跳过该行
                    logger.error(f"Error parsing line: {line.strip()} - {e}")

    return data_list


def remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary):
    """
    根据条件删除低相似度音频文件并返回删除数量。

    :param ref_audio_list: 包含音频路径和情感属性的列表
    :param report_list: 包含相似度评分和情感属性的列表
    :param audio_text_similarity_boundary: 相似度阈值
    :return: 删除的文件数量
    """
    deleted_count = 0

    # 筛选出平均相似度低于阈值的报告
    low_similarity_reports = [report for report in report_list if
                              report['average_similarity_score'] < audio_text_similarity_boundary]

    # 遍历低相似度报告，查找并删除对应音频文件
    for report in low_similarity_reports:
        emotion = report['emotion']
        # 查找ref_audio_list中相同情感的音频文件路径
        matching_refs = [ref for ref in ref_audio_list if ref['emotion'] == emotion]
        for match in matching_refs:
            ref_path = match['ref_path']
            # 检查文件是否存在，然后尝试删除
            if os.path.exists(ref_path):
                try:
                    os.remove(ref_path)
                    deleted_count += 1
                    logger.info(f"Deleted file: {ref_path}")
                except Exception as e:
                    logger.error(f"Error deleting file {ref_path}: {e}")
            else:
                logger.error(f"File not found: {ref_path}")

    return deleted_count


def delete_ref_audio_below_boundary(ref_audio_path, text_similarity_result_path, sync_inference_audio_dir,
                                    audio_text_similarity_boundary):
    ref_audio_list = common.RefAudioListManager(ref_audio_path).get_ref_audio_list()
    report_list = parse_text_similarity_result_txt(text_similarity_result_path)
    count = remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary)
    audio_check.sync_ref_audio(ref_audio_path, sync_inference_audio_dir)
    return count