diff --git a/Ref_Audio_Selector/config.ini b/Ref_Audio_Selector/config.ini index 0ba2aa2..26b47e9 100644 --- a/Ref_Audio_Selector/config.ini +++ b/Ref_Audio_Selector/config.ini @@ -7,6 +7,8 @@ time_log_print_type = file time_log_print_dir = Ref_Audio_Selector/log/performance # 参考音频目录 reference_audio_dir = refer_audio +# 临时文件目录 +temp_dir = Ref_Audio_Selector/temp [AudioSample] # list转换待选参考音频目录 diff --git a/Ref_Audio_Selector/config_param/config_params.py b/Ref_Audio_Selector/config_param/config_params.py index 050bb1d..9c585d4 100644 --- a/Ref_Audio_Selector/config_param/config_params.py +++ b/Ref_Audio_Selector/config_param/config_params.py @@ -9,6 +9,8 @@ time_log_print_type = config.get_base('time_log_print_type') time_log_print_dir = config.get_base('time_log_print_dir') # 参考音频目录 reference_audio_dir = config.get_base('reference_audio_dir') +# 临时文件目录 +temp_dir = config.get_base('temp_dir') # [AudioSample] # list转换待选参考音频目录 diff --git a/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py b/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py index e4a5ed9..f30ace2 100644 --- a/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py +++ b/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py @@ -1,5 +1,9 @@ import argparse import os +import soundfile as sf +import torchaudio +import torchaudio.transforms as T +import Ref_Audio_Selector.config_param.config_params as params from Ref_Audio_Selector.common.time_util import timeit_decorator from modelscope.pipelines import pipeline @@ -17,10 +21,13 @@ def compare_audio_and_generate_report(reference_audio_path, comparison_dir_path, comparison_audio_paths = [os.path.join(comparison_dir_path, f) for f in os.listdir(comparison_dir_path) if f.endswith('.wav')] + # 因为这个模型是基于16k音频数据训练的,为了避免后续比较时,每次都对参考音频进行重采样,所以,提前进行了采样 + reference_audio_16k = ensure_16k_wav(reference_audio_path) + # Step 2: 用参考音频依次比较音频目录下的每个音频,获取相似度分数及对应路径 similarity_scores = [] for audio_path in 
def _temp_wav_path(audio_file_path):
    """Return a ``.wav`` path inside ``params.temp_dir`` named after *audio_file_path*.

    The temporary directory is created on demand so that the first run on a
    fresh checkout does not fail when the file is written.
    """
    os.makedirs(params.temp_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(audio_file_path))[0]
    # The output is always written as WAV, so force a matching extension.
    return os.path.join(params.temp_dir, stem + '.wav')


def _resample_to_temp_wav(audio_file_path, waveform, sample_rate, target_sample_rate):
    """Resample *waveform* to *target_sample_rate* and save it as a temp WAV file.

    Returns the path of the file that was written.
    """
    resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
    temp_file_path = _temp_wav_path(audio_file_path)
    torchaudio.save(temp_file_path, resampler(waveform), target_sample_rate)
    return temp_file_path


def ensure_16k_wav(audio_file_path, target_sample_rate=16000):
    """Return a path to a version of the audio sampled at *target_sample_rate*.

    Args:
        audio_file_path (str): Path of the audio file to check.
        target_sample_rate (int, optional): Desired sample rate in Hz.
            Defaults to 16000.

    Returns:
        str: ``audio_file_path`` itself when the file already has the target
        sample rate; otherwise the path of a resampled WAV copy written under
        ``params.temp_dir``.
    """
    # Only the header is needed to learn the sample rate. Note that
    # ``sf.read`` returns ``(data, samplerate)`` -- unpacking it the other
    # way round compares an array against the target rate.
    if sf.info(audio_file_path).samplerate == target_sample_rate:
        return audio_file_path

    # Writing the raw samples with a different ``samplerate`` would only
    # relabel them (changing pitch and duration), so really resample here.
    waveform, sample_rate = torchaudio.load(audio_file_path)
    return _resample_to_temp_wav(audio_file_path, waveform, sample_rate, target_sample_rate)


def ensure_16k_wav_2(audio_file_path, target_sample_rate=16000):
    """Resample the audio file with torchaudio when its sample rate differs from the target.

    Args:
        audio_file_path (str): Path of the audio file to check.
        target_sample_rate (int, optional): Desired sample rate in Hz.
            Defaults to 16000.

    Returns:
        str: ``audio_file_path`` itself when no resampling is needed;
        otherwise the path of the resampled WAV copy written under
        ``params.temp_dir``.
    """
    # Load the audio together with its native sample rate.
    waveform, sample_rate = torchaudio.load(audio_file_path)

    if sample_rate == target_sample_rate:
        return audio_file_path

    return _resample_to_temp_wav(audio_file_path, waveform, sample_rate, target_sample_rate)