# GPT-SoVITS/GPT_SoVITS/TTS_infer_pack/zero_crossing.py

import numpy as np
import wave
import struct

def read_wav_file(filename):
    """
    Reads a WAV file and returns its samples together with its format.

    Args:
        filename: Path (or file object) accepted by ``wave.open``.

    Returns:
        A tuple ``(sample_rate, audio_array, sample_width, n_channels)``.
        ``audio_array`` is an integer numpy array: 1-D for a single channel,
        or shaped ``(n_frames, n_channels)`` when there are several channels.

    Raises:
        ValueError: If the sample width is not 1, 2 or 4 bytes.
    """
    # Little-endian signed dtypes keyed by bytes per sample.
    # NOTE(review): 8-bit PCM WAV data is normally unsigned; the signed
    # reading is kept here so that a round trip through write_wav_file
    # (which packs with the signed 'b' code) stays byte-identical.
    sample_dtypes = {1: "<i1", 2: "<i2", 4: "<i4"}

    with wave.open(filename, 'rb') as wf:
        sample_rate = wf.getframerate()
        sample_width = wf.getsampwidth()
        n_channels = wf.getnchannels()
        if sample_width not in sample_dtypes:
            raise ValueError(
                f"Unsupported sample width: {sample_width} bytes "
                f"(supported: {sorted(sample_dtypes)})"
            )
        audio_data = wf.readframes(wf.getnframes())

    # Size the decode from the bytes actually read rather than the header's
    # frame count: a truncated file returns fewer bytes than advertised.
    # Any trailing partial frame is dropped so the reshape below is valid.
    frame_size = sample_width * n_channels
    usable_bytes = len(audio_data) - len(audio_data) % frame_size
    audio_array = np.frombuffer(
        audio_data[:usable_bytes], dtype=sample_dtypes[sample_width]
    ).astype(int)

    # If there is more than one channel, give each channel its own column
    if n_channels > 1:
        audio_array = audio_array.reshape(-1, n_channels)
    return sample_rate, audio_array, sample_width, n_channels

def write_wav_file(filename, sample_rate, data, sample_width, n_channels):
    """
    Saves integer sample data to a WAV file.

    Args:
        filename: Destination accepted by ``wave.open``.
        sample_rate: Frame rate stored in the WAV header.
        data: Numpy array of samples; arrays with more than one dimension
            are flattened before being written.
        sample_width: Bytes per sample (1, 2 or 4).
        n_channels: Channel count stored in the WAV header.
    """
    struct_codes = {1: 'b', 2: 'h', 4: 'i'}
    with wave.open(filename, 'wb') as out:
        out.setnchannels(n_channels)
        out.setsampwidth(sample_width)
        out.setframerate(sample_rate)
        # Multi-dimensional input is written as one flat run of samples
        samples = data.flatten() if data.ndim > 1 else data
        # Little-endian layout: one struct code repeated once per sample
        layout = f"<{len(samples)}{struct_codes[sample_width]}"
        out.writeframes(struct.pack(layout, *samples))

def find_zero_zone(chunk, start_index, search_length, num_zeroes=11, zero_threshold=1.0e-4):
    """
    Finds the last run of near-silent samples inside a search zone.

    The zone ``chunk[start_index:start_index + search_length]`` is scanned
    from its end towards its start for ``num_zeroes`` consecutive samples
    whose absolute value is below ``zero_threshold``.

    Args:
        chunk: Array of audio samples.
        start_index: Index in ``chunk`` where the search zone begins.
        search_length: Number of samples in the search zone.
        num_zeroes: Required length of the silent run.
        zero_threshold: Magnitude below which a sample counts as zero. The
            default suits small floating-point amplitudes; pass a larger
            value for data on a different scale.

    Returns:
        ``(index, None)`` where ``index`` is the midpoint of the silent run
        (in ``chunk`` coordinates) when one is found. Otherwise the result
        of ``find_zero_crossing`` for the same zone is returned.
    """
    zone = chunk[start_index:start_index + search_length]
    print(f"Zero-crossing search zone: Start={start_index}, Length={len(zone)}")

    # Threshold the whole zone once instead of re-computing it per window
    is_quiet = np.abs(zone) < zero_threshold

    # Slide a num_zeroes-wide window from the end of the zone backwards so
    # the latest silent run wins
    for idx in range(len(zone), num_zeroes - 1, -1):
        index_to_start = idx - num_zeroes
        if np.all(is_quiet[index_to_start:idx]):
            index_midpoint = index_to_start + int(num_zeroes // 2)
            return (start_index + index_midpoint), None

    print("Falling back to zero crossing due to no zero zone found.  You may hear more prominent pops and clicks in the audio.  Try increasing search length or cumulative tokens.")
    return find_zero_crossing(chunk, start_index, search_length)

def find_zero_crossing(chunk, start_index, search_length):
    """
    Finds the first sign change inside a search zone.

    Args:
        chunk: Array of audio samples.
        start_index: Index in ``chunk`` where the search zone begins.
        search_length: Number of samples in the search zone.

    Returns:
        ``(zc_index, crossing_direction)``: the index of the first sample
        after the sign change, and ``sign(chunk[zc_index]) -
        sign(chunk[zc_index - 1])`` (negative for a falling crossing,
        positive for a rising one).

    Raises:
        ValueError: If the zone contains no sign change.
    """
    # If the model is falling back on this function, it might be a bad
    # indicator that the search length is too low

    zone = chunk[start_index:start_index + search_length]
    sign_changes = np.where(np.diff(np.sign(zone)) != 0)[0]

    if len(sign_changes) == 0:
        # A bare string cannot be raised; use a real exception type
        raise ValueError("No zero-crossings found in this zone. This should not be happening, debugging time.")

    # diff index i compares zone[i] and zone[i + 1], hence the + 1
    zc_index = start_index + sign_changes[0] + 1
    print(f"Zero-crossing found at index {zc_index}")
    # Determine the crossing direction in chunk1
    prev_value = chunk[zc_index - 1]
    curr_value = chunk[zc_index]
    crossing_direction = np.sign(curr_value) - np.sign(prev_value)
    print(f"Crossing direction in chunk1: {np.sign(prev_value)} to {np.sign(curr_value)}")
    return zc_index, crossing_direction

def find_matching_index(chunk, center_index, max_offset, crossing_direction):
    """
    Finds a zero-crossing in ``chunk`` that matches the specified crossing
    direction, starting from ``center_index`` and searching outward.

    Args:
        chunk: Array of audio samples to search.
        center_index: Index at which the outward search starts.
        max_offset: Largest distance from ``center_index`` that is examined.
        crossing_direction: Sign difference to match (as produced by
            ``find_zero_crossing``), or ``None`` when the splice point came
            from a zero zone.

    Returns:
        ``center_index`` unchanged when ``crossing_direction`` is ``None``;
        otherwise the index of the sample just after the nearest matching
        crossing (the forward side is checked first at each offset), or
        ``None`` if no match lies within ``max_offset``.
    """
    if crossing_direction is None:
        return center_index # if zero zone

    # fall back for zero_crossing
    data_length = len(chunk)
    print(f"Center index in chunk2: {center_index}")
    for offset in range(max_offset + 1):
        idx_forward = center_index + offset
        idx_backward = center_index - offset

        # Check forward direction: compares idx_forward with its successor
        if idx_forward < data_length - 1:
            prev_sign = np.sign(chunk[idx_forward])
            curr_sign = np.sign(chunk[idx_forward + 1])
            direction = curr_sign - prev_sign
            if direction == crossing_direction:
                print(f"Matching zero-crossing found at index {idx_forward + 1} (forward)")
                return idx_forward + 1

        # Check backward direction: compares idx_backward with its predecessor
        if idx_backward > 0:
            prev_sign = np.sign(chunk[idx_backward - 1])
            curr_sign = np.sign(chunk[idx_backward])
            direction = curr_sign - prev_sign
            if direction == crossing_direction:
                print(f"Matching zero-crossing found at index {idx_backward} (backward)")
                return idx_backward

    print("No matching zero-crossings found in this zone.")
    return None

# legacy, just for history.  delete me sometime
def splice_chunks(chunk1, chunk2, search_length, y):
    """
    Splices two audio chunks at matching splice points.

    A splice point is located in the last ``search_length`` samples of
    ``chunk1`` (a run of ``y`` near-zero samples, falling back to a
    zero-crossing), a corresponding point is located in ``chunk2`` near the
    same index, and the two chunks are joined there.

    Args:
        chunk1: Samples that form the start of the result.
        chunk2: Samples that form the end of the result; assumed to overlap
            ``chunk1`` so that index positions correspond.
        search_length: Number of samples at the end of ``chunk1`` to search;
            also the maximum search offset used in ``chunk2``.
        y: Number of consecutive near-zero samples to look for.

    Returns:
        The spliced array, or ``None`` if no splice point could be matched.
    """
    # Define the zone to search in chunk1
    start_index1 = len(chunk1) - search_length
    if start_index1 < 0:
        # chunk1 is shorter than the requested zone: search all of it
        start_index1 = 0
        search_length = len(chunk1)
    print(f"Searching for zero-crossing in chunk1 from index {start_index1} to {len(chunk1)}")
    # Find the splice point in chunk1. find_zero_zone is the helper that
    # accepts the consecutive-zero count; it falls back to
    # find_zero_crossing on its own when no silent run exists.
    zc_index1, crossing_direction = find_zero_zone(chunk1, start_index1, search_length, y)
    if zc_index1 is None:
        # Defensive: kept in case the helper reports failure with None
        print("No zero-crossing found in chunk1 within the specified zone.")
        return None

    # Since chunk2 overlaps with chunk1, we can assume that index positions
    # correspond, so the search in chunk2 is centred on the same index
    center_index = zc_index1
    max_offset = search_length

    # Ensure center_index is within bounds
    if center_index < 0:
        center_index = 0
    elif center_index >= len(chunk2):
        center_index = len(chunk2) - 1

    print(f"Searching for matching zero-crossing in chunk2 around index {center_index} with max offset {max_offset}")

    zc_index2 = find_matching_index(chunk2, center_index, max_offset, crossing_direction)

    if zc_index2 is None:
        print("No matching zero-crossing found in chunk2.")
        return None

    print(f"Zero-crossing in chunk1 at index {zc_index1}, chunk2 at index {zc_index2}")
    # Splice the chunks
    new_chunk = np.concatenate((chunk1[:zc_index1], chunk2[zc_index2:]))
    print(f"Spliced chunk length: {len(new_chunk)}")
    return new_chunk

# legacy, just for history.  delete me sometime
def process_audio_chunks(filenames, sample_rate, x, y, output_filename):
    """
    Processes and splices a list of audio chunks into one WAV file.

    Args:
        filenames: WAV file paths, in the order they should be joined.
        sample_rate: Sample rate every chunk is required to have.
        x: Number of frames to search from the end of each chunk.
        y: Number of consecutive zeros to look for.
        output_filename: Path of the combined WAV file to write.

    Returns:
        None. On an empty input list, a sample-rate mismatch or a failed
        splice, a message is printed and nothing is written.
    """
    # Nothing to read: bail out instead of failing on filenames[0]
    if len(filenames) == 0:
        print("No audio chunks to process.")
        return

    # Read the first chunk; its sample width and channel count are the ones
    # used for the output file
    sr, chunk_data, sample_width, n_channels = read_wav_file(filenames[0])
    if sr != sample_rate:
        print(f"Sample rate mismatch in {filenames[0]}")
        return
    print(f"Processing {filenames[0]}")
    # Initialize the combined audio with the first chunk
    combined_audio = chunk_data
    # Process remaining chunks
    for filename in filenames[1:]:
        sr, next_chunk_data, _, _ = read_wav_file(filename)
        if sr != sample_rate:
            print(f"Sample rate mismatch in {filename}")
            return
        print(f"Processing {filename}")
        # Splice the current combined audio with the next chunk
        new_combined = splice_chunks(combined_audio, next_chunk_data, x, y)
        if new_combined is None:
            print(f"Failed to splice chunks between {filename} and previous chunk.")
            return
        combined_audio = new_combined
    # Write the final combined audio to output file
    write_wav_file(output_filename, sample_rate, combined_audio, sample_width, n_channels)
    print(f"Final audio saved to {output_filename}")

# Main execution
if __name__ == "__main__":
    # User-specified parameters
    sample_rate = 32000  # Sample rate in Hz
    x = 500            # Number of frames to search from the end of the chunk
    y = 10               # Number of consecutive zeros to look for
    output_filename = "combined_output.wav"
    folder_with_chunks = "output_chunks"
    import os
    def absolute_file_paths(directory):
        """Returns the absolute paths of the files in ``directory``, sorted by name."""
        path = os.path.abspath(directory)
        # os.scandir yields entries in arbitrary order, so sort explicitly.
        # The sort is lexicographic: chunk names need zero-padded numbering
        # for this to equal their sequential order.
        return sorted(entry.path for entry in os.scandir(path) if entry.is_file())
    # List of input audio chunk filenames in sequential order
    filenames = absolute_file_paths(folder_with_chunks)
    # Process and splice the audio chunks
    process_audio_chunks(filenames, sample_rate, x, y, output_filename)