# GPT-SoVITS/zc_streaming_example.py

import os
import sys
import queue
import threading
import numpy as np
import sounddevice as sd
import wave
import time

# Make the current working directory and its GPT_SoVITS sub-directory
# importable before the project import below runs.
# NOTE(review): this relies on the script being launched from the repository
# root so that the `GPT_SoVITS` package resolves — confirm.
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
# Debugging aid, left disabled:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config

def _fragment_to_float32(audio_fragment) -> np.ndarray:
    """Return *audio_fragment* as contiguous float32 samples for playback.

    Integer PCM is divided by the full-scale magnitude of its dtype (32768
    for int16, which matches the previously hard-coded divisor).  Fragments
    that are already floating point are only cast, on the assumption that
    they are already normalised to [-1, 1].
    """
    samples = np.asarray(audio_fragment)
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = np.float32(float(np.iinfo(samples.dtype).max) + 1.0)
        return samples.astype(np.float32) / full_scale
    # The stream is written with a raw buffer, so guarantee C-contiguity.
    return np.ascontiguousarray(samples, dtype=np.float32)


def audio_playback_thread(audio_queue: queue.Queue, sample_rate: int):
    """
    A background thread that plays audio fragments as they become available
    in the queue using a continuous OutputStream for smooth playback.

    Args:
        audio_queue: Queue of audio fragments (numpy arrays).  A ``None``
            item is the sentinel that ends playback.  ``task_done()`` is
            called once for every item taken from the queue, so a producer
            may safely call ``audio_queue.join()``.
        sample_rate: Sample rate, in Hz, used to open the mono output stream.

    Raises:
        Exception: Any error from opening or writing to the stream is
            re-raised after the queue has been drained up to the sentinel.
    """
    stream = None
    try:
        # Configure this stream explicitly instead of mutating the
        # process-wide ``sd.default`` settings.
        stream = sd.OutputStream(
            samplerate=sample_rate, channels=1, dtype='float32'
        )
        stream.start()
        while True:
            audio_fragment = audio_queue.get()
            try:
                if audio_fragment is None:
                    # Sentinel received, end thread
                    break
                stream.write(_fragment_to_float32(audio_fragment))
            finally:
                audio_queue.task_done()
    except Exception as exc:
        # Playback is broken, but the producer may still be queueing
        # fragments and may later block in ``audio_queue.join()``.  Keep
        # consuming (and discarding) items until the sentinel so that every
        # queued item is marked done, then surface the original error.
        print(f"Playback failed, discarding remaining audio: {exc!r}")
        while True:
            leftover = audio_queue.get()
            audio_queue.task_done()
            if leftover is None:
                break
        raise
    finally:
        if stream is not None:
            stream.stop()
            stream.close()
        print("Playback finished")

def main():
    """Interactive demo: synthesize speech and play it while it is generated.

    Loads the TTS pipeline once, then on every press of Enter runs the
    streaming generator and hands each audio fragment to a background
    playback thread through a queue.  The first fragment is queued as soon
    as it is produced so playback starts immediately, and the elapsed time
    up to that point is printed.  The loop ends when the generator yields
    no fragments at all.
    """
    config_path = 'GPT_SoVITS/configs/tts_infer.yaml' # path to the config file
    t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt' # path to the t2s checkpoint
    vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth' # path to the vits checkpoint
    ref_audio = 'local_files/test.wav' # path to the reference audio file
    prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain.  All part of the cycle of life.' # reference_audio transcription
    text = "Today we are going to be testing TTS streaming audio.  Is there anything that I can do to help you out? Or maybe you'd just like a quick snack...  If not, that's okay too.  I'm just here to chat and maybe become friends as I don't meet many people in this world." # text to be converted to audio
    seed = 1 # -1 is random seed

    # Initialize the pipeline
    cfg = TTS_Config(config_path)
    pipeline = TTS(cfg)
    pipeline.init_t2s_weights(t2s_ckpt)
    pipeline.init_vits_weights(vits_ckpt)

    # NOTE: the key spellings below (e.g. "dynamic_cumulatation") are passed
    # to the pipeline as-is and must not be "corrected" here.
    inputs = {
        "text": text,
        "text_lang": "en",
        "ref_audio_path": ref_audio,
        "prompt_text": prompt_text,
        "prompt_lang": "en",
        "top_k": 5,
        "top_p": 1.0,
        "temperature": 1.0,
        "cumulation_amount": 10,
        "search_length": 32000*2,
        "num_zeroes": 5,
        "sample_steps": 8,
        "dynamic_cumulatation": True,
        "dynamic_cumulatation_amount": 20,
        "seed": seed,
    }

    while True:
        input("enter to continue")
        # Initialize generator and fetch first fragment to get sample rate
        gen = pipeline.run_generator(inputs)
        start = time.perf_counter()
        try:
            sr, first_fragment = next(gen)
        except StopIteration:
            print("No audio fragments generated.")
            break

        # Create audio playback queue and start thread with sample rate
        audio_queue = queue.Queue()
        playback_thread = threading.Thread(
            target=audio_playback_thread, args=(audio_queue, sr)
        )
        playback_thread.start()

        try:
            # Queue the first fragment right away: waiting for the second
            # one would delay playback and would drop the audio entirely
            # when the generator yields only a single fragment.
            audio_queue.put(first_fragment)
            end = time.perf_counter()
            print(f"Time taken to put first fragment: {end - start}")

            for _, fragment in gen:
                audio_queue.put(fragment)
        finally:
            # Always send the sentinel, even if generation failed midway,
            # so the non-daemon playback thread can exit.  Joining the
            # thread is sufficient; joining the queue as well could block
            # forever if the playback thread had died early.
            audio_queue.put(None)
            playback_thread.join()

        print("Audio playback complete")

# Run the interactive demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()