# GPT-SoVITS/infer_script.py

'''
Example inference script that streams llama.cpp chat completions into the
GPT-SoVITS TTS pipeline and plays the audio as it is synthesized; kept mainly
as a personal reference for future batching experiments.
'''
import os
import sys
import numpy as np
import soundfile as sf
import threading
import queue
import sounddevice as sd
import time
import speech_recognition as sr
# Ensure that GPT_SoVITS is in the Python path
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
from llama_cpp import Llama
# Initialize the Llama model
llm = Llama(
    model_path="ggml-model-q8_0.gguf",
    n_gpu_layers=-1,        # Offload all layers to the GPU (0 = CPU only)
    seed=1337,              # Fixed seed for reproducible sampling
    n_ctx=2048,             # Context window size in tokens
    chat_format="llama-3",
    verbose=False
)
def generate_chat_completion_openai_v1_stream(messages):
    """Stream assistant tokens from the local Llama model."""
    start = time.time()
    stream = llm.create_chat_completion_openai_v1(
        messages=messages,
        temperature=0.8,   # Adjust temperature as needed
        top_p=0.95,        # Adjust top_p as needed
        top_k=40,          # Adjust top_k as needed
        max_tokens=50,     # Adjust the maximum number of tokens as needed
        # stop=["\n"],     # Adjust the stop sequence as needed
        stream=True        # Enable streaming
    )
    end = time.time()
    # This only measures call setup; tokens are generated lazily while the
    # stream is iterated below.
    print(end - start)
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            yield chunk.choices[0].delta.content
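# Hedged usage sketch: the generator can be consumed directly, assuming the
# model above loaded successfully (messages follow the OpenAI chat format):
# for token in generate_chat_completion_openai_v1_stream(
#         [{"role": "user", "content": "Say hello."}]):
#     print(token, end="", flush=True)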
def audio_playback_thread(audio_queue, sample_rate):
    """
    Audio playback thread that plays audio fragments from the queue.
    """
    sd.default.samplerate = sample_rate
    sd.default.channels = 1
    stream = sd.OutputStream(dtype='float32')
    stream.start()
    try:
        while True:
            # Get the next audio fragment
            audio_fragment = audio_queue.get()
            try:
                if audio_fragment is None:
                    # Sentinel value received, exit the loop
                    break
                # Write the audio fragment to the stream
                stream.write(audio_fragment)
            finally:
                # Mark the item as processed
                audio_queue.task_done()
    finally:
        stream.stop()
        stream.close()
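# Hedged usage sketch of the queue/sentinel contract this thread expects
# (fragments are float32 numpy arrays; the 32000 Hz rate here is only an
# illustrative assumption):
# q = queue.Queue()
# t = threading.Thread(target=audio_playback_thread, args=(q, 32000))
# t.start()
# q.put(np.zeros(32000, dtype='float32'))  # one second of silence
# q.put(None)                              # sentinel: tells the thread to exit
# q.join()
# t.join()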
def main():
    config_path = 'configs/tts_infer.yaml'
    # GPT_model_path = 'pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt'
    GPT_model_path = 'custom_trained.ckpt'
    # SoVITS_model_path = 'pretrained_models/gsv-v2final-pretrained/s2G2333k.pth'
    SoVITS_model_path = 'custom_trained.pth'
    ref_audio_path = 'ref_audio.wav'
    ref_text = 'でもなんか対処法ではないよなこれ対処法ではないけどそもそもの話みたいなことを言ってんのか'
    target_text = """hahahaha, well well, let me tell you about that! it was perhaps the most exquisite day of my life! Phew, I've never had one better! """
    output_path = 'output'
    ref_language = 'ja'
    target_language = 'ja'  # Note: target_text above is English; use 'en' when synthesizing English

    # Ensure output directory exists
    os.makedirs(output_path, exist_ok=True)

    # Initialize TTS configuration and pipeline
    tts_config = TTS_Config(config_path)
    tts_pipeline = TTS(tts_config)

    # Load model weights
    tts_pipeline.init_t2s_weights(GPT_model_path)
    tts_pipeline.init_vits_weights(SoVITS_model_path)

    # Prepare inputs for TTS
    inputs = {
        "text": target_text,
        "text_lang": target_language.lower(),
        "ref_audio_path": ref_audio_path,
        "prompt_text": ref_text,
        "prompt_lang": ref_language.lower(),
        "top_k": 5,
        "top_p": 1.0,
        "temperature": 1.0,
        "text_split_method": "cut0",  # "cut0" = no text splitting
        "batch_size": 1,
        "batch_threshold": 0.75,
        "split_bucket": True,
        "speed_factor": 1.0,
        "fragment_interval": 0.3,
        "seed": 2855904637,
        "return_fragment": True,      # Yield audio fragments as they are ready
        "parallel_infer": False,
        "repetition_penalty": 1.35,
    }
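    # With "return_fragment": True, run_generator yields (sampling_rate,
    # audio_fragment) tuples incrementally; the streaming loop below consumes
    # them and feeds the playback queue.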
    system_message = '''You are a friendly AI named Vivy.
HOW YOU SHOULD RESPOND:
- Responses should contain spoken words only; for example, *laughs* should be replaced with haha
'''

    # Initialize conversation history with the system message
    conversation_history = [
        {"role": "system", "content": system_message}
    ]

    # Create a queue for audio fragments
    audio_queue = queue.Queue(maxsize=100)  # Adjust maxsize based on your needs

    # Start the audio playback thread
    playback_thread = threading.Thread(
        target=audio_playback_thread,
        args=(audio_queue, tts_pipeline.configs.sampling_rate)
    )
    playback_thread.start()

    # Set up speech recognition
    r = sr.Recognizer()
    mic = sr.Microphone()
    try:
        while True:
            # Prompt for speech input instead of text input
            while True:
                print("\nPlease speak your message (say 'quit' to exit):")
                with mic as source:
                    # Adjust for ambient noise to improve recognition accuracy
                    r.adjust_for_ambient_noise(source, duration=1.0)
                    print("Listening...")
                    audio_data = r.listen(source, timeout=None, phrase_time_limit=60)
                try:
                    # Transcribe locally with Whisper (requires the openai-whisper package)
                    user_input = r.recognize_whisper(audio_data=audio_data, model="base")
                    print("You said: " + user_input)
                    # Check that the input is not empty or just whitespace
                    if user_input.strip() == "":
                        print("No speech detected. Please try again.")
                        continue  # Keep listening
                    break  # Valid input received, exit inner loop
                except sr.UnknownValueError:
                    print("Sorry, I could not understand the audio. Please try again.")
                    continue  # Keep listening
                except sr.RequestError as e:
                    print(f"Could not request results from speech recognition service; {e}")
                    continue  # Keep listening

            # Check if the user wants to quit
            if user_input.lower() == "quit":
                print("Exiting the application. Goodbye!")
                sys.exit()
            # Append user message to conversation history
            conversation_history.append({"role": "user", "content": user_input})

            # Initialize variables to track character count and buffering
            buffer = ""
            char_count = 0
            waiting_for_punctuation = False
            assistant_buffer = ""

            # Generate and print the chat completion with streaming
            for token in generate_chat_completion_openai_v1_stream(conversation_history):
                print(token, end="", flush=True)  # Print each token as it's generated
                buffer += token
                assistant_buffer += token
                char_count += len(token)
                if not waiting_for_punctuation:
                    if char_count >= 100:
                        waiting_for_punctuation = True  # Start looking for punctuation
                else:
                    if any(punct in token for punct in ['.', '!', '?']):
                        # Send the buffered text to TTS
                        inputs["text"] = buffer
                        synthesis_result = tts_pipeline.run_generator(inputs)
                        # Consume the generator and put audio fragments into the queue
                        for sampling_rate, audio_fragment in synthesis_result:
                            audio_queue.put(audio_fragment)
                        # Put silence into the audio queue after the TTS synthesis generator has finished
                        silence_duration = 0.5  # in seconds
                        num_samples = int(sampling_rate * silence_duration)
                        silence = np.zeros(num_samples, dtype='float32')
                        audio_queue.put(silence)
                        # Reset counters and buffer
                        char_count = 0
                        buffer = ""
                        waiting_for_punctuation = False

            # Append the full assistant message to conversation history
            conversation_history.append({"role": "assistant", "content": assistant_buffer})
            # Handle any remaining text after the generator is done (it is
            # already part of assistant_buffer above, so it only needs to be
            # spoken, not appended to the history a second time)
            if buffer.strip():
                inputs["text"] = buffer
                synthesis_result = tts_pipeline.run_generator(inputs)
                # Consume the generator and put audio fragments into the queue
                for sampling_rate, audio_fragment in synthesis_result:
                    audio_queue.put(audio_fragment)
                # Put silence into the audio queue after the TTS synthesis generator has finished
                silence_duration = 0.5  # in seconds
                num_samples = int(sampling_rate * silence_duration)
                silence = np.zeros(num_samples, dtype='float32')
                audio_queue.put(silence)
                buffer = ""
                char_count = 0
                waiting_for_punctuation = False
    finally:
        # After all processing is done, send a sentinel to the audio queue and
        # wait for the playback thread to finish
        audio_queue.put(None)
        audio_queue.join()
        playback_thread.join()
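    # Alternative one-shot path, kept for reference: synthesize everything in
    # a single pass and save it to disk. This assumes "return_fragment" in
    # `inputs` is set to False so the generator yields a single tuple.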
    # text = input("GO:")
    # inputs["text"] = text
    # synthesis_result = tts_pipeline.run_generator(inputs)
    # audio_data_list = list(synthesis_result)
    # if audio_data_list:
    #     # Since return_fragment is False, we expect only one tuple in audio_data_list
    #     sampling_rate, audio_data = audio_data_list[0]
    #     output_wav_path = os.path.join(output_path, "output.wav")
    #     # Save the audio data to a WAV file
    #     sf.write(output_wav_path, audio_data, sampling_rate)
    #     print(f"Audio saved to {output_wav_path}")
    # else:
    #     print("No audio data generated.")
if __name__ == '__main__':
    main()