Merge pull request #3 from JarodMica/upstream-sync

update streaming and add reference code
2025-12-17 10:27:08 +08:00 · 2025-05-02 11:35:21 -07:00 · 2025-05-02 11:35:21 -07:00 · df6da098f5
commit df6da098f5
parent 18ae7b1751 b5f89b1ef8
8 changed files with 207 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -21,6 +21,7 @@ ffprobe*
 cfg.json
 speakers.json
 ref_audios
+local_files/
 tools/AP_BWE_main/24kto48k/*
 !tools/AP_BWE_main/24kto48k/readme.txt

--- a/GPT_SoVITS/AR/models/t2s_model.py
+++ b/GPT_SoVITS/AR/models/t2s_model.py
@ -941,6 +941,8 @@ class Text2SemanticDecoder(nn.Module):
        prompts: torch.LongTensor,
        bert_feature: torch.LongTensor,
        cumulation_amount: int,
+        dynamic_cumulatation: bool,
+        dynamic_cumulatation_amount: int,
        top_k: int = -100,
        top_p: int = 100,
        early_stop_num: int = -1,
@ -1038,6 +1040,8 @@ class Text2SemanticDecoder(nn.Module):

            if tokens_since_last_yield >= cumulation_amount:
                generated_tokens = y[:, last_yield_idx:]
+                if dynamic_cumulatation:
+                    cumulation_amount += dynamic_cumulatation_amount
                yield generated_tokens
                last_yield_idx = y.shape[1]
                tokens_since_last_yield = 0
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -1604,6 +1604,8 @@ class TTS:
        search_length = inputs.get("search_length", 32000 * 5)
        num_zeroes = inputs.get("num_zeroes", 5)
        cumulation_amount = inputs.get("cumulation_amount", 50)
+        dynamic_cumulatation = inputs.get("dynamic_cumulatation", False)
+        dynamic_cumulatation_amount = inputs.get("dynamic_cumulatation_amount", 10)
        # Prepare reference audio
        if ref_audio_path and ref_audio_path != self.prompt_cache["ref_audio_path"]:
            if not os.path.exists(ref_audio_path):
@ -1672,6 +1674,8 @@ class TTS:
            prompt_sem,
            all_bert.unsqueeze(0).to(self.configs.device),
            cumulation_amount=cumulation_amount,
+            dynamic_cumulatation=dynamic_cumulatation,
+            dynamic_cumulatation_amount=dynamic_cumulatation_amount,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -684,8 +684,12 @@ def get_tts_wav(
    sample_steps=8,
    if_sr=False,
    pause_second=0.3,
+    seed_checkbox=False,
+    seed_text_box=None,
 ):
    global cache
+    if seed_checkbox:
+        set_seed(seed_text_box)
    if ref_wav_path:
        pass
    else:
@ -1204,6 +1208,20 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                temperature = gr.Slider(
                    minimum=0, maximum=1, step=0.05, label=i18n("temperature"), value=1, interactive=True, scale=1
                )
+                seed_checkbox = gr.Checkbox(
+                    label="Seed",
+                    value=False,
+                    interactive=True,
+                    scale=1,
+                )
+                seed_text_box = gr.Textbox(
+                    label="Seed",
+                    value="-1",
+                    lines=1,
+                    max_lines=1,
+                    scale=1,
+                )
+                
            # with gr.Column():
            #     gr.Markdown(value=i18n("手工调整音素。当音素框不为空时使用手工音素输入推理，无视目标文本框。"))
            #     phoneme=gr.Textbox(label=i18n("音素框"), value="")
@ -1231,6 +1249,8 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
                sample_steps,
                if_sr_Checkbox,
                pause_second_slider,
+                seed_checkbox,
+                seed_text_box,
            ],
            [output],
        )
--- a/GPT_SoVITS/text/ja_userdic/user.dict
+++ b/GPT_SoVITS/text/ja_userdic/user.dict
--- a/GPT_SoVITS/text/ja_userdic/userdict.md5
+++ b/GPT_SoVITS/text/ja_userdic/userdict.md5
@ -0,0 +1 @@
+878b3caf4d1cd7c2927c26e85072a2f5
--- a/call_example.py
+++ b/call_example.py
@ -0,0 +1,65 @@
+'''
+If you want to generate a single audio file with GPT-SoVITS, you can use this script.
+The run() method of the TTS class in GPT_SoVITS/TTS_infer_pack/TTS.py generates the audio. It is a generator function, so it must be consumed with a loop.
+'''
+
+import os
+import sys
+import queue
+import threading
+import numpy as np
+import sounddevice as sd
+import wave
+import soundfile as sf
+
+# Add the current working directory and its GPT_SoVITS subdirectory to the
+# import search path so the GPT_SoVITS import below can be resolved.
+# NOTE(review): this relies on the working directory, so the script is
+# presumably meant to be launched from the repository root -- confirm.
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
+# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
+from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+
+def main():
+    """
+    Load the TTS pipeline once, then repeatedly wait for Enter and synthesize.
+
+    Every (sample rate, fragment) pair yielded by pipeline.run() is written to
+    its own "inference_<n>.wav" file inside the output directory, using the
+    lowest index (counting up from 0 for each generation) that does not already
+    exist on disk. The loop never terminates on its own.
+    """
+    # Inference outputs are collected in this directory (created on demand)
+    output_dir = "tts_outputs"
+    os.makedirs(output_dir, exist_ok=True)
+
+    seed = 1 # -1 is random seed
+    text = "Hey there! This is a test of the TTS streaming.  Is there anything that I can do to help you out? Or maybe you'd just like a quick snack...  If not, that's okay too.  I'm just here to chat and maybe become friends as I don't meet many people in this world." # text to be converted to audio
+    prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain.  All part of the cycle of life.' # prompt text
+    ref_audio = 'local_files/test.wav' # path to the reference audio file
+    vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth' # path to the vits checkpoint
+    t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt' # path to the t2s checkpoint
+    config_path = 'GPT_SoVITS/configs/tts_infer.yaml' # path to the config file
+
+    # Build the pipeline from the config, then load both checkpoints
+    pipeline = TTS(TTS_Config(config_path))
+    pipeline.init_t2s_weights(t2s_ckpt)
+    pipeline.init_vits_weights(vits_ckpt)
+
+    inputs = {
+        "text": text,
+        "text_lang": "en",
+        "ref_audio_path": ref_audio,
+        "prompt_text": prompt_text,
+        "prompt_lang": "en",
+        "top_k": 5,
+        "top_p": 1.0,
+        "temperature": 1.0,
+        "sample_steps": 10,
+        "seed": seed,
+    }
+
+    while True:
+        input("Enter to generate audio")
+        file_no = 0
+        for sample_rate, chunk in pipeline.run(inputs):
+            # Advance to the first file name that is not taken yet
+            while True:
+                target = os.path.join(output_dir, f"inference_{file_no}.wav")
+                if not os.path.exists(target):
+                    break
+                file_no += 1
+            sf.write(target, chunk, sample_rate)
+
+if __name__ == '__main__':
+    main()
--- a/zc_streaming_example.py
+++ b/zc_streaming_example.py
@ -0,0 +1,112 @@
+import os
+import sys
+import queue
+import threading
+import numpy as np
+import sounddevice as sd
+import wave
+import time
+
+# Add the current working directory and its GPT_SoVITS subdirectory to the
+# import search path so the GPT_SoVITS import below can be resolved.
+# NOTE(review): this relies on the working directory, so the script is
+# presumably meant to be launched from the repository root -- confirm.
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+sys.path.append(os.path.join(now_dir, 'GPT_SoVITS'))
+# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
+
+from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
+
+def audio_playback_thread(audio_queue: queue.Queue, sample_rate: int):
+    """
+    Worker that drains `audio_queue` and writes each fragment to one
+    continuously open sounddevice OutputStream.
+
+    A `None` item is the shutdown sentinel. Every item taken from the queue,
+    the sentinel included, is acknowledged with task_done() so that a
+    Queue.join() in the producer can return.
+    """
+    # These are sounddevice's module-level defaults; the stream below is
+    # created with only dtype given explicitly.
+    sd.default.samplerate = sample_rate
+    sd.default.channels = 1
+    stream = sd.OutputStream(dtype='float32')
+    stream.start()
+    try:
+        running = True
+        while running:
+            item = audio_queue.get()
+            try:
+                if item is not None:
+                    # Scale by 1/32768 into float32
+                    # (assumes 16-bit integer PCM input -- TODO confirm)
+                    stream.write(item.astype(np.float32) / 32768.0)
+                else:
+                    # Sentinel received, leave the loop
+                    running = False
+            finally:
+                audio_queue.task_done()
+    finally:
+        stream.stop()
+        stream.close()
+        print("Playback finished")
+
+def main():
+    """
+    Interactive streaming demo: load the TTS pipeline once, then on every
+    Enter press synthesize `text` with pipeline.run_generator() and play the
+    fragments through a background playback thread as they arrive.
+
+    The first fragment is fetched up front because its sample rate is needed
+    to start the playback thread. It is queued together with the second
+    fragment, or on its own when the generator yields only one fragment.
+    The loop exits only when the generator yields nothing at all.
+    """
+    config_path = 'GPT_SoVITS/configs/tts_infer.yaml' # path to the config file
+    t2s_ckpt = 'GPT_SoVITS/pretrained_models/s1v3.ckpt' # path to the t2s checkpoint
+    vits_ckpt = 'GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth' # path to the vits checkpoint
+    ref_audio = 'local_files/test.wav' # path to the reference audio file
+    prompt_text = 'Flesh rots, carrion feeds the scavengers, and the bones remain.  All part of the cycle of life.' # reference_audio transcription
+    text = "Today we are going to be testing TTS streaming audio.  Is there anything that I can do to help you out? Or maybe you'd just like a quick snack...  If not, that's okay too.  I'm just here to chat and maybe become friends as I don't meet many people in this world." # text to be converted to audio
+    seed = 1 # -1 is random seed
+
+    # Initialize the pipeline
+    cfg = TTS_Config(config_path)
+    pipeline = TTS(cfg)
+    pipeline.init_t2s_weights(t2s_ckpt)
+    pipeline.init_vits_weights(vits_ckpt)
+
+    inputs = {
+        "text": text,
+        "text_lang": "en",
+        "ref_audio_path": ref_audio,
+        "prompt_text": prompt_text,
+        "prompt_lang": "en",
+        "top_k": 5,
+        "top_p": 1.0,
+        "temperature": 1.0,
+        "cumulation_amount": 10,
+        "search_length": 32000*2,
+        "num_zeroes": 5,
+        "sample_steps": 8,
+        "dynamic_cumulatation": True,
+        "dynamic_cumulatation_amount": 20,
+        "seed": seed,
+    }
+
+    while True:
+        input("enter to continue")
+        fragments = []
+        # Initialize generator and fetch first fragment to get sample rate
+        gen = pipeline.run_generator(inputs)
+        start = time.time()
+        try:
+            sr, fragment = next(gen)
+        except StopIteration:
+            print("No audio fragments generated.")
+            break
+        fragments.append(fragment)
+
+        # Create audio playback queue and start thread with sample rate
+        audio_queue = queue.Queue()
+        playback_thread = threading.Thread(
+            target=audio_playback_thread, args=(audio_queue, sr)
+        )
+        playback_thread.start()
+
+        try:
+            for sr, fragment in gen:
+                if len(fragments) == 1:
+                    # Second fragment is ready: release the held-back first one
+                    audio_queue.put(fragments[0])
+                    end = time.time()
+                    print(f"Time taken to put first fragment: {end - start}")
+                audio_queue.put(fragment)
+                fragments.append(fragment)
+
+            if len(fragments) == 1:
+                # The generator produced a single fragment, so the loop body
+                # never ran; queue it here or it would never be played.
+                audio_queue.put(fragments[0])
+                end = time.time()
+                print(f"Time taken to put first fragment: {end - start}")
+        finally:
+            # Always signal the playback thread to finish and wait for it,
+            # even if the generator raised; otherwise the thread would block
+            # on the queue forever and keep the process alive.
+            audio_queue.put(None)
+            audio_queue.join()
+            playback_thread.join()
+
+        print("Audio playback complete")
+
+if __name__ == '__main__':
+    main()