diff --git a/GPT_SoVITS/TTS_infer_pack/TTS.py b/GPT_SoVITS/TTS_infer_pack/TTS.py
index c7ae465c..bd811d8a 100644
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@@ -323,7 +323,7 @@ class TTS_Config:
         assert isinstance(configs, dict)
         configs_ = deepcopy(self.default_configs)
         configs_.update(configs)
-        self.configs: dict = configs_.get("custom", configs_["v2"])
+        self.configs: dict = configs_.get("custom", configs_["v2ProPlus"])
         self.default_configs = deepcopy(configs_)
 
         self.device = self.configs.get("device", torch.device("cpu"))
@@ -1872,19 +1872,19 @@ class TTS:
             self.init_sr_model()
             if not self.sr_model_not_exist:
                 audio, sr = self.sr_model(audio.unsqueeze(0), sr)
-                max_audio = np.abs(audio).max()
+                if isinstance(audio, torch.Tensor):
+                    max_audio = float(torch.abs(audio).max().item())
+                else:
+                    max_audio = float(np.abs(audio).max())
                 if max_audio > 1:
                     audio /= max_audio
-            audio = (audio * 32768).astype(np.int16)
             t2 = time.perf_counter()
             print(f"超采样用时：{t2 - t1:.3f}s")
+        if isinstance(audio, torch.Tensor):
+            audio = audio.detach().float().cpu().numpy()
         else:
-            # audio = audio.float() * 32768
-            # audio = audio.to(dtype=torch.int16).clamp(-32768, 32767).cpu().numpy()
-
-            audio = audio.cpu().numpy()
-
-        audio = (audio * 32768).astype(np.int16)
+            audio = np.asarray(audio)
+        audio = (audio.reshape(-1) * 32768).astype(np.int16)
 
 
         # try:
@@ -2036,20 +2036,23 @@ class TTS:
         phones: torch.Tensor,
         prompt_semantic: torch.Tensor,
         prompt_phones: torch.Tensor,
-        refer_spec: tuple,
+        refer_spec: tuple | List[tuple],
         raw_audio: torch.Tensor,
         raw_sr: int,
         speed: float = 1.0,
         sample_steps: int = 32,
     ):
-        refer_audio_spec, audio_tensor = refer_spec
+        refer_specs = list(refer_spec) if isinstance(refer_spec, list) else [refer_spec]
+        refer_audio_spec, audio_tensor = refer_specs[0]
         if not self.configs.use_vocoder:
-            refer_audio_spec_list = [refer_audio_spec.to(dtype=self.precision, device=self.configs.device)]
+            refer_audio_spec_list = [item[0].to(dtype=self.precision, device=self.configs.device) for item in refer_specs]
             sv_emb = None
             if self.is_v2pro:
-                if audio_tensor is None:
-                    raise ValueError(i18n("v2Pro request-local synthesis 缺少 16k 参考音频"))
-                sv_emb = self.sv_model.compute_embedding3(audio_tensor).to(self.configs.device)
+                sv_emb = []
+                for _, audio_tensor_item in refer_specs:
+                    if audio_tensor_item is None:
+                        raise ValueError(i18n("v2Pro request-local synthesis 缺少 16k 参考音频"))
+                    sv_emb.append(self.sv_model.compute_embedding3(audio_tensor_item).to(self.configs.device))
             return self.vits_model.decode(
                 semantic_tokens,
                 phones,
@@ -2075,7 +2078,7 @@ class TTS:
         self,
         semantic_tokens_list: List[torch.Tensor],
         phones_list: List[torch.Tensor],
-        refer_specs: List[tuple],
+        refer_specs: List[tuple | List[tuple]],
         speeds: List[float] | None = None,
         sample_steps_list: List[int] | None = None,
     ) -> List[torch.Tensor]:
@@ -2118,7 +2121,11 @@ class TTS:
             semantic_lengths.append(semantic_len)
             phone_lengths.append(phone_len)
 
-            refer_audio_spec, audio_tensor = refer_specs[batch_index]
+            refer_spec_item = refer_specs[batch_index]
+            refer_spec_group = list(refer_spec_item) if isinstance(refer_spec_item, list) else [refer_spec_item]
+            if len(refer_spec_group) != 1:
+                raise ValueError("batched request-local synthesis 暂不支持单请求多参考音频")
+            refer_audio_spec, audio_tensor = refer_spec_group[0]
             refer_audio_specs.append(refer_audio_spec.to(dtype=self.precision, device=device))
             if self.is_v2pro:
                 if audio_tensor is None:
diff --git a/GPT_SoVITS/TTS_infer_pack/prepare_coordinator.py b/GPT_SoVITS/TTS_infer_pack/prepare_coordinator.py
index 306b1b45..06a5e1b8 100644
--- a/GPT_SoVITS/TTS_infer_pack/prepare_coordinator.py
+++ b/GPT_SoVITS/TTS_infer_pack/prepare_coordinator.py
@@ -12,6 +12,7 @@ from GPT_SoVITS.TTS_infer_pack.t2s_scheduler import (
     PreparedTextFeatures,
     SchedulerRequestSpec,
     T2SRequestState,
+    build_empty_text_features,
     build_request_state_from_parts,
     normalize_sentence,
 )
@@ -118,6 +119,21 @@ class PrepareCoordinator:
     def _prepare_text_cpu(self, text: str, language: str):
         return self.tts.prepare_text_segments(text, language)
 
+    @staticmethod
+    def _build_empty_text_features_like(reference: PreparedTextFeatures | None = None) -> PreparedTextFeatures:
+        """Return zero-length text features sized like ``reference`` (1024-dim float32 when unavailable)."""
+        import torch  # function-scope import: no module-level torch import is visible in this hunk
+
+        feature_dim, dtype = 1024, torch.float32
+        bert_features = getattr(reference, "bert_features", None)
+        if bert_features is not None:
+            try:
+                feature_dim, dtype = int(bert_features.shape[0]), bert_features.dtype
+            except (AttributeError, IndexError, TypeError, ValueError):
+                # Best effort: features of an unexpected shape/type keep the defaults above.
+                pass
+        return build_empty_text_features(feature_dim=feature_dim, dtype=dtype or torch.float32)
+
     def _build_text_features(self, prepared_segments, language: str, cpu_run_ms: float) -> PreparedTextFeatures:
         profile: Dict[str, float] = {"cpu_preprocess_ms": float(cpu_run_ms)}
         branch_start = time.perf_counter()
@@ -139,6 +155,9 @@ class PrepareCoordinator:
         return await loop.run_in_executor(executor, self._run_profiled, fn, float(submit_at), *args)
 
     async def _run_text_cpu_stage(self, text: str, language: str) -> ProfiledResult:
+        if text in [None, ""]:
+            submit_at = time.perf_counter()
+            return ProfiledResult(result=[], submit_at=submit_at, started_at=submit_at, finished_at=submit_at)
         executor = getattr(self.tts, "prepare_text_cpu_executor", None)
         if executor is None:
             submit_at = time.perf_counter()
@@ -164,19 +183,71 @@ class PrepareCoordinator:
         prompt_cpu_run_ms: float,
         target_cpu_run_ms: float,
     ) -> tuple[ProfiledResult, ProfiledResult]:
+        prompt_is_empty = len(prompt_segments or []) == 0
         if self.text_feature_executor is not None:
-            prompt_feature_task = asyncio.create_task(
-                self._run_text_feature_stage(prompt_segments, None, prompt_cpu_run_ms)
+            target_feature_task = asyncio.create_task(self._run_text_feature_stage(target_segments, None, target_cpu_run_ms))
+            if not prompt_is_empty:
+                prompt_feature_task = asyncio.create_task(self._run_text_feature_stage(prompt_segments, None, prompt_cpu_run_ms))
+                return await asyncio.gather(prompt_feature_task, target_feature_task)
+            target_profiled = await target_feature_task
+            submit_at = time.perf_counter()
+            prompt_profiled = ProfiledResult(
+                result=self._build_empty_text_features_like(target_profiled.result),
+                submit_at=float(submit_at),
+                started_at=float(submit_at),
+                finished_at=float(submit_at),
             )
-            target_feature_task = asyncio.create_task(
-                self._run_text_feature_stage(target_segments, None, target_cpu_run_ms)
-            )
-            return await asyncio.gather(prompt_feature_task, target_feature_task)
+            return prompt_profiled, target_profiled
 
-        prompt_profile: Dict[str, float] = {"cpu_preprocess_ms": float(prompt_cpu_run_ms)}
         target_profile: Dict[str, float] = {"cpu_preprocess_ms": float(target_cpu_run_ms)}
         submit_at = time.perf_counter()
         started_at = float(submit_at)
+        if prompt_is_empty:
+            target_result_raw = await self.tts.build_text_features_from_segments_async(
+                target_segments,
+                profile=target_profile,
+            )
+            prompt_result = self._build_empty_text_features_like(
+                PreparedTextFeatures(
+                    phones=target_result_raw[0],
+                    bert_features=target_result_raw[1],
+                    norm_text=target_result_raw[2],
+                    profile=target_profile,
+                    total_ms=float(target_cpu_run_ms + self._estimate_text_feature_run_ms(target_profile)),
+                    cpu_preprocess_ms=float(target_cpu_run_ms),
+                )
+            )
+            finished_at = time.perf_counter()
+            prompt_profiled = ProfiledResult(
+                result=prompt_result,
+                submit_at=float(submit_at),
+                started_at=float(submit_at),
+                finished_at=float(submit_at),
+            )
+            target_result = PreparedTextFeatures(
+                phones=target_result_raw[0],
+                bert_features=target_result_raw[1],
+                norm_text=target_result_raw[2],
+                profile=target_profile,
+                total_ms=float(target_cpu_run_ms + self._estimate_text_feature_run_ms(target_profile)),
+                cpu_preprocess_ms=float(target_cpu_run_ms),
+            )
+            target_profiled = ProfiledResult(
+                result=target_result,
+                submit_at=float(submit_at),
+                started_at=started_at,
+                finished_at=float(submit_at + self._estimate_text_feature_run_ms(target_profile) / 1000.0),
+            )
+            if finished_at > target_profiled.finished_at:
+                target_result.profile["bert_total_ms"] = max(
+                    self._estimate_text_feature_run_ms(target_profile),
+                    (finished_at - submit_at) * 1000.0,
+                )
+            else:
+                target_result.profile["bert_total_ms"] = self._estimate_text_feature_run_ms(target_profile)
+            return prompt_profiled, target_profiled
+
+        prompt_profile: Dict[str, float] = {"cpu_preprocess_ms": float(prompt_cpu_run_ms)}
         prompt_result_raw, target_result_raw = await self.tts.build_text_feature_pair_from_segments_async(
             prompt_segments,
             target_segments,
diff --git a/GPT_SoVITS/TTS_infer_pack/t2s_scheduler.py b/GPT_SoVITS/TTS_infer_pack/t2s_scheduler.py
index 8aabd286..78dbfc36 100644
--- a/GPT_SoVITS/TTS_infer_pack/t2s_scheduler.py
+++ b/GPT_SoVITS/TTS_infer_pack/t2s_scheduler.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+import os
 from pathlib import Path
 import time
 from typing import Any, Dict, List, Optional, Sequence, Tuple
@@ -35,6 +36,7 @@ class SchedulerRequestSpec:
     temperature: float
     repetition_penalty: float
     early_stop_num: int
+    aux_ref_audio_paths: List[str] = field(default_factory=list)
     ready_step: int = 0
 
 
@@ -54,6 +56,7 @@ class T2SRequestState:
     all_bert_features: torch.Tensor
     prompt_semantic: torch.LongTensor
     refer_spec: Tuple[torch.Tensor, Optional[torch.Tensor]]
+    aux_refer_specs: List[Tuple[torch.Tensor, Optional[torch.Tensor]]]
     raw_audio: torch.Tensor
     raw_sr: int
     top_k: int
@@ -113,6 +116,21 @@ class PreparedTextFeatures:
     cpu_preprocess_ms: float
 
 
+def build_empty_text_features(
+    *,
+    feature_dim: int = 1024,
+    dtype: torch.dtype = torch.float32,
+) -> PreparedTextFeatures:
+    """Build placeholder features for empty text: no phones and a ``(feature_dim, 0)`` BERT tensor."""
+    empty_bert = torch.empty((int(feature_dim), 0), dtype=dtype)
+    zero_profile = {"cpu_preprocess_ms": 0.0, "bert_total_ms": 0.0}
+    return PreparedTextFeatures(
+        phones=[], bert_features=empty_bert, norm_text="",
+        profile=zero_profile,
+        total_ms=0.0, cpu_preprocess_ms=0.0,
+    )
+
+
 def normalize_sentence(text: str, language: str) -> str:
     text = text.strip("\n").strip()
     if not text:
@@ -171,6 +189,14 @@ def build_request_state_from_parts(
     bundle_profile = ref_audio_bundle.get("profile", {})
     prompt_semantic = ref_audio_bundle["prompt_semantic"].long()
     spec_audio, audio_16k = ref_audio_bundle["refer_spec"]
+    aux_refer_specs: List[Tuple[torch.Tensor, Optional[torch.Tensor]]] = []
+    for aux_ref_audio_path in list(getattr(spec, "aux_ref_audio_paths", []) or []):
+        if aux_ref_audio_path in [None, ""]:
+            continue
+        if not os.path.exists(str(aux_ref_audio_path)):
+            continue
+        aux_spec_audio, aux_audio_16k, _, _ = tts.extract_ref_spec(str(aux_ref_audio_path))
+        aux_refer_specs.append((aux_spec_audio, aux_audio_16k))
     raw_audio = ref_audio_bundle["raw_audio"]
     raw_sr = int(ref_audio_bundle["raw_sr"])
     prompt_semantic_ms = float(bundle_profile.get("prompt_semantic_ms", ref_audio_bundle_ms))
@@ -182,9 +208,9 @@ def build_request_state_from_parts(
     phones_tensor = torch.LongTensor(target_result.phones).to(tts.configs.device)
     prompt_phones_tensor = torch.LongTensor(prompt_result.phones).to(tts.configs.device)
     all_phones = torch.LongTensor(prompt_result.phones + target_result.phones).to(tts.configs.device)
-    all_bert_features = torch.cat([prompt_result.bert_features, target_result.bert_features], dim=1).to(
-        dtype=tts.precision, device=tts.configs.device
-    )
+    prompt_bert_features = prompt_result.bert_features.to(dtype=tts.precision, device=tts.configs.device)
+    target_bert_features = target_result.bert_features.to(dtype=tts.precision, device=tts.configs.device)
+    all_bert_features = torch.cat([prompt_bert_features, target_bert_features], dim=1)
     _sync_device(device)
     tensorize_ms = (time.perf_counter() - tensorize_start) * 1000.0
 
@@ -280,6 +306,7 @@ def build_request_state_from_parts(
         all_bert_features=all_bert_features,
         prompt_semantic=prompt_semantic,
         refer_spec=(spec_audio, audio_16k),
+        aux_refer_specs=aux_refer_specs,
         raw_audio=raw_audio,
         raw_sr=raw_sr,
         top_k=spec.top_k,
@@ -301,10 +328,16 @@ def prepare_request_state(
     prepare_sync_start = time.perf_counter()
     prompt_text = normalize_sentence(spec.prompt_text, spec.prompt_lang)
     text = spec.text.strip("\n")
-    prompt_result = prepare_text_features(tts, prompt_text, spec.prompt_lang)
     target_result = prepare_text_features(tts, text, spec.text_lang)
     if target_result.phones is None:
         raise ValueError(f"{spec.request_id} text preprocessing returned no phones")
+    if prompt_text in [None, ""]:
+        prompt_result = build_empty_text_features(
+            feature_dim=int(target_result.bert_features.shape[0]),
+            dtype=target_result.bert_features.dtype,
+        )
+    else:
+        prompt_result = prepare_text_features(tts, prompt_text, spec.prompt_lang)
     ref_audio_bundle = tts.extract_ref_audio_bundle(str(spec.ref_audio_path))
     return build_request_state_from_parts(
         tts=tts,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_api.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_api.py
index ca372d5d..0895cef4 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_api.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_api.py
@@ -119,6 +119,7 @@ class EngineApiFacade:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None,
@@ -131,6 +132,7 @@ class EngineApiFacade:
             speed_factor=speed_factor,
             sample_steps=sample_steps,
             media_type=media_type,
+            super_sampling=super_sampling,
             prepare_wall_ms=prepare_wall_ms,
             prepare_profile_total_ms=prepare_profile_total_ms,
             done_loop=done_loop,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_api_direct.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_api_direct.py
index b2a308df..f55da45d 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_api_direct.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_api_direct.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import asyncio
+import queue
+import threading
 import time
 import uuid
 from io import BytesIO
@@ -122,6 +124,173 @@ class EngineApiDirectFlow:
         payload["response_streaming"] = False
         return self.api._normalize_engine_request(payload, error_prefix="segment request 参数非法: ")
 
+    async def _execute_single_segment_scheduler_job(
+        self, normalized: NormalizedEngineRequest, *, segment_request: NormalizedEngineRequest
+    ) -> tuple[SchedulerPendingJob, Dict[str, Any]]:
+        """Prepare one segment, enqueue it for dispatch, and wait for its finished scheduler job.
+
+        Returns ``(job, prepare_timing_record)``; the wait is bounded by ``normalized.timeout_sec`` (30s if None).
+        """
+        spec = self.api._build_scheduler_submit_spec(segment_request)
+        state, prepare_exec_started_at, prepare_exec_finished_at = await self.api._prepare_state_via_engine_gpu_queue(
+            spec=spec, prepare_submit_at=time.perf_counter(), engine_request_id=None
+        )
+        prepare_wall_ms = max(0.0, (prepare_exec_finished_at - prepare_exec_started_at) * 1000.0)
+        prepare_profile_total_ms = float(state.prepare_profile.get("wall_total_ms", prepare_wall_ms))
+        loop = asyncio.get_running_loop()
+        done_future = loop.create_future()
+        await self.api._enqueue_prepared_state_for_dispatch(
+            state=state,
+            speed_factor=float(normalized.speed_factor), sample_steps=int(normalized.sample_steps),
+            media_type=normalized.media_type, super_sampling=bool(normalized.super_sampling),
+            prepare_wall_ms=prepare_wall_ms,
+            prepare_profile_total_ms=prepare_profile_total_ms,
+            done_loop=loop, done_future=done_future,
+            engine_request_id=None,
+            timeout_sec=normalized.timeout_sec,
+        )
+        timeout_sec = float(normalized.timeout_sec if normalized.timeout_sec is not None else 30.0)
+        try:
+            job: SchedulerPendingJob = await asyncio.wait_for(done_future, timeout=timeout_sec)
+        except asyncio.TimeoutError as exc:
+            # A bare TimeoutError stringifies to "", which would leave callers with an empty failure reason.
+            raise asyncio.TimeoutError(f"{spec.request_id} timed out after {timeout_sec:.1f}s waiting for audio") from exc
+        return job, {
+            "request_id": spec.request_id,
+            "prepare_wall_ms": prepare_wall_ms,
+            "prepare_profile_total_ms": prepare_profile_total_ms,
+            "prepare_profile": dict(state.prepare_profile),
+        }
+
+    def _iter_scheduler_direct_tts_bytes(self, normalized: NormalizedEngineRequest) -> Generator[bytes, None, None]:
+        """Stream a direct TTS request as packed audio bytes, one scheduler job per text segment.
+
+        Segments are synthesized in order on a private event loop in a daemon thread and handed to this
+        synchronous generator through a bounded queue; producer errors are re-raised to the consumer.
+        """
+        request_start = time.perf_counter()
+        request_id = normalized.request_id
+        media_type = normalized.media_type
+        segment_texts = self._segment_direct_text(normalized)
+        if not segment_texts:
+            raise ValueError("text preprocessing returned no valid segments")
+        chunk_queue: queue.Queue[object] = queue.Queue(maxsize=8)
+        done_marker = object()
+        # Set once the consumer stops iterating, so the producer never blocks forever on a full queue.
+        consumer_gone = threading.Event()
+        backend_meta = {"backend": "scheduler_v1_direct", "backend_mode": "scheduler_v1_direct"}
+
+        def _emit(item: object) -> bool:
+            """Queue ``item``; return False instead of blocking once the consumer is gone."""
+            while not consumer_gone.is_set():
+                try:
+                    chunk_queue.put(item, timeout=0.1)
+                    return True
+                except queue.Full:
+                    continue
+            return False
+
+        def _emit_chunk(chunk: bytes) -> int:
+            """Queue an audio chunk and return its size; abort production if nobody is listening."""
+            if not _emit(chunk):
+                raise RuntimeError("streaming consumer disconnected")
+            return len(chunk)
+
+        async def _produce_chunks() -> None:
+            self.api._update_request_state(
+                request_id, EngineStatus.CPU_PREPARING, {**backend_meta, "segment_count": len(segment_texts)}
+            )
+            sample_rate: int | None = None
+            current_media_type = media_type
+            stream_total_bytes = 0
+            first_chunk_ms: float | None = None
+            prepare_profiles: List[Dict[str, Any]] = []
+            worker_profiles: List[Dict[str, Any]] = []
+            try:
+                for segment_index, segment_text in enumerate(segment_texts):
+                    if consumer_gone.is_set():
+                        raise RuntimeError("streaming consumer disconnected")
+                    segment_request = self._build_segment_request(
+                        normalized, request_id=f"{request_id}_seg_{segment_index:03d}", text=segment_text
+                    )
+                    self.api._update_request_state(
+                        request_id,
+                        EngineStatus.READY_FOR_PREFILL,
+                        {**backend_meta, "segment_index": segment_index, "segment_count": len(segment_texts)},
+                    )
+                    job, prepare_profile = await self._execute_single_segment_scheduler_job(
+                        normalized,
+                        segment_request=segment_request,
+                    )
+                    prepare_profiles.append(prepare_profile)
+                    if job.error is not None:
+                        raise RuntimeError(job.error)
+                    if job.audio_data is None or job.sample_rate is None or job.result is None:
+                        raise RuntimeError(f"{job.request_id} finished without audio result")
+                    worker_profiles.append(dict(job.result))
+                    if sample_rate is None:
+                        sample_rate = int(job.sample_rate)
+                        first_chunk_ms = max(0.0, (time.perf_counter() - request_start) * 1000.0)
+                        self.api._update_request_state(
+                            request_id, EngineStatus.STREAMING, {**backend_meta, "sample_rate": int(sample_rate)}
+                        )
+                        if media_type == "wav":
+                            # Send the WAV header once; every later chunk is packed as "raw".
+                            stream_total_bytes += _emit_chunk(wave_header_chunk(sample_rate=int(sample_rate)))
+                            current_media_type = "raw"
+                    packed_chunk = pack_audio(BytesIO(), job.audio_data, int(job.sample_rate), current_media_type).getvalue()
+                    stream_total_bytes += _emit_chunk(packed_chunk)
+                    if segment_index + 1 < len(segment_texts):
+                        silence_samples = int(float(normalized.fragment_interval) * float(job.sample_rate))
+                        if silence_samples > 0:
+                            silence_chunk = np.zeros(silence_samples, dtype=np.int16)
+                            packed_silence = pack_audio(
+                                BytesIO(), silence_chunk, int(job.sample_rate), current_media_type
+                            ).getvalue()
+                            stream_total_bytes += _emit_chunk(packed_silence)
+            except Exception as exc:
+                self.api._fail_request_state(request_id, str(exc))
+                _emit(exc)
+            else:
+                prepare_aggregate = self.api._aggregate_numeric_dicts([item["prepare_profile"] for item in prepare_profiles])
+                dispatch_waits = {
+                    key: sum(float(item.get(key, 0.0)) for item in worker_profiles)
+                    for key in ("engine_policy_wait_ms", "engine_dispatch_wait_ms")
+                }
+                self.api._merge_request_state_profile(request_id, {"prepare_aggregate": prepare_aggregate, **dispatch_waits})
+                direct_profile = self.api._build_direct_scheduler_profile(
+                    backend="scheduler_v1_direct",
+                    request_start=request_start,
+                    response_ready_at=time.perf_counter(),
+                    audio_bytes=stream_total_bytes,
+                    sample_rate=int(sample_rate or 0),
+                    segment_texts=segment_texts,
+                    prepare_profiles=prepare_profiles,
+                    worker_profiles=worker_profiles,
+                    pack_ms=0.0,
+                    response_overhead_ms=0.0,
+                )
+                self.api._complete_request_state(
+                    request_id,
+                    dict(direct_profile, streaming_completed=True, first_chunk_ms=first_chunk_ms),
+                )
+            finally:
+                _emit(done_marker)
+
+        producer_thread = threading.Thread(target=lambda: asyncio.run(_produce_chunks()), daemon=True)
+        producer_thread.start()
+        try:
+            while True:
+                item = chunk_queue.get()
+                if item is done_marker:
+                    break
+                if isinstance(item, Exception):
+                    raise item
+                yield item
+        finally:
+            # Runs on exhaustion, on error, and on generator close (e.g. client disconnect): release the producer.
+            consumer_gone.set()
+
     async def _run_direct_tts_via_scheduler(self, normalized: NormalizedEngineRequest) -> DirectTTSExecution:
         request_start = time.perf_counter()
         request_id = normalized.request_id
@@ -129,63 +298,48 @@ class EngineApiDirectFlow:
         segment_texts = self._segment_direct_text(normalized)
         if not segment_texts:
             raise ValueError("text preprocessing returned no valid segments")
+        if normalized.response_streaming:
+            return DirectTTSExecution(
+                media_type=media_type,
+                streaming=True,
+                audio_generator=self._iter_scheduler_direct_tts_bytes(normalized),
+                request_id=request_id,
+            )
         self.api._update_request_state(
             request_id,
             EngineStatus.CPU_PREPARING,
             {"backend": "scheduler_v1_direct", "backend_mode": "scheduler_v1_direct", "segment_count": len(segment_texts)},
         )
-        segment_specs = []
-        for segment_index, segment_text in enumerate(segment_texts):
-            segment_request = self._build_segment_request(
+        segment_requests = [
+            self._build_segment_request(
                 normalized,
                 request_id=f"{request_id}_seg_{segment_index:03d}",
                 text=segment_text,
             )
-            segment_specs.append(self.api._build_scheduler_submit_spec(segment_request))
-
-        prepared_items = await asyncio.gather(
-            *[
-                self.api._prepare_state_via_engine_gpu_queue(
-                    spec=spec,
-                    prepare_submit_at=time.perf_counter(),
-                    engine_request_id=None,
-                )
-                for spec in segment_specs
-            ]
-        )
+            for segment_index, segment_text in enumerate(segment_texts)
+        ]
         prepare_profiles: List[Dict[str, Any]] = []
         loop = asyncio.get_running_loop()
         done_futures: List[asyncio.Future] = []
         self.api._update_request_state(
             request_id,
             EngineStatus.READY_FOR_PREFILL,
-            {"backend": "scheduler_v1_direct", "backend_mode": "scheduler_v1_direct", "segment_count": len(segment_specs)},
+            {"backend": "scheduler_v1_direct", "backend_mode": "scheduler_v1_direct", "segment_count": len(segment_requests)},
         )
-        for spec, (state, prepare_exec_started_at, prepare_exec_finished_at) in zip(segment_specs, prepared_items):
-            prepare_wall_ms = max(0.0, (prepare_exec_finished_at - prepare_exec_started_at) * 1000.0)
-            prepare_profile_total_ms = float(state.prepare_profile.get("wall_total_ms", prepare_wall_ms))
-            prepare_profiles.append(
-                {
-                    "request_id": spec.request_id,
-                    "prepare_wall_ms": prepare_wall_ms,
-                    "prepare_profile_total_ms": prepare_profile_total_ms,
-                    "prepare_profile": dict(state.prepare_profile),
-                }
-            )
+        prepared_items = await asyncio.gather(
+            *[
+                self._execute_single_segment_scheduler_job(
+                    normalized,
+                    segment_request=segment_request,
+                )
+                for segment_request in segment_requests
+            ]
+        )
+        for job, prepare_profile in prepared_items:
+            prepare_profiles.append(prepare_profile)
             done_future = loop.create_future()
+            done_future.set_result(job)
             done_futures.append(done_future)
-            await self.api._enqueue_prepared_state_for_dispatch(
-                state=state,
-                speed_factor=float(normalized.speed_factor),
-                sample_steps=int(normalized.sample_steps),
-                media_type=media_type,
-                prepare_wall_ms=prepare_wall_ms,
-                prepare_profile_total_ms=prepare_profile_total_ms,
-                done_loop=loop,
-                done_future=done_future,
-                engine_request_id=None,
-                timeout_sec=normalized.timeout_sec,
-            )
         self.api._update_request_state(
             request_id,
             EngineStatus.ACTIVE_DECODE,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_api_request.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_api_request.py
index 974b9612..14d59f12 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_api_request.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_api_request.py
@@ -122,16 +122,6 @@ def is_aux_ref_enabled(aux_ref_audio_paths: List[str] | None) -> bool:
 
 
 def select_direct_backend(normalized: NormalizedEngineRequest) -> Tuple[str, str | None]:
-    if normalized.response_streaming:
-        if normalized.return_fragment or normalized.fixed_length_chunk:
-            return "legacy_direct_fragment", "fragment_streaming_mode"
-        return "legacy_direct_streaming", "streaming_mode"
-    if is_aux_ref_enabled(normalized.aux_ref_audio_paths):
-        return "legacy_direct_aux_ref", "aux_ref_audio_paths"
-    if normalized.super_sampling:
-        return "legacy_direct_super_sampling", "super_sampling"
-    if normalized.prompt_text in [None, ""]:
-        return "legacy_direct_missing_prompt", "missing_prompt_text"
     return "scheduler_v1_direct", None
 
 
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_api_scheduler.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_api_scheduler.py
index 1e934f16..cf6677fb 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_api_scheduler.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_api_scheduler.py
@@ -6,7 +6,7 @@ import uuid
 from io import BytesIO
 from typing import Any, Dict, List
 
-from GPT_SoVITS.TTS_infer_pack.t2s_scheduler import SchedulerRequestSpec, T2SFinishedItem, T2SRequestState, run_scheduler_continuous
+from GPT_SoVITS.TTS_infer_pack.t2s_scheduler import SchedulerRequestSpec, T2SFinishedItem, T2SRequestState
 from GPT_SoVITS.TTS_infer_pack.unified_engine_audio import pack_audio, set_scheduler_seed
 from GPT_SoVITS.TTS_infer_pack.unified_engine_components import EngineStatus, NormalizedEngineRequest, SchedulerDebugExecution, SchedulerSubmitExecution
 
@@ -67,39 +67,58 @@ class EngineApiSchedulerFlow:
     async def run_scheduler_debug(self, request_items: List[dict], max_steps: int, seed: int) -> SchedulerDebugExecution:
         request_start = time.perf_counter()
         set_scheduler_seed(seed)
-        specs = self._build_scheduler_request_specs(request_items)
-        request_ids = [spec.request_id for spec in specs]
-        for spec in specs:
+        normalized_requests: List[NormalizedEngineRequest] = []
+        for index, payload in enumerate(request_items):
+            normalized_requests.append(
+                self.api._normalize_engine_request(
+                    payload,
+                    request_id=str(payload.get("request_id") or f"req_{index:03d}"),
+                    error_prefix=f"request[{index}] 参数非法: ",
+                )
+            )
+        specs = [normalized.to_scheduler_spec() for normalized in normalized_requests]
+        request_ids = [normalized.request_id for normalized in normalized_requests]
+        for normalized, spec in zip(normalized_requests, specs):
             self.api._register_request_state(
-                request_id=spec.request_id,
+                request_id=normalized.request_id,
                 api_mode="scheduler_debug",
                 backend="scheduler_debug",
-                media_type="wav",
+                media_type=normalized.media_type,
                 response_streaming=False,
-                meta={
-                    "text_len": len(spec.text),
-                    "prompt_text_len": len(spec.prompt_text),
-                    "text_lang": spec.text_lang,
-                    "prompt_lang": spec.prompt_lang,
-                    "ref_audio_path": str(spec.ref_audio_path),
-                    "ready_step": int(spec.ready_step),
-                },
+                meta=self.api._build_request_meta(normalized.to_payload()),
             )
-            self.api._update_request_state(spec.request_id, EngineStatus.VALIDATED, {"request_source": "scheduler_debug"})
-            self.api._update_request_state(spec.request_id, EngineStatus.CPU_PREPARING, None)
+            self.api._update_request_state(normalized.request_id, EngineStatus.VALIDATED, {"request_source": "scheduler_debug"})
+            self.api._update_request_state(normalized.request_id, EngineStatus.CPU_PREPARING, None)
         prepare_started_at = time.perf_counter()
+        original_worker_max_steps = int(self.api.scheduler_worker.max_steps)
+        original_decode_max_steps = int(self.api.scheduler_worker.decode_executor.max_steps)
         try:
-            states = await self.api.scheduler_worker.prepare_states_batch_async(specs)
+            self.api.scheduler_worker.max_steps = int(max_steps)
+            self.api.scheduler_worker.decode_executor.max_steps = int(max_steps)
+            prepared_payloads = await asyncio.gather(
+                *[
+                    self.api._prepare_state_via_engine_gpu_queue(
+                        spec=spec,
+                        prepare_submit_at=time.perf_counter(),
+                        engine_request_id=normalized.request_id,
+                    )
+                    for normalized, spec in zip(normalized_requests, specs)
+                ]
+            )
         except Exception as exc:
             for request_id in request_ids:
                 self.api._fail_request_state(request_id, str(exc))
             raise
+        finally:
+            self.api.scheduler_worker.max_steps = int(original_worker_max_steps)
+            self.api.scheduler_worker.decode_executor.max_steps = int(original_decode_max_steps)
         prepare_finished_at = time.perf_counter()
         prepare_batch_wall_ms = max(0.0, (prepare_finished_at - prepare_started_at) * 1000.0)
+        states = [payload[0] for payload in prepared_payloads]
         for state in states:
             self.api._update_request_state(
                 state.request_id,
-                EngineStatus.ACTIVE_DECODE,
+                EngineStatus.READY_FOR_PREFILL,
                 {
                     "prepare_profile": dict(state.prepare_profile),
                     "norm_text": state.norm_text,
@@ -108,7 +127,27 @@ class EngineApiSchedulerFlow:
             )
         decode_started_at = time.perf_counter()
         try:
-            finished = run_scheduler_continuous(self.api.tts.t2s_model.model, states, max_steps=int(max_steps))
+            loop = asyncio.get_running_loop()
+            done_futures: List[asyncio.Future] = []
+            for normalized, state in zip(normalized_requests, states):
+                done_future = loop.create_future()
+                done_futures.append(done_future)
+                await self.api._enqueue_prepared_state_for_dispatch(
+                    state=state,
+                    speed_factor=float(normalized.speed_factor),
+                    sample_steps=int(normalized.sample_steps),
+                    media_type=normalized.media_type,
+                    super_sampling=bool(normalized.super_sampling),
+                    prepare_wall_ms=float(state.prepare_profile.get("wall_total_ms", 0.0)),
+                    prepare_profile_total_ms=float(state.prepare_profile.get("wall_total_ms", 0.0)),
+                    done_loop=loop,
+                    done_future=done_future,
+                    engine_request_id=normalized.request_id,
+                    timeout_sec=normalized.timeout_sec,
+                )
+            timeout_candidates = [float(item.timeout_sec) for item in normalized_requests if item.timeout_sec not in [None, ""]]
+            timeout_sec = max(timeout_candidates) if timeout_candidates else 60.0
+            jobs = list(await asyncio.wait_for(asyncio.gather(*done_futures), timeout=float(timeout_sec)))
         except Exception as exc:
             for request_id in request_ids:
                 self.api._fail_request_state(request_id, str(exc))
@@ -116,46 +155,63 @@ class EngineApiSchedulerFlow:
         decode_finished_at = time.perf_counter()
         decode_batch_wall_ms = max(0.0, (decode_finished_at - decode_started_at) * 1000.0)
         request_total_ms = max(0.0, (decode_finished_at - request_start) * 1000.0)
-        finished_map = {item.request_id: item for item in finished}
         request_profiles: List[Dict[str, Any]] = []
-        for state in states:
-            item = finished_map.get(state.request_id)
-            if item is None:
+        finished: List[Dict[str, Any]] = []
+        finish_reason_counts: Dict[str, int] = {}
+        total_semantic_len = 0
+        for state, job in zip(states, jobs):
+            if job.error is not None:
+                self.api._fail_request_state(state.request_id, str(job.error))
+                raise RuntimeError(str(job.error))
+            if job.result is None:
                 self.api._fail_request_state(state.request_id, "scheduler_debug finished without result")
-                continue
-            request_profile = self.api._build_scheduler_debug_request_profile(
-                state=state,
-                item=item,
-                batch_request_count=len(states),
-                prepare_batch_wall_ms=prepare_batch_wall_ms,
-                decode_batch_wall_ms=decode_batch_wall_ms,
-                batch_request_total_ms=request_total_ms,
-            )
-            request_profiles.append(
+                raise RuntimeError(f"{state.request_id} finished without result")
+            job_result = dict(job.result)
+            request_profile = {
+                **job_result,
+                "backend": "scheduler_debug",
+                "backend_mode": "scheduler_debug",
+                "batch_request_count": int(len(states)),
+                "batch_prepare_wall_ms": float(prepare_batch_wall_ms),
+                "batch_decode_wall_ms": float(decode_batch_wall_ms),
+                "batch_request_total_ms": float(request_total_ms),
+                "prepare_ms": float(state.prepare_profile.get("wall_total_ms", 0.0)),
+                "prepare_wall_ms": float(state.prepare_profile.get("wall_total_ms", 0.0)),
+                "prepare_profile_total_ms": float(state.prepare_profile.get("wall_total_ms", 0.0)),
+                "prepare_profile": dict(state.prepare_profile),
+                "norm_text": state.norm_text,
+                "norm_prompt_text": state.norm_prompt_text,
+            }
+            request_profiles.append({"request_id": state.request_id, "profile": dict(request_profile)})
+            self.api._merge_request_state_profile(state.request_id, request_profile)
+            semantic_len = int(job_result.get("semantic_len", 0))
+            finish_reason = str(job_result.get("finish_reason", "unknown"))
+            finished.append(
                 {
                     "request_id": state.request_id,
-                    "profile": dict(request_profile),
+                    "semantic_len": semantic_len,
+                    "finish_idx": int(job_result.get("finish_idx", job_result.get("decode_steps", 0))),
+                    "finish_reason": finish_reason,
                 }
             )
-            self.api._complete_request_state(
-                state.request_id,
-                dict(request_profile),
-            )
+            finish_reason_counts[finish_reason] = finish_reason_counts.get(finish_reason, 0) + 1
+            total_semantic_len += semantic_len
         return SchedulerDebugExecution(
             payload={
                 "message": "success",
                 "request_count": len(states),
                 "max_steps": int(max_steps),
-                "batch_profile": self.api._build_scheduler_debug_batch_profile(
-                    request_count=len(states),
-                    max_steps=int(max_steps),
-                    prepare_batch_wall_ms=prepare_batch_wall_ms,
-                    decode_batch_wall_ms=decode_batch_wall_ms,
-                    request_total_ms=request_total_ms,
-                    finished_items=finished,
-                ),
+                "batch_profile": {
+                    "request_count": int(len(states)),
+                    "max_steps": int(max_steps),
+                    "prepare_batch_wall_ms": float(prepare_batch_wall_ms),
+                    "decode_batch_wall_ms": float(decode_batch_wall_ms),
+                    "request_total_ms": float(request_total_ms),
+                    "total_semantic_len": int(total_semantic_len),
+                    "finish_reason_counts": finish_reason_counts,
+                },
                 "requests": self._summarize_scheduler_states(states),
-                "finished": self._summarize_scheduler_finished(finished),
+                "finished": finished,
                 "request_profiles": request_profiles,
                 "request_traces": self.api._collect_request_summaries(request_ids),
             }
@@ -222,6 +278,7 @@ class EngineApiSchedulerFlow:
             speed_factor=float(normalized.speed_factor),
             sample_steps=int(normalized.sample_steps),
             media_type=normalized.media_type,
+            super_sampling=bool(normalized.super_sampling),
             prepare_wall_ms=prepare_wall_ms,
             prepare_profile_total_ms=prepare_profile_total_ms,
             done_loop=loop,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_delegates.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_delegates.py
index 92714750..e2044ec4 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_delegates.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_delegates.py
@@ -149,6 +149,7 @@ class EngineBridgeDelegates:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None,
@@ -161,6 +162,7 @@ class EngineBridgeDelegates:
             speed_factor=speed_factor,
             sample_steps=sample_steps,
             media_type=media_type,
+            super_sampling=super_sampling,
             prepare_wall_ms=prepare_wall_ms,
             prepare_profile_total_ms=prepare_profile_total_ms,
             done_loop=done_loop,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_stage.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_stage.py
index 29b5aaab..2a52e779 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_stage.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_bridge_stage.py
@@ -78,6 +78,7 @@ class EngineStageBridgeFacade:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None,
@@ -90,6 +91,7 @@ class EngineStageBridgeFacade:
             speed_factor=speed_factor,
             sample_steps=sample_steps,
             media_type=media_type,
+            super_sampling=super_sampling,
             prepare_wall_ms=prepare_wall_ms,
             prepare_profile_total_ms=prepare_profile_total_ms,
             done_loop=done_loop,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_component_models.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_component_models.py
index 2c0cc9ac..7b5ea5f8 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_component_models.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_component_models.py
@@ -104,6 +104,7 @@ class NormalizedEngineRequest:
             temperature=self.temperature,
             repetition_penalty=self.repetition_penalty,
             early_stop_num=self.early_stop_num,
+            aux_ref_audio_paths=list(self.aux_ref_audio_paths or []),
             ready_step=self.ready_step,
         )
 
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_component_registry.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_component_registry.py
index 111ca500..1aaa89c1 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_component_registry.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_component_registry.py
@@ -303,6 +303,7 @@ class SchedulerPendingJob:
     speed_factor: float
     sample_steps: int
     media_type: str
+    super_sampling: bool = False
     admission_wait_ms: float = 0.0
     engine_policy_wait_ms: float = 0.0
     engine_dispatch_wait_ms: float = 0.0
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_component_runtime.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_component_runtime.py
index db03a0c3..7f4e485f 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_component_runtime.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_component_runtime.py
@@ -291,6 +291,7 @@ class EngineDispatchTask:
     speed_factor: float
     sample_steps: int
     media_type: str
+    super_sampling: bool
     prepare_wall_ms: float
     prepare_profile_total_ms: float
     done_loop: asyncio.AbstractEventLoop | None
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_stage.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_stage.py
index 9aad2fb8..1b872dfa 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_stage.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_stage.py
@@ -113,6 +113,7 @@ class EngineStageCoordinator:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None,
@@ -125,6 +126,7 @@ class EngineStageCoordinator:
             speed_factor=speed_factor,
             sample_steps=sample_steps,
             media_type=media_type,
+            super_sampling=super_sampling,
             prepare_wall_ms=prepare_wall_ms,
             prepare_profile_total_ms=prepare_profile_total_ms,
             done_loop=done_loop,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_stage_dispatch.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_stage_dispatch.py
index 53ebd793..644c35f6 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_stage_dispatch.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_stage_dispatch.py
@@ -16,6 +16,7 @@ class EngineDispatchStageMixin:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None,
@@ -29,6 +30,7 @@ class EngineDispatchStageMixin:
             speed_factor=float(speed_factor),
             sample_steps=int(sample_steps),
             media_type=media_type,
+            super_sampling=bool(super_sampling),
             prepare_wall_ms=float(prepare_wall_ms),
             prepare_profile_total_ms=float(prepare_profile_total_ms),
             done_loop=done_loop,
@@ -66,6 +68,7 @@ class EngineDispatchStageMixin:
                 speed_factor=dispatch_task.speed_factor,
                 sample_steps=dispatch_task.sample_steps,
                 media_type=dispatch_task.media_type,
+                super_sampling=dispatch_task.super_sampling,
                 prepare_wall_ms=dispatch_task.prepare_wall_ms,
                 prepare_profile_total_ms=dispatch_task.prepare_profile_total_ms,
                 done_loop=dispatch_task.done_loop,
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_worker.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_worker.py
index 934ccf52..ae46536f 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_worker.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_worker.py
@@ -46,7 +46,7 @@ class UnifiedSchedulerWorker(
         self.decode_backlog_max = max(0, int(os.environ.get("GPTSOVITS_ENGINE_DECODE_BACKLOG_MAX", "0")))
         self.finalize_pending_max = max(0, int(os.environ.get("GPTSOVITS_ENGINE_FINALIZE_PENDING_MAX", "0")))
         self.engine_decode_control_enabled = (
-            str(os.environ.get("GPTSOVITS_ENGINE_DRIVE_DECODE", "0")).strip().lower() in {"1", "true", "yes", "on"}
+            str(os.environ.get("GPTSOVITS_ENGINE_DRIVE_DECODE", "1")).strip().lower() in {"1", "true", "yes", "on"}
         )
         self.job_registry = SchedulerJobRegistry(self.condition)
         self.worker_thread: threading.Thread | None = None
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_finalize.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_finalize.py
index 4f5833fd..3a675cbe 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_finalize.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_finalize.py
@@ -149,16 +149,25 @@ class WorkerFinalizeExecutor:
         except Exception:
             pass
 
+    @staticmethod
+    def _collect_job_refer_specs(job: SchedulerPendingJob) -> List[tuple]:
+        primary_spec = job.state.refer_spec  # the primary reference always comes first
+        extra_specs = getattr(job.state, "aux_refer_specs", None) or []  # attribute may be absent or None
+        return [primary_spec, *extra_specs]
+
     def _synthesize_finished_audio(self, job: SchedulerPendingJob, item: T2SFinishedItem) -> tuple[int, np.ndarray]:
         audio_fragment = self.tts.synthesize_audio_request_local(
             semantic_tokens=item.semantic_tokens.detach().clone().unsqueeze(0).unsqueeze(0),
             phones=job.state.phones.detach().clone().unsqueeze(0),
             prompt_semantic=job.state.prompt_semantic.detach().clone(),
             prompt_phones=job.state.prompt_phones.detach().clone(),
-            refer_spec=(
-                job.state.refer_spec[0].detach().clone(),
-                None if job.state.refer_spec[1] is None else job.state.refer_spec[1].detach().clone(),
-            ),
+            refer_spec=[
+                (
+                    refer_spec_item[0].detach().clone(),
+                    None if refer_spec_item[1] is None else refer_spec_item[1].detach().clone(),
+                )
+                for refer_spec_item in self._collect_job_refer_specs(job)
+            ],
             raw_audio=job.state.raw_audio.detach().clone(),
             raw_sr=int(job.state.raw_sr),
             speed=float(job.speed_factor),
@@ -172,7 +181,7 @@ class WorkerFinalizeExecutor:
             speed_factor=float(job.speed_factor),
             split_bucket=False,
             fragment_interval=0.0,
-            super_sampling=False,
+            super_sampling=bool(job.super_sampling),
         )
 
     def _synthesize_finished_audio_batch(
@@ -185,11 +194,14 @@ class WorkerFinalizeExecutor:
         speeds = []
         sample_steps_list = []
         for job, _ in jobs_and_items:
+            refer_spec_group = self._collect_job_refer_specs(job)
+            if len(refer_spec_group) != 1:
+                raise ValueError("batched finalize 暂不支持单请求多参考音频")
             refer_specs.append(
-                (
-                    job.state.refer_spec[0].detach().clone(),
-                    None if job.state.refer_spec[1] is None else job.state.refer_spec[1].detach().clone(),
-                )
+                [(
+                    refer_spec_group[0][0].detach().clone(),
+                    None if refer_spec_group[0][1] is None else refer_spec_group[0][1].detach().clone(),
+                )]
             )
             speeds.append(float(job.speed_factor))
             sample_steps_list.append(int(job.sample_steps))
@@ -211,7 +223,7 @@ class WorkerFinalizeExecutor:
                     speed_factor=float(job.speed_factor),
                     split_bucket=False,
                     fragment_interval=0.0,
-                    super_sampling=False,
+                    super_sampling=bool(job.super_sampling),
                 )
             )
         return results
@@ -224,9 +236,12 @@ class WorkerFinalizeExecutor:
             return 0.0, []
         self._sync_device()
         synth_start = time.perf_counter()
-        if len(jobs_and_items) == 1 or self.tts.configs.use_vocoder:
-            job, item = jobs_and_items[0]
-            batch_results = [self._synthesize_finished_audio(job, item)]
+        if (
+            len(jobs_and_items) == 1
+            or self.tts.configs.use_vocoder
+            or any(len(self._collect_job_refer_specs(job)) != 1 for job, _ in jobs_and_items)
+        ):
+            batch_results = [self._synthesize_finished_audio(job, item) for job, item in jobs_and_items]
         else:
             batch_results = self._synthesize_finished_audio_batch(jobs_and_items)
         self._sync_device()
diff --git a/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_submit.py b/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_submit.py
index 1e67f8d3..f1910409 100644
--- a/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_submit.py
+++ b/GPT_SoVITS/TTS_infer_pack/unified_engine_worker_submit.py
@@ -78,6 +78,7 @@ class WorkerSubmitLifecycleMixin:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None = None,
@@ -97,6 +98,7 @@ class WorkerSubmitLifecycleMixin:
             speed_factor,
             sample_steps,
             media_type,
+            super_sampling,
             prepare_wall_ms,
             prepare_profile_total_ms,
             done_loop,
@@ -172,6 +174,7 @@ class WorkerSubmitLifecycleMixin:
         speed_factor: float,
         sample_steps: int,
         media_type: str,
+        super_sampling: bool,
         prepare_wall_ms: float,
         prepare_profile_total_ms: float,
         done_loop: asyncio.AbstractEventLoop | None = None,
@@ -205,6 +208,7 @@ class WorkerSubmitLifecycleMixin:
             speed_factor=float(speed_factor),
             sample_steps=int(sample_steps),
             media_type=media_type,
+            super_sampling=bool(super_sampling),
             admission_wait_ms=float(admission_wait_ms),
             engine_policy_wait_ms=float(engine_policy_wait_ms),
             engine_dispatch_wait_ms=float(engine_dispatch_wait_ms),
diff --git a/api_v2.py b/api_v2.py
index 35b70c8e..9c29989f 100644
--- a/api_v2.py
+++ b/api_v2.py
@@ -39,8 +39,8 @@ POST:
     "seed": -1,                   # int. random seed for reproducibility.
     "parallel_infer": True,       # bool. whether to use parallel inference.
     "repetition_penalty": 1.35,   # float. repetition penalty for T2S model.
-    "sample_steps": 32,           # int. number of sampling steps for VITS model V3.
-    "super_sampling": False,      # bool. whether to use super-sampling for audio when using VITS model V3.
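+    "sample_steps": 32,           # int. only used by the v3/v4 vocoder path; can be ignored on the current v2/v2ProPlus mainline.
+    "super_sampling": False,      # bool. only used by the v3/v4 path; not an officially supported target of the current v2/v2ProPlus mainline.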
     "streaming_mode": False,      # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed )
     "overlap_length": 2,          # int. overlap length of semantic tokens for streaming mode.
     "min_chunk_length": 16,       # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
@@ -79,7 +79,7 @@ endpoint: `/set_gpt_weights`
 
 GET:
 ```
-http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1v3.ckpt
 ```
 RESP:
 成功: 返回"success", http code 200
@@ -92,7 +92,7 @@ endpoint: `/set_sovits_weights`
 
 GET:
 ```
-http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth
+http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth
 ```
 
 RESP:
@@ -211,7 +211,7 @@ async def tts_handle(req: dict):
                 "parallel_infer": True,       # bool. whether to use parallel inference.
                 "repetition_penalty": 1.35,   # float. repetition penalty for T2S model.
                 "sample_steps": 32,           # int. number of sampling steps for VITS model V3.
-                "super_sampling": False,      # bool. whether to use super-sampling for audio when using VITS model V3.
+                "super_sampling": False,      # bool. only for v3/v4; not part of current v2/v2ProPlus mainline.
                 "streaming_mode": False,      # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed )
                 "overlap_length": 2,          # int. overlap length of semantic tokens for streaming mode.
                 "min_chunk_length": 16,       # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
diff --git a/api_v3.py b/api_v3.py
index 1a6457ec..35ecf240 100644
--- a/api_v3.py
+++ b/api_v3.py
@@ -39,8 +39,8 @@ POST:
     "seed": -1,                   # int. random seed for reproducibility.
     "parallel_infer": True,       # bool. whether to use parallel inference.
     "repetition_penalty": 1.35,   # float. repetition penalty for T2S model.
-    "sample_steps": 32,           # int. number of sampling steps for VITS model V3.
-    "super_sampling": False,      # bool. whether to use super-sampling for audio when using VITS model V3.
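+    "sample_steps": 32,           # int. only used by the v3/v4 vocoder path; can be ignored on the current v2/v2ProPlus mainline.
+    "super_sampling": False,      # bool. only used by the v3/v4 path; not an officially supported target of the current v2/v2ProPlus mainline.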
     "streaming_mode": False,      # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed )
     "overlap_length": 2,          # int. overlap length of semantic tokens for streaming mode.
     "min_chunk_length": 16,       # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
@@ -79,7 +79,7 @@ endpoint: `/set_gpt_weights`
 
 GET:
 ```
-http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
+http://127.0.0.1:9880/set_gpt_weights?weights_path=GPT_SoVITS/pretrained_models/s1v3.ckpt
 ```
 RESP:
 成功: 返回"success", http code 200
@@ -92,7 +92,7 @@ endpoint: `/set_sovits_weights`
 
 GET:
 ```
-http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/s2G488k.pth
+http://127.0.0.1:9880/set_sovits_weights?weights_path=GPT_SoVITS/pretrained_models/v2Pro/s2Gv2ProPlus.pth
 ```
 
 RESP:
@@ -280,7 +280,7 @@ async def tts_handle(req: dict):
                 "parallel_infer": True,       # bool. whether to use parallel inference.
                 "repetition_penalty": 1.35,   # float. repetition penalty for T2S model.
                 "sample_steps": 32,           # int. number of sampling steps for VITS model V3.
-                "super_sampling": False,      # bool. whether to use super-sampling for audio when using VITS model V3.
+                "super_sampling": False,      # bool. only for v3/v4; not part of current v2/v2ProPlus mainline.
                 "streaming_mode": False,      # bool or int. return audio chunk by chunk.T he available options are: 0,1,2,3 or True/False (0/False: Disabled | 1/True: Best Quality, Slowest response speed (old version streaming_mode) | 2: Medium Quality, Slow response speed | 3: Lower Quality, Faster response speed )
                 "overlap_length": 2,          # int. overlap length of semantic tokens for streaming mode.
                 "min_chunk_length": 16,       # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)