3초 미만 제한없음

2025-12-16 17:40:09 +08:00 · 2025-10-15 17:06:21 +09:00 · 2025-10-15 17:06:21 +09:00 · e8616c87c6
commit e8616c87c6
parent 11aa78bd9b
3 changed files with 774 additions and 10 deletions
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
@ -799,8 +799,8 @@ class TTS:
        )
        with torch.no_grad():
            wav16k, sr = librosa.load(ref_wav_path, sr=16000)
-            if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
-                raise OSError(i18n("参考音频在3~10秒范围外，请更换！"))
+            # if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
+            #     raise OSError(i18n("参考音频在3~10秒范围外，请更换！"))
            wav16k = torch.from_numpy(wav16k)
            zero_wav_torch = torch.from_numpy(zero_wav)
            wav16k = wav16k.to(self.configs.device)
--- a/GPT_SoVITS/inference_webui.py
+++ b/GPT_SoVITS/inference_webui.py
@ -811,9 +811,6 @@ def get_tts_wav(
    if not ref_free:
        with torch.no_grad():
            wav16k, sr = librosa.load(ref_wav_path, sr=16000)
-            if wav16k.shape[0] > 160000 or wav16k.shape[0] < 48000:
-                gr.Warning(i18n("参考音频在3~10秒范围外，请更换！"))
-                raise OSError(i18n("参考音频在3~10秒范围外，请更换！"))
            wav16k = torch.from_numpy(wav16k)
            if is_half == True:
                wav16k = wav16k.half().to(device)
--- a/api_v2.py
+++ b/api_v2.py
@ -101,7 +101,10 @@ RESP:
 import os
 import sys
 import traceback
-from typing import Generator
+from typing import Generator, Dict, Any, Optional
+import uuid
+import asyncio
+from datetime import datetime

 now_dir = os.getcwd()
 sys.path.append(now_dir)
@ -121,11 +124,47 @@ from tools.i18n.i18n import I18nAuto
 from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
 from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names
 from pydantic import BaseModel
+import json
+import yaml
+
+# Import config variables (avoiding webui to prevent Gradio loading)
+from config import (
+    exp_root,
+    python_exec,
+    is_half,
+    GPU_INDEX,
+    infer_device,
+    SoVITS_weight_version2root,
+    GPT_weight_version2root,
+)

 # print(sys.path)
 i18n = I18nAuto()
 cut_method_names = get_cut_method_names()

+# GPU helper functions (replicated from webui.py to avoid import)
+set_gpu_numbers = GPU_INDEX
+default_gpu_numbers = infer_device.index if hasattr(infer_device, 'index') else 0
+
+def fix_gpu_number(input_val):
+    """Fix GPU number to be within valid range."""
+    try:
+        if int(input_val) not in set_gpu_numbers:
+            return default_gpu_numbers
+    except:
+        return input_val
+    return input_val
+
+def fix_gpu_numbers(inputs):
+    """Fix multiple GPU numbers separated by comma."""
+    output = []
+    try:
+        for input_val in inputs.split(","):
+            output.append(str(fix_gpu_number(input_val)))
+        return ",".join(output)
+    except:
+        return inputs
+
 parser = argparse.ArgumentParser(description="GPT-SoVITS api")
 parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径")
 parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
@ -172,6 +211,69 @@ class TTS_Request(BaseModel):
    super_sampling: bool = False


+class SpeechSlicingRequest(BaseModel):
+    inp: str
+    opt_root: str
+    threshold: str = "-34"
+    min_length: str = "4000"
+    min_interval: str = "300"
+    hop_size: str = "10"
+    max_sil_kept: str = "500"
+    _max: float = 0.9
+    alpha: float = 0.25
+    n_parts: int = 4
+
+
+class STTRequest(BaseModel):
+    input_folder: str
+    output_folder: str
+    model_path: str = "tools/asr/models/faster-whisper-large-v3"
+    language: str = "auto"
+    precision: str = "float32"
+
+
+class DatasetFormattingRequest(BaseModel):
+    inp_text: str
+    inp_wav_dir: str
+    exp_name: str
+    version: str = "v4"
+    gpu_numbers: str = "0-0"
+    bert_pretrained_dir: str = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
+    ssl_pretrained_dir: str = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
+    pretrained_s2G_path: str = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
+
+
+class FineTuneSoVITSRequest(BaseModel):
+    version: str = "v4"
+    batch_size: int = 2
+    total_epoch: int = 2
+    exp_name: str
+    text_low_lr_rate: float = 0.4
+    if_save_latest: bool = True
+    if_save_every_weights: bool = True
+    save_every_epoch: int = 1
+    gpu_numbers1Ba: str = "0"
+    pretrained_s2G: str = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Gv4.pth"
+    pretrained_s2D: str = "GPT_SoVITS/pretrained_models/gsv-v4-pretrained/s2Dv4.pth"
+    if_grad_ckpt: bool = False
+    lora_rank: str = "32"
+
+
+class FineTuneGPTRequest(BaseModel):
+    batch_size: int = 8
+    total_epoch: int = 15
+    exp_name: str
+    if_dpo: bool = False
+    if_save_latest: bool = True
+    if_save_every_weights: bool = True
+    save_every_epoch: int = 5
+    gpu_numbers: str = "0"
+    pretrained_s1: str = "GPT_SoVITS/pretrained_models/s1v3.ckpt"
+
+
+jobs: Dict[str, Dict[str, Any]] = {}
+
+
 ### modify from https://github.com/RVC-Boss/GPT-SoVITS/pull/894/files
 def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
    with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
@ -272,14 +374,14 @@ def check_params(req: dict):
        return JSONResponse(status_code=400, content={"message": "text is required"})
    if text_lang in [None, ""]:
        return JSONResponse(status_code=400, content={"message": "text_lang is required"})
-    elif text_lang.lower() not in tts_config.languages:
+    elif text_lang not in tts_config.languages:
        return JSONResponse(
            status_code=400,
            content={"message": f"text_lang: {text_lang} is not supported in version {tts_config.version}"},
        )
    if prompt_lang in [None, ""]:
        return JSONResponse(status_code=400, content={"message": "prompt_lang is required"})
-    elif prompt_lang.lower() not in tts_config.languages:
+    elif prompt_lang not in tts_config.languages:
        return JSONResponse(
            status_code=400,
            content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"},
@ -407,11 +509,11 @@ async def tts_get_endpoint(
 ):
    req = {
        "text": text,
-        "text_lang": text_lang.lower(),
+        "text_lang": text_lang,
        "ref_audio_path": ref_audio_path,
        "aux_ref_audio_paths": aux_ref_audio_paths,
        "prompt_text": prompt_text,
-        "prompt_lang": prompt_lang.lower(),
+        "prompt_lang": prompt_lang,
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
@ -434,6 +536,23 @@ async def tts_get_endpoint(

@APP.post("/tts")
 async def tts_post_endpoint(request: TTS_Request):
+    # DEBUG: Print received payload
+    print(f"\n{'='*80}")
+    print(f"[TTS DEBUG] Received request:")
+    print(f"  text: {request.text[:100] if len(request.text) > 100 else request.text}")  # Truncate long text
+    print(f"  text_lang: {request.text_lang}")
+    print(f"  ref_audio_path: {request.ref_audio_path}")
+    print(f"  prompt_text: {request.prompt_text[:100] if request.prompt_text and len(request.prompt_text) > 100 else request.prompt_text}")
+    print(f"  prompt_lang: {request.prompt_lang}")
+    print(f"  top_k: {request.top_k}")
+    print(f"  top_p: {request.top_p}")
+    print(f"  temperature: {request.temperature}")
+    print(f"  text_split_method: {request.text_split_method}")
+    print(f"  batch_size: {request.batch_size}")
+    print(f"  speed_factor: {request.speed_factor}")
+    print(f"  streaming_mode: {request.streaming_mode}")
+    print(f"{'='*80}\n")
+
    req = request.dict()
    return await tts_handle(req)

@ -489,6 +608,654 @@ async def set_sovits_weights(weights_path: str = None):
    return JSONResponse(status_code=200, content={"message": "success"})


+async def execute_job_async(job_id: str, operation_func, *args, **kwargs):
+    """
+    Execute a job asynchronously in background.
+
+    Args:
+        job_id: Unique job identifier
+        operation_func: Function to execute (from webui.py)
+        args, kwargs: Arguments for the operation function
+    """
+    jobs[job_id]["status"] = "running"
+    jobs[job_id]["started_at"] = datetime.now().isoformat()
+
+    try:
+        result = await asyncio.to_thread(operation_func, *args, **kwargs)
+
+        if hasattr(result, '__iter__') and not isinstance(result, (str, dict)):
+            final_result = None
+            for item in result:
+                final_result = item
+            result = final_result
+
+        jobs[job_id]["status"] = "completed"
+        jobs[job_id]["result"] = result
+        jobs[job_id]["completed_at"] = datetime.now().isoformat()
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        jobs[job_id]["traceback"] = traceback.format_exc()
+        jobs[job_id]["failed_at"] = datetime.now().isoformat()
+
+
+@APP.get("/jobs/{job_id}")
+@APP.get("/job-status/{job_id}")  # Alias for compatibility
+async def get_job_status(job_id: str):
+    """
+    Get job status and result.
+
+    Returns:
+        {
+            "job_id": str,
+            "status": "queued" | "running" | "completed" | "failed",
+            "result": Any (if completed),
+            "error": str (if failed),
+            "created_at": str,
+            "started_at": str (if running/completed/failed),
+            "completed_at": str (if completed),
+            "failed_at": str (if failed)
+        }
+    """
+    if job_id not in jobs:
+        return JSONResponse(status_code=404, content={"message": "job not found"})
+
+    job_data = jobs[job_id].copy()
+    job_data["job_id"] = job_id
+    return JSONResponse(status_code=200, content=job_data)
+
+
+async def execute_speech_slicing_direct(job_id: str, request: SpeechSlicingRequest):
+    """
+    Execute speech slicing by directly calling slice_audio.py subprocess.
+    Replaces webui.open_slice() to avoid Gradio dependency.
+    """
+    jobs[job_id]["status"] = "running"
+    jobs[job_id]["started_at"] = datetime.now().isoformat()
+
+    try:
+        # Prepare environment with PYTHONPATH
+        env = os.environ.copy()
+        env["PYTHONPATH"] = os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")])
+
+        # Create processes for parallel slicing (n_parts)
+        processes = []
+        for i_part in range(request.n_parts):
+            cmd = [
+                python_exec,
+                "tools/slice_audio.py",
+                request.inp,
+                request.opt_root,
+                str(request.threshold),
+                str(request.min_length),
+                str(request.min_interval),
+                str(request.hop_size),
+                str(request.max_sil_kept),
+                str(request._max),
+                str(request.alpha),
+                str(i_part),
+                str(request.n_parts),
+            ]
+            print(f"[SPEECH SLICING] Executing: {' '.join(cmd)}")
+            p = subprocess.Popen(cmd, env=env, cwd=now_dir)
+            processes.append(p)
+
+        # Wait for all processes to complete
+        for p in processes:
+            p.wait()
+
+        # Check if any process failed
+        exit_codes = [p.returncode for p in processes]
+        if any(code != 0 for code in exit_codes):
+            raise Exception(f"Speech slicing failed with exit codes: {exit_codes}")
+
+        jobs[job_id]["status"] = "completed"
+        jobs[job_id]["result"] = {
+            "output_dir": request.opt_root,
+            "file_count": request.n_parts
+        }
+        jobs[job_id]["completed_at"] = datetime.now().isoformat()
+
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        jobs[job_id]["traceback"] = traceback.format_exc()
+        jobs[job_id]["failed_at"] = datetime.now().isoformat()
+
+
+@APP.post("/preprocessing/speech-slicing")
+async def speech_slicing_endpoint(request: SpeechSlicingRequest):
+    """
+    Start speech slicing job.
+
+    Directly executes tools/slice_audio.py (no webui dependency).
+    """
+    # DEBUG: Print received payload
+    print(f"\n{'='*80}")
+    print(f"[SPEECH SLICING DEBUG] Received request:")
+    print(f"  inp: {request.inp}")
+    print(f"  opt_root: {request.opt_root}")
+    print(f"  threshold: {request.threshold}")
+    print(f"  min_length: {request.min_length}")
+    print(f"  min_interval: {request.min_interval}")
+    print(f"  hop_size: {request.hop_size}")
+    print(f"  max_sil_kept: {request.max_sil_kept}")
+    print(f"  _max: {request._max}")
+    print(f"  alpha: {request.alpha}")
+    print(f"  n_parts: {request.n_parts}")
+    print(f"{'='*80}\n")
+
+    job_id = str(uuid.uuid4())
+    jobs[job_id] = {
+        "status": "queued",
+        "operation": "speech_slicing",
+        "created_at": datetime.now().isoformat()
+    }
+
+    try:
+        asyncio.create_task(execute_speech_slicing_direct(job_id, request))
+        return JSONResponse(status_code=200, content={"job_id": job_id, "status": "queued"})
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        return JSONResponse(status_code=500, content={"message": "failed to start job", "error": str(e)})
+
+
+@APP.post("/preprocessing/stt")
+async def stt_endpoint(request: STTRequest):
+    """
+    Start STT (Speech-to-Text) job.
+
+    Wraps tools/asr/fasterwhisper_asr.execute_asr()
+    """
+    # DEBUG: Print received payload
+    print(f"\n{'='*80}")
+    print(f"[STT DEBUG] Received STT request:")
+    print(request)
+    print(f"{'='*80}\n")
+
+    job_id = str(uuid.uuid4())
+    jobs[job_id] = {
+        "status": "queued",
+        "operation": "stt",
+        "created_at": datetime.now().isoformat()
+    }
+
+    try:
+        from tools.asr.fasterwhisper_asr import execute_asr
+
+        asyncio.create_task(execute_job_async(
+            job_id,
+            execute_asr,
+            request.input_folder,
+            request.output_folder,
+            request.model_path,
+            request.language,
+            request.precision
+        ))
+
+        return JSONResponse(status_code=200, content={"job_id": job_id, "status": "queued"})
+    except Exception as e:
+        print(f"[STT ERROR] Failed to start STT job: {str(e)}")
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        return JSONResponse(status_code=500, content={"message": "failed to start job", "error": str(e)})
+
+
+async def execute_dataset_formatting(job_id: str, request: DatasetFormattingRequest):
+    """
+    Execute dataset formatting sequentially: open1a -> open1b -> open1c
+    Directly executes subprocess (no webui dependency).
+    """
+    jobs[job_id]["status"] = "running"
+    jobs[job_id]["started_at"] = datetime.now().isoformat()
+    jobs[job_id]["current_stage"] = "open1a"
+
+    try:
+        opt_dir = f"{exp_root}/{request.exp_name}"
+        os.makedirs(opt_dir, exist_ok=True)
+
+        # Parse GPU numbers
+        gpu_names = request.gpu_numbers.split("-")
+        all_parts = len(gpu_names)
+
+        # Stage 1a: Get text features
+        print(f"[DATASET FORMATTING] Starting open1a...")
+        for i_part in range(all_parts):
+            env = os.environ.copy()
+            env.update({
+                "PYTHONPATH": os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")]),
+                "inp_text": request.inp_text,
+                "inp_wav_dir": request.inp_wav_dir,
+                "exp_name": request.exp_name,
+                "opt_dir": opt_dir,
+                "bert_pretrained_dir": request.bert_pretrained_dir,
+                "i_part": str(i_part),
+                "all_parts": str(all_parts),
+                "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])),
+                "is_half": str(is_half),
+            })
+            cmd = [python_exec, "GPT_SoVITS/prepare_datasets/1-get-text.py"]
+            print(f"[DATASET FORMATTING] Executing 1a part {i_part}: {' '.join(cmd)}")
+            await asyncio.to_thread(subprocess.run, cmd, env=env, cwd=now_dir, check=True)
+
+        # Merge text files from 1a stage
+        opt = []
+        path_text = f"{opt_dir}/2-name2text.txt"
+        for i_part in range(all_parts):
+            text_path = f"{opt_dir}/2-name2text-{i_part}.txt"
+            if os.path.exists(text_path):
+                with open(text_path, "r", encoding="utf8") as f:
+                    opt += f.read().strip("\n").split("\n")
+                os.remove(text_path)
+
+        with open(path_text, "w", encoding="utf8") as f:
+            f.write("\n".join(opt) + "\n")
+
+        # Stage 1b: Get hubert features
+        jobs[job_id]["current_stage"] = "open1b"
+        print(f"[DATASET FORMATTING] Starting open1b...")
+        sv_path = "GPT_SoVITS/pretrained_models/sv/pretrained_eres2netv2w24s4ep4.ckpt"
+
+        for i_part in range(all_parts):
+            env = os.environ.copy()
+            env.update({
+                "PYTHONPATH": os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")]),
+                "inp_text": request.inp_text,
+                "inp_wav_dir": request.inp_wav_dir,
+                "exp_name": request.exp_name,
+                "opt_dir": opt_dir,
+                "cnhubert_base_dir": request.ssl_pretrained_dir,
+                "sv_path": sv_path,
+                "is_half": str(is_half),
+                "i_part": str(i_part),
+                "all_parts": str(all_parts),
+                "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])),
+            })
+            cmd = [python_exec, "GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py"]
+            print(f"[DATASET FORMATTING] Executing 1b part {i_part}: {' '.join(cmd)}")
+            await asyncio.to_thread(subprocess.run, cmd, env=env, cwd=now_dir, check=True)
+
+        # For v2Pro version, also run 2-get-sv.py
+        if "Pro" in request.version:
+            for i_part in range(all_parts):
+                env = os.environ.copy()
+                env.update({
+                    "PYTHONPATH": os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")]),
+                    "i_part": str(i_part),
+                    "all_parts": str(all_parts),
+                    "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])),
+                    "exp_dir": opt_dir,
+                    "sv_path": sv_path,
+                    "is_half": str(is_half),
+                })
+                cmd = [python_exec, "GPT_SoVITS/prepare_datasets/2-get-sv.py"]
+                print(f"[DATASET FORMATTING] Executing 2-get-sv part {i_part}: {' '.join(cmd)}")
+                await asyncio.to_thread(subprocess.run, cmd, env=env, cwd=now_dir, check=True)
+
+        # Stage 1c: Get semantic features
+        jobs[job_id]["current_stage"] = "open1c"
+        print(f"[DATASET FORMATTING] Starting open1c...")
+        config_file = (
+            "GPT_SoVITS/configs/s2.json"
+            if request.version not in {"v2Pro", "v2ProPlus"}
+            else f"GPT_SoVITS/configs/s2{request.version}.json"
+        )
+
+        for i_part in range(all_parts):
+            env = os.environ.copy()
+            env.update({
+                "PYTHONPATH": os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")]),
+                "inp_text": request.inp_text,
+                "exp_name": request.exp_name,
+                "opt_dir": opt_dir,
+                "pretrained_s2G": request.pretrained_s2G_path,
+                "s2config_path": config_file,
+                "is_half": str(is_half),
+                "i_part": str(i_part),
+                "all_parts": str(all_parts),
+                "_CUDA_VISIBLE_DEVICES": str(fix_gpu_number(gpu_names[i_part])),
+            })
+            cmd = [python_exec, "GPT_SoVITS/prepare_datasets/3-get-semantic.py"]
+            print(f"[DATASET FORMATTING] Executing 1c part {i_part}: {' '.join(cmd)}")
+            await asyncio.to_thread(subprocess.run, cmd, env=env, cwd=now_dir, check=True)
+
+        # Merge semantic files (from open1c logic in webui.py)
+        opt = ["item_name\tsemantic_audio"]
+        path_semantic = f"{opt_dir}/6-name2semantic.tsv"
+        for i_part in range(all_parts):
+            semantic_path = f"{opt_dir}/6-name2semantic-{i_part}.tsv"
+            if os.path.exists(semantic_path):
+                with open(semantic_path, "r", encoding="utf8") as f:
+                    opt += f.read().strip("\n").split("\n")
+                os.remove(semantic_path)
+
+        with open(path_semantic, "w", encoding="utf8") as f:
+            f.write("\n".join(opt))
+
+        jobs[job_id]["status"] = "completed"
+        jobs[job_id]["result"] = {
+            "exp_name": request.exp_name,
+            "stages_completed": ["open1a", "open1b", "open1c"]
+        }
+        jobs[job_id]["completed_at"] = datetime.now().isoformat()
+
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        jobs[job_id]["traceback"] = traceback.format_exc()
+        jobs[job_id]["failed_at"] = datetime.now().isoformat()
+
+
+@APP.post("/training/format-dataset")
+async def format_dataset_endpoint(request: DatasetFormattingRequest):
+    """
+    Start dataset formatting job (open1a -> open1b -> open1c).
+
+    Wraps webui.open1a(), open1b(), open1c() sequentially.
+    """
+    # DEBUG: Print received payload
+    print(f"\n{'='*80}")
+    print(f"[DATASET FORMATTING DEBUG] Received request:")
+    print(f"  version: {request.version}")
+    print(f"  inp_text: {request.inp_text}")
+    print(f"  inp_wav_dir: {request.inp_wav_dir}")
+    print(f"  exp_name: {request.exp_name}")
+    print(f"  gpu_numbers1a: {request.gpu_numbers}")
+    print(f"  bert_pretrained_dir: {request.bert_pretrained_dir}")
+    print(f"  ssl_pretrained_dir: {request.ssl_pretrained_dir}")
+    print(f"  pretrained_s2G_path: {request.pretrained_s2G_path}")
+    print(f"{'='*80}\n")
+
+    job_id = str(uuid.uuid4())
+    jobs[job_id] = {
+        "status": "queued",
+        "operation": "format_dataset",
+        "created_at": datetime.now().isoformat()
+    }
+
+    try:
+        asyncio.create_task(execute_dataset_formatting(job_id, request))
+        return JSONResponse(status_code=200, content={"job_id": job_id, "status": "queued"})
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        return JSONResponse(status_code=500, content={"message": "failed to start job", "error": str(e)})
+
+
+async def execute_fine_tune_sovits_direct(job_id: str, request: FineTuneSoVITSRequest):
+    """
+    Execute SoVITS fine-tuning by directly calling s2_train.py subprocess.
+    Replaces webui.open1Ba() to avoid Gradio dependency.
+    """
+    jobs[job_id]["status"] = "running"
+    jobs[job_id]["started_at"] = datetime.now().isoformat()
+
+    try:
+        s2_dir = f"{exp_root}/{request.exp_name}"
+        os.makedirs(f"{s2_dir}/logs_s2_{request.version}", exist_ok=True)
+
+        # Load config template
+        config_file = (
+            "GPT_SoVITS/configs/s2.json"
+            if request.version not in {"v2Pro", "v2ProPlus"}
+            else f"GPT_SoVITS/configs/s2{request.version}.json"
+        )
+        with open(config_file) as f:
+            data = json.loads(f.read())
+
+        # Update config with request parameters
+        batch_size = request.batch_size
+        if is_half == False:
+            data["train"]["fp16_run"] = False
+            batch_size = max(1, batch_size // 2)
+
+        data["train"]["batch_size"] = batch_size
+        data["train"]["epochs"] = request.total_epoch
+        data["train"]["text_low_lr_rate"] = request.text_low_lr_rate
+        data["train"]["pretrained_s2G"] = request.pretrained_s2G
+        data["train"]["pretrained_s2D"] = request.pretrained_s2D
+        data["train"]["if_save_latest"] = request.if_save_latest
+        data["train"]["if_save_every_weights"] = request.if_save_every_weights
+        data["train"]["save_every_epoch"] = request.save_every_epoch
+        data["train"]["gpu_numbers"] = request.gpu_numbers1Ba
+        data["train"]["grad_ckpt"] = request.if_grad_ckpt
+        data["train"]["lora_rank"] = request.lora_rank
+        data["model"]["version"] = request.version
+        data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir
+        data["save_weight_dir"] = SoVITS_weight_version2root[request.version]
+        data["name"] = request.exp_name
+        data["version"] = request.version
+
+        # Write temporary config
+        tmp_config_path = f"{now_dir}/TEMP/tmp_s2.json"
+        os.makedirs(f"{now_dir}/TEMP", exist_ok=True)
+        with open(tmp_config_path, "w") as f:
+            f.write(json.dumps(data))
+
+        # Prepare environment with PYTHONPATH
+        env = os.environ.copy()
+        env["PYTHONPATH"] = os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")])
+
+        # Determine training script based on version
+        if request.version in ["v1", "v2", "v2Pro", "v2ProPlus"]:
+            cmd = [python_exec, "GPT_SoVITS/s2_train.py", "--config", tmp_config_path]
+        else:
+            cmd = [python_exec, "GPT_SoVITS/s2_train_v3_lora.py", "--config", tmp_config_path]
+
+        print(f"[SOVITS FINE-TUNING] Executing: {' '.join(cmd)}")
+        result = await asyncio.to_thread(subprocess.run, cmd, env=env, cwd=now_dir, check=True)
+
+        # Find latest SoVITS checkpoint
+        sovits_weights_dir = data["save_weight_dir"]
+        latest_sovits_checkpoint = None
+
+        if os.path.exists(sovits_weights_dir):
+            import re
+            pattern = re.compile(rf"^{re.escape(request.exp_name)}_e(\d+)_s(\d+)_l(\d+)\.pth$")
+            checkpoints = []
+            for filename in os.listdir(sovits_weights_dir):
+                match = pattern.match(filename)
+                if match:
+                    epoch = int(match.group(1))
+                    step = int(match.group(2))
+                    checkpoints.append((epoch, step, filename))
+
+            if checkpoints:
+                checkpoints.sort(reverse=True)
+                latest_filename = checkpoints[0][2]
+                latest_sovits_checkpoint = os.path.join(sovits_weights_dir, latest_filename)
+                print(f"[SOVITS FINE-TUNING] Latest checkpoint: {latest_sovits_checkpoint}")
+
+        jobs[job_id]["status"] = "completed"
+        jobs[job_id]["result"] = {
+            "exp_name": request.exp_name,
+            "config_path": tmp_config_path,
+            "checkpoint_path": latest_sovits_checkpoint,
+            "sovits_checkpoint_path": latest_sovits_checkpoint
+        }
+        jobs[job_id]["completed_at"] = datetime.now().isoformat()
+
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        jobs[job_id]["traceback"] = traceback.format_exc()
+        jobs[job_id]["failed_at"] = datetime.now().isoformat()
+
+
+@APP.post("/training/fine-tune-sovits")
+async def fine_tune_sovits_endpoint(request: FineTuneSoVITSRequest):
+    """
+    Start SoVITS fine-tuning job.
+
+    Directly executes s2_train.py (no webui dependency).
+    """
+    # DEBUG: Print received payload
+    print(f"\n{'='*80}")
+    print(f"[SOVITS FINE-TUNING DEBUG] Received request:")
+    print(f"  version: {request.version}")
+    print(f"  batch_size: {request.batch_size}")
+    print(f"  total_epoch: {request.total_epoch}")
+    print(f"  exp_name: {request.exp_name}")
+    print(f"  text_low_lr_rate: {request.text_low_lr_rate}")
+    print(f"  if_save_latest: {request.if_save_latest}")
+    print(f"  if_save_every_weights: {request.if_save_every_weights}")
+    print(f"  save_every_epoch: {request.save_every_epoch}")
+    print(f"  gpu_numbers1Ba: {request.gpu_numbers1Ba}")
+    print(f"  pretrained_s2G: {request.pretrained_s2G}")
+    print(f"  pretrained_s2D: {request.pretrained_s2D}")
+    print(f"  if_grad_ckpt: {request.if_grad_ckpt}")
+    print(f"  lora_rank: {request.lora_rank}")
+    print(f"{'='*80}\n")
+
+    job_id = str(uuid.uuid4())
+    jobs[job_id] = {
+        "status": "queued",
+        "operation": "fine_tune_sovits",
+        "created_at": datetime.now().isoformat()
+    }
+
+    try:
+        asyncio.create_task(execute_fine_tune_sovits_direct(job_id, request))
+        return JSONResponse(status_code=200, content={"job_id": job_id, "status": "queued"})
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        return JSONResponse(status_code=500, content={"message": "failed to start job", "error": str(e)})
+
+
+@APP.post("/training/fine-tune-gpt")
+async def fine_tune_gpt_endpoint(request: FineTuneGPTRequest):
+    """
+    Start GPT fine-tuning job.
+
+    Wraps webui.open1Bb()
+    """
+    # DEBUG: Print received payload
+    print(f"\n{'='*80}")
+    print(f"[GPT FINE-TUNING DEBUG] Received request:")
+    print(f"  batch_size: {request.batch_size}")
+    print(f"  total_epoch: {request.total_epoch}")
+    print(f"  exp_name: {request.exp_name}")
+    print(f"  if_dpo: {request.if_dpo}")
+    print(f"  if_save_latest: {request.if_save_latest}")
+    print(f"  if_save_every_weights: {request.if_save_every_weights}")
+    print(f"  save_every_epoch: {request.save_every_epoch}")
+    print(f"  gpu_numbers: {request.gpu_numbers}")
+    print(f"  pretrained_s1: {request.pretrained_s1}")
+    print(f"{'='*80}\n")
+
+    job_id = str(uuid.uuid4())
+    jobs[job_id] = {
+        "status": "queued",
+        "operation": "fine_tune_gpt",
+        "created_at": datetime.now().isoformat()
+    }
+
+    try:
+        asyncio.create_task(execute_fine_tune_gpt_direct(job_id, request))
+        return JSONResponse(status_code=200, content={"job_id": job_id, "status": "queued"})
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        return JSONResponse(status_code=500, content={"message": "failed to start job", "error": str(e)})
+
+
+async def execute_fine_tune_gpt_direct(job_id: str, request: FineTuneGPTRequest):
+    """
+    Execute GPT fine-tuning by directly calling s1_train.py subprocess.
+    Replaces webui.open1Bb() to avoid Gradio dependency.
+    """
+    jobs[job_id]["status"] = "running"
+    jobs[job_id]["started_at"] = datetime.now().isoformat()
+
+    try:
+        s1_dir = f"{exp_root}/{request.exp_name}"
+        os.makedirs(f"{s1_dir}/logs_s1", exist_ok=True)
+
+        # Determine version (from webui.py line 606)
+        version = os.environ.get("version", "v4")
+
+        # Load config template
+        config_path = (
+            "GPT_SoVITS/configs/s1longer.yaml" if version == "v1"
+            else "GPT_SoVITS/configs/s1longer-v2.yaml"
+        )
+        with open(config_path) as f:
+            data = yaml.load(f.read(), Loader=yaml.FullLoader)
+
+        # Update config with request parameters
+        batch_size = request.batch_size
+        if is_half == False:
+            data["train"]["precision"] = "32"
+            batch_size = max(1, batch_size // 2)
+
+        data["train"]["batch_size"] = batch_size
+        data["train"]["epochs"] = request.total_epoch
+        data["pretrained_s1"] = request.pretrained_s1
+        data["train"]["save_every_n_epoch"] = request.save_every_epoch
+        data["train"]["if_save_every_weights"] = request.if_save_every_weights
+        data["train"]["if_save_latest"] = request.if_save_latest
+        data["train"]["if_dpo"] = request.if_dpo
+        data["train"]["half_weights_save_dir"] = GPT_weight_version2root[version]
+        data["train"]["exp_name"] = request.exp_name
+        data["train_semantic_path"] = f"{s1_dir}/6-name2semantic.tsv"
+        data["train_phoneme_path"] = f"{s1_dir}/2-name2text.txt"
+        data["output_dir"] = f"{s1_dir}/logs_s1_{version}"
+
+        # Set environment variables for GPU and PYTHONPATH
+        env = os.environ.copy()
+        env["PYTHONPATH"] = os.pathsep.join([now_dir, os.path.join(now_dir, "GPT_SoVITS")])
+        env["_CUDA_VISIBLE_DEVICES"] = fix_gpu_numbers(request.gpu_numbers.replace("-", ","))
+        env["hz"] = "25hz"
+
+        # Write temporary config
+        tmp_config_path = f"{now_dir}/TEMP/tmp_s1.yaml"
+        os.makedirs(f"{now_dir}/TEMP", exist_ok=True)
+        with open(tmp_config_path, "w") as f:
+            f.write(yaml.dump(data, default_flow_style=False))
+
+        # Execute training
+        cmd = [python_exec, "GPT_SoVITS/s1_train.py", "--config_file", tmp_config_path]
+        print(f"[GPT FINE-TUNING] Executing: {' '.join(cmd)}")
+        result = await asyncio.to_thread(subprocess.run, cmd, env=env, cwd=now_dir, check=True)
+
+        # Find latest GPT checkpoint
+        gpt_weights_dir = data["train"]["half_weights_save_dir"]
+        latest_gpt_checkpoint = None
+
+        if os.path.exists(gpt_weights_dir):
+            import re
+            pattern = re.compile(rf"^{re.escape(request.exp_name)}-e(\d+)\.ckpt$")
+            checkpoints = []
+            for filename in os.listdir(gpt_weights_dir):
+                match = pattern.match(filename)
+                if match:
+                    epoch = int(match.group(1))
+                    checkpoints.append((epoch, filename))
+
+            if checkpoints:
+                checkpoints.sort(reverse=True)
+                latest_filename = checkpoints[0][1]
+                latest_gpt_checkpoint = os.path.join(gpt_weights_dir, latest_filename)
+                print(f"[GPT FINE-TUNING] Latest checkpoint: {latest_gpt_checkpoint}")
+
+        jobs[job_id]["status"] = "completed"
+        jobs[job_id]["result"] = {
+            "exp_name": request.exp_name,
+            "config_path": tmp_config_path,
+            "checkpoint_path": latest_gpt_checkpoint,
+            "gpt_checkpoint_path": latest_gpt_checkpoint
+        }
+        jobs[job_id]["completed_at"] = datetime.now().isoformat()
+
+    except Exception as e:
+        jobs[job_id]["status"] = "failed"
+        jobs[job_id]["error"] = str(e)
+        jobs[job_id]["traceback"] = traceback.format_exc()
+        jobs[job_id]["failed_at"] = datetime.now().isoformat()
+
+
 if __name__ == "__main__":
    try:
        if host == "None":  # 在调用时使用 -a None 参数，可以让api监听双栈