流式API

2026-06-04 05:01:27 +08:00 · 2025-07-15 11:38:15 +08:00 · 2025-07-15 11:38:15 +08:00 · 0d405f2d2a
commit 0d405f2d2a
parent bed73febf3
11 changed files with 55 additions and 87 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,6 +1,3 @@
 GPT_SoVITS/pretrained_models/*
 tools/asr/models/*
 tools/uvr5/uvr5_weights/*
 .git
 .DS_Store
@ -11,10 +8,7 @@ runtime
 .idea
 output
 logs
 SoVITS_weights*/
 GPT_weights*/
 TEMP
 weight.json
 ffmpeg*
 ffprobe*
 cfg.json
--- a/Docker/install_wrapper.sh
+++ b/Docker/install_wrapper.sh
@ -18,7 +18,7 @@ ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretr
 ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
-bash install.sh --device "CU${CUDA_VERSION//./}" --source HF
+bash install.sh --device "MPS" --source HF
 pip cache purge
--- a/70
+++ b/70
@ -1,62 +1,20 @@
-ARG CUDA_VERSION=12.6
+FROM python:3.10.18-bullseye
 ARG TORCH_BASE=full
-FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE}
+LABEL version="V2pro"
 LABEL maintainer="XXXXRT"
 LABEL version="V4"
 LABEL description="Docker image for GPT-SoVITS"
-ARG CUDA_VERSION=12.6
+WORKDIR /GPT-SoVITS
 COPY requirements.txt /GPT-SoVITS
 RUN pip install -r requirements.txt
-ENV CUDA_VERSION=${CUDA_VERSION}
+COPY GPT_SoVITS /GPT-SoVITS/GPT_SoVITS
 COPY tools /GPT-SoVITS/tools
 COPY api.py /GPT-SoVITS
 COPY api_v2.py /GPT-SoVITS
 COPY config.py /GPT-SoVITS
 COPY webui.py /GPT-SoVITS
 COPY ref_audio /GPT-SoVITS/ref_audio
-SHELL ["/bin/bash", "-c"]
+EXPOSE 9871 9872 9873 9874 9880 8001 8002
-WORKDIR /workspace/GPT-SoVITS
+CMD ["/bin/bash", "-c", "python GPT_SoVITS/inference_webui_api.py"]
 COPY Docker /workspace/GPT-SoVITS/Docker/
 ARG LITE=false
 ENV LITE=${LITE}
 ARG WORKFLOW=false
 ENV WORKFLOW=${WORKFLOW}
 ARG TARGETPLATFORM
 ENV TARGETPLATFORM=${TARGETPLATFORM}
 RUN bash Docker/miniconda_install.sh
 COPY extra-req.txt /workspace/GPT-SoVITS/
 COPY requirements.txt /workspace/GPT-SoVITS/
 COPY install.sh /workspace/GPT-SoVITS/
 RUN bash Docker/install_wrapper.sh
 EXPOSE 9871 9872 9873 9874 9880
 ENV PYTHONPATH="/workspace/GPT-SoVITS"
 RUN conda init bash && echo "conda activate base" >> ~/.bashrc
 WORKDIR /workspace
 RUN rm -rf /workspace/GPT-SoVITS
 WORKDIR /workspace/GPT-SoVITS
 COPY . /workspace/GPT-SoVITS
 CMD ["/bin/bash", "-c", "\
  rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
  rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
  rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
  rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
  ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
  ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
  ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
  ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
  exec bash"]
--- a/GPT_SoVITS/configs/tts_infer.yaml
+++ b/GPT_SoVITS/configs/tts_infer.yaml
@ -3,9 +3,9 @@ custom:
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
  device: cpu
  is_half: false
-  t2s_weights_path: GPT_weights_v2ProPlus/111-e15.ckpt
+  t2s_weights_path: GPT_SoVITS/pretrained_models/meiv2pp-e15.ckpt
-  version: v2Pro
+  version: v2
-  vits_weights_path: SoVITS_weights_v2ProPlus/111_e8_s136.pth
+  vits_weights_path: GPT_SoVITS/pretrained_models/meiv2pp_e8_s232.pth
 v1:
  bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
  cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
--- a/GPT_SoVITS/inference_webui_api.py
+++ b/GPT_SoVITS/inference_webui_api.py
@ -525,29 +525,30 @@ import tempfile
 import shutil
 import os
 from pydantic import BaseModel
 import soundfile as sf
 app = FastAPI()
 class InferenceRequest(BaseModel):
    text: str
-    text_lang: str = "中文"
+    text_lang: str = i18n("中文")
    ref_audio: str   # 这里是base64编码的音频文件内容
-    prompt_text: str = ""
+    prompt_text: str
-    prompt_lang: str = "中文"
+    prompt_lang: str = i18n("中文")
-    top_k: int = 5
+    top_k: int = 6
-    top_p: float = 1
+    top_p: float = 0.9
-    temperature: float = 1
+    temperature: float = 0.95
-    text_split_method: str = "按标点符号切"
+    text_split_method: str = i18n("按标点符号切")
    batch_size: int = 20
    speed_factor: float = 1.1
-    ref_text_free: bool = True
+    ref_text_free: bool = False
    split_bucket: bool = True
    fragment_interval: float = 0.3
    seed: int = -1
    keep_random: bool = True
    parallel_infer: bool = True
-    repetition_penalty: float = 1.35
+    repetition_penalty: float = 1.45
    sample_steps: int = 32
    super_sampling: bool = False
@ -632,24 +633,21 @@ def wav_chunk_streamer(infer_gen):
            wav_file.writeframes(audio.tobytes())
        return buffer.getvalue()
-    for wav_data, _ in infer_gen:
+    for audio, _ in infer_gen:
-        sr, audio = wav_data
+        audio_data = audio[0] if isinstance(audio[0], np.ndarray) else audio[1]
-        if not isinstance(audio, np.ndarray):
+        yield encode_wav_chunk(32000, audio_data)  # 每段 WAV 数据
-            audio = np.array(audio)
+
        if audio.dtype != np.int16:
            audio = (audio * 32768).astype(np.int16)
        yield encode_wav_chunk(sr, audio)  # 每段 WAV 数据
@app.post("/tts_stream")
 async def api_inference(req: InferenceRequest):
    try:
        infer_gen = inference(
            text=req.text,
-            text_lang=req.text_lang,
+            text_lang=i18n(req.text_lang),
            ref_audio_path=req.ref_audio,
            aux_ref_audio_paths=[],
            prompt_text=req.prompt_text,
-            prompt_lang=req.prompt_lang,
+            prompt_lang=i18n(req.prompt_lang),
            top_k=req.top_k,
            top_p=req.top_p,
            temperature=req.temperature,
@ -683,4 +681,5 @@ async def api_inference(req: InferenceRequest):
 if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8001)
+    port = int(os.environ.get("PORT", 8001))  # 默认端口8001
    uvicorn.run(app, host="0.0.0.0", port=port)
--- a/GPT_SoVITS/inference_webui_fast.py
+++ b/GPT_SoVITS/inference_webui_fast.py
@ -31,6 +31,11 @@ import torch
 import logging
 import time
 import numpy
 # 在文件开头添加输出目录配置
 output_dir = os.environ.get("output_dir", "outputs")
 os.makedirs(output_dir, exist_ok=True)
 now_dir = os.getcwd()
 sys.path.append(now_dir)
@ -206,8 +211,19 @@ def inference(
        start_time = time.time()
-        for item in tts_pipeline.run(inputs):
+        for audio in tts_pipeline.run(inputs):
-            yield item, actual_seed
+            if isinstance(audio, tuple):
                # 保存到本地
                output_filename = f"tts_{int(time.time())}.wav"
                output_path = os.path.join(output_dir, output_filename)
                audio_data = audio[0] if isinstance(audio[0], numpy.ndarray) else audio[1]
                import soundfile as sf
                sf.write(output_path, audio_data, 32000)
                logging.info(f"音频已保存至: {output_path}")
                # 返回原始音频数据给 Gradio
                yield audio, actual_seed
            else:
                yield audio, actual_seed
        logging.info(
            f"TTS请求耗时: {time.time() - start_time:.3f}s | 文本: {text}"
--- a/GPT_SoVITS/text/ja_userdic/user.dict
+++ b/GPT_SoVITS/text/ja_userdic/user.dict
--- a/GPT_SoVITS/text/ja_userdic/userdict.md5
+++ b/GPT_SoVITS/text/ja_userdic/userdict.md5
@ -0,0 +1 @@
 d36bd5ffba62f195d22bf4f1a41cd08f
--- a/ref_audio/1.wav
+++ b/ref_audio/1.wav
--- a/ref_audio/2.wav
+++ b/ref_audio/2.wav
--- a/ref_audio/3.wav
+++ b/ref_audio/3.wav