流式API

This commit is contained in:
XL 2025-07-15 11:38:15 +08:00
parent bed73febf3
commit 0d405f2d2a
11 changed files with 55 additions and 87 deletions

View File

@ -1,6 +1,3 @@
GPT_SoVITS/pretrained_models/*
tools/asr/models/*
tools/uvr5/uvr5_weights/*
.git .git
.DS_Store .DS_Store
@ -11,10 +8,7 @@ runtime
.idea .idea
output output
logs logs
SoVITS_weights*/
GPT_weights*/
TEMP TEMP
weight.json
ffmpeg* ffmpeg*
ffprobe* ffprobe*
cfg.json cfg.json

View File

@ -18,7 +18,7 @@ ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretr
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel
bash install.sh --device "CU${CUDA_VERSION//./}" --source HF bash install.sh --device "MPS" --source HF
pip cache purge pip cache purge

View File

@ -1,62 +1,20 @@
ARG CUDA_VERSION=12.6 FROM python:3.10.18-bullseye
ARG TORCH_BASE=full
FROM xxxxrt666/torch-base:cu${CUDA_VERSION}-${TORCH_BASE} LABEL version="V2pro"
LABEL maintainer="XXXXRT"
LABEL version="V4"
LABEL description="Docker image for GPT-SoVITS" LABEL description="Docker image for GPT-SoVITS"
ARG CUDA_VERSION=12.6 WORKDIR /GPT-SoVITS
COPY requirements.txt /GPT-SoVITS
RUN pip install -r requirements.txt
ENV CUDA_VERSION=${CUDA_VERSION} COPY GPT_SoVITS /GPT-SoVITS/GPT_SoVITS
COPY tools /GPT-SoVITS/tools
COPY api.py /GPT-SoVITS
COPY api_v2.py /GPT-SoVITS
COPY config.py /GPT-SoVITS
COPY webui.py /GPT-SoVITS
COPY ref_audio /GPT-SoVITS/ref_audio
SHELL ["/bin/bash", "-c"] EXPOSE 9871 9872 9873 9874 9880 8001 8002
WORKDIR /workspace/GPT-SoVITS CMD ["/bin/bash", "-c", "python GPT_SoVITS/inference_webui_api.py"]
COPY Docker /workspace/GPT-SoVITS/Docker/
ARG LITE=false
ENV LITE=${LITE}
ARG WORKFLOW=false
ENV WORKFLOW=${WORKFLOW}
ARG TARGETPLATFORM
ENV TARGETPLATFORM=${TARGETPLATFORM}
RUN bash Docker/miniconda_install.sh
COPY extra-req.txt /workspace/GPT-SoVITS/
COPY requirements.txt /workspace/GPT-SoVITS/
COPY install.sh /workspace/GPT-SoVITS/
RUN bash Docker/install_wrapper.sh
EXPOSE 9871 9872 9873 9874 9880
ENV PYTHONPATH="/workspace/GPT-SoVITS"
RUN conda init bash && echo "conda activate base" >> ~/.bashrc
WORKDIR /workspace
RUN rm -rf /workspace/GPT-SoVITS
WORKDIR /workspace/GPT-SoVITS
COPY . /workspace/GPT-SoVITS
CMD ["/bin/bash", "-c", "\
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
rm -rf /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
rm -rf /workspace/GPT-SoVITS/tools/asr/models && \
rm -rf /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
ln -s /workspace/models/pretrained_models /workspace/GPT-SoVITS/GPT_SoVITS/pretrained_models && \
ln -s /workspace/models/G2PWModel /workspace/GPT-SoVITS/GPT_SoVITS/text/G2PWModel && \
ln -s /workspace/models/asr_models /workspace/GPT-SoVITS/tools/asr/models && \
ln -s /workspace/models/uvr5_weights /workspace/GPT-SoVITS/tools/uvr5/uvr5_weights && \
exec bash"]

View File

@ -3,9 +3,9 @@ custom:
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
device: cpu device: cpu
is_half: false is_half: false
t2s_weights_path: GPT_weights_v2ProPlus/111-e15.ckpt t2s_weights_path: GPT_SoVITS/pretrained_models/meiv2pp-e15.ckpt
version: v2Pro version: v2
vits_weights_path: SoVITS_weights_v2ProPlus/111_e8_s136.pth vits_weights_path: GPT_SoVITS/pretrained_models/meiv2pp_e8_s232.pth
v1: v1:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base

View File

@ -525,29 +525,30 @@ import tempfile
import shutil import shutil
import os import os
from pydantic import BaseModel from pydantic import BaseModel
import soundfile as sf
app = FastAPI() app = FastAPI()
class InferenceRequest(BaseModel): class InferenceRequest(BaseModel):
text: str text: str
text_lang: str = "中文" text_lang: str = i18n("中文")
ref_audio: str # 这里是base64编码的音频文件内容 ref_audio: str # 这里是base64编码的音频文件内容
prompt_text: str = "" prompt_text: str
prompt_lang: str = "中文" prompt_lang: str = i18n("中文")
top_k: int = 5 top_k: int = 6
top_p: float = 1 top_p: float = 0.9
temperature: float = 1 temperature: float = 0.95
text_split_method: str = "按标点符号切" text_split_method: str = i18n("按标点符号切")
batch_size: int = 20 batch_size: int = 20
speed_factor: float = 1.1 speed_factor: float = 1.1
ref_text_free: bool = True ref_text_free: bool = False
split_bucket: bool = True split_bucket: bool = True
fragment_interval: float = 0.3 fragment_interval: float = 0.3
seed: int = -1 seed: int = -1
keep_random: bool = True keep_random: bool = True
parallel_infer: bool = True parallel_infer: bool = True
repetition_penalty: float = 1.35 repetition_penalty: float = 1.45
sample_steps: int = 32 sample_steps: int = 32
super_sampling: bool = False super_sampling: bool = False
@ -632,24 +633,21 @@ def wav_chunk_streamer(infer_gen):
wav_file.writeframes(audio.tobytes()) wav_file.writeframes(audio.tobytes())
return buffer.getvalue() return buffer.getvalue()
for wav_data, _ in infer_gen: for audio, _ in infer_gen:
sr, audio = wav_data audio_data = audio[0] if isinstance(audio[0], np.ndarray) else audio[1]
if not isinstance(audio, np.ndarray): yield encode_wav_chunk(32000, audio_data) # 每段 WAV 数据
audio = np.array(audio)
if audio.dtype != np.int16:
audio = (audio * 32768).astype(np.int16)
yield encode_wav_chunk(sr, audio) # 每段 WAV 数据
@app.post("/tts_stream") @app.post("/tts_stream")
async def api_inference(req: InferenceRequest): async def api_inference(req: InferenceRequest):
try: try:
infer_gen = inference( infer_gen = inference(
text=req.text, text=req.text,
text_lang=req.text_lang, text_lang=i18n(req.text_lang),
ref_audio_path=req.ref_audio, ref_audio_path=req.ref_audio,
aux_ref_audio_paths=[], aux_ref_audio_paths=[],
prompt_text=req.prompt_text, prompt_text=req.prompt_text,
prompt_lang=req.prompt_lang, prompt_lang=i18n(req.prompt_lang),
top_k=req.top_k, top_k=req.top_k,
top_p=req.top_p, top_p=req.top_p,
temperature=req.temperature, temperature=req.temperature,
@ -683,4 +681,5 @@ async def api_inference(req: InferenceRequest):
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8001) port = int(os.environ.get("PORT", 8001)) # 默认端口8001
uvicorn.run(app, host="0.0.0.0", port=port)

View File

@ -31,6 +31,11 @@ import torch
import logging import logging
import time import time
import numpy
# 在文件开头添加输出目录配置
output_dir = os.environ.get("output_dir", "outputs")
os.makedirs(output_dir, exist_ok=True)
now_dir = os.getcwd() now_dir = os.getcwd()
sys.path.append(now_dir) sys.path.append(now_dir)
@ -206,8 +211,19 @@ def inference(
start_time = time.time() start_time = time.time()
for item in tts_pipeline.run(inputs): for audio in tts_pipeline.run(inputs):
yield item, actual_seed if isinstance(audio, tuple):
# 保存到本地
output_filename = f"tts_{int(time.time())}.wav"
output_path = os.path.join(output_dir, output_filename)
audio_data = audio[0] if isinstance(audio[0], numpy.ndarray) else audio[1]
import soundfile as sf
sf.write(output_path, audio_data, 32000)
logging.info(f"音频已保存至: {output_path}")
# 返回原始音频数据给 Gradio
yield audio, actual_seed
else:
yield audio, actual_seed
logging.info( logging.info(
f"TTS请求耗时: {time.time() - start_time:.3f}s | 文本: {text}" f"TTS请求耗时: {time.time() - start_time:.3f}s | 文本: {text}"

Binary file not shown.

View File

@ -0,0 +1 @@
d36bd5ffba62f195d22bf4f1a41cd08f

BIN
ref_audio/1.wav Normal file

Binary file not shown.

BIN
ref_audio/2.wav Normal file

Binary file not shown.

BIN
ref_audio/3.wav Normal file

Binary file not shown.