diff --git a/GPT_SoVITS/inference_webui.py b/GPT_SoVITS/inference_webui.py index bca4a43e..668b36ae 100644 --- a/GPT_SoVITS/inference_webui.py +++ b/GPT_SoVITS/inference_webui.py @@ -17,7 +17,6 @@ logging.getLogger("httpx").setLevel(logging.ERROR) logging.getLogger("asyncio").setLevel(logging.ERROR) logging.getLogger("charset_normalizer").setLevel(logging.ERROR) logging.getLogger("torchaudio._extension").setLevel(logging.ERROR) -import pdb import torch if os.path.exists("./gweight.txt"): @@ -66,7 +65,7 @@ from text import cleaned_text_to_sequence from text.cleaner import clean_text from time import time as ttime from module.mel_processing import spectrogram_torch -from my_utils import load_audio +from pyutils.np_utils import load_audio from tools.i18n.i18n import I18nAuto i18n = I18nAuto() diff --git a/GPT_SoVITS/module/data_utils.py b/GPT_SoVITS/module/data_utils.py index ff4c4f43..ba870b85 100644 --- a/GPT_SoVITS/module/data_utils.py +++ b/GPT_SoVITS/module/data_utils.py @@ -1,23 +1,14 @@ -import time -import logging import os import random import traceback -import numpy as np import torch import torch.utils.data from tqdm import tqdm -from module import commons from module.mel_processing import spectrogram_torch from text import cleaned_text_to_sequence -from utils import load_wav_to_torch, load_filepaths_and_text import torch.nn.functional as F -from functools import lru_cache -import requests -from scipy.io import wavfile -from io import BytesIO -from my_utils import load_audio +from pyutils.np_utils import load_audio # ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79) class TextAudioSpeakerLoader(torch.utils.data.Dataset): diff --git a/GPT_SoVITS/onnx_export.py b/GPT_SoVITS/onnx_export.py index b82e987f..f4132f9e 100644 --- a/GPT_SoVITS/onnx_export.py +++ b/GPT_SoVITS/onnx_export.py @@ -9,7 +9,6 @@ cnhubert.cnhubert_base_path=cnhubert_base_path ssl_model = cnhubert.get_model() from text import cleaned_text_to_sequence import 
soundfile -from my_utils import load_audio import os import json diff --git a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py index 9a2f73c0..1c539d27 100644 --- a/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py +++ b/GPT_SoVITS/prepare_datasets/2-get-hubert-wav32k.py @@ -12,12 +12,12 @@ opt_dir= os.environ.get("opt_dir") cnhubert.cnhubert_base_path= os.environ.get("cnhubert_base_dir") is_half=eval(os.environ.get("is_half","True")) -import pdb,traceback,numpy as np,logging +import traceback,numpy as np from scipy.io import wavfile import librosa,torch now_dir = os.getcwd() sys.path.append(now_dir) -from my_utils import load_audio +from pyutils.np_utils import load_audio # from config import cnhubert_base_path # cnhubert.cnhubert_base_path=cnhubert_base_path diff --git a/GPT_SoVITS/my_utils.py b/GPT_SoVITS/pyutils/np_utils.py similarity index 97% rename from GPT_SoVITS/my_utils.py rename to GPT_SoVITS/pyutils/np_utils.py index 776939dd..a5258394 100644 --- a/GPT_SoVITS/my_utils.py +++ b/GPT_SoVITS/pyutils/np_utils.py @@ -1,21 +1,21 @@ -import ffmpeg -import numpy as np - - -def load_audio(file, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
- file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() +import ffmpeg +import numpy as np + + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() diff --git a/api.md b/api.md new file mode 100644 index 00000000..1c0e3a87 --- /dev/null +++ b/api.md @@ -0,0 +1,115 @@ +# api + +` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" ` + +## 执行参数: + +`-s` - `SoVITS模型路径, 可在 config.py 中指定` +`-g` - `GPT模型路径, 可在 config.py 中指定` + +调用请求缺少参考音频时使用 +`-dr` - `默认参考音频路径` +`-dt` - `默认参考音频文本` +`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"` + +`-d` - `推理设备, "cuda","cpu"` +`-a` - `绑定地址, 默认"127.0.0.1"` +`-p` - `绑定端口, 默认9880, 可在 config.py 中指定` +`-fp` - `覆盖 config.py 使用全精度` +`-hp` - `覆盖 config.py 使用半精度` +`-sm` - `流式返回模式, 默认不启用, "close","c", "normal","n", "keepalive","k"` +`-mt` - `返回的音频编码格式, 流式默认ogg, 非流式默认wav, "wav", "ogg", "aac"` +`-cp` - `文本切分符号设定, 默认为空, 以",.,。"字符串的方式传入` + +`-hb` - `cnhubert路径` +`-b` - `bert路径` + +## 调用: + +### 推理 + +endpoint: `/` + +使用执行参数指定的参考音频: +- GET: + 
`http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh` + +- POST: +```json +{ + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh" +} +``` + +使用执行参数指定的参考音频并设定分割符号: +- GET: + `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。` +- POST: +```json +{ + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh", + "cut_punc": ",。" +} +``` + +手动指定当次推理所使用的参考音频: +- GET: + `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh` +- POST: +```json +{ + "refer_wav_path": "123.wav", + "prompt_text": "一二三。", + "prompt_language": "zh", + "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", + "text_language": "zh" +} +``` + +RESP: +- 成功: 直接返回 wav 音频流, http code 200 +- 失败: 返回包含错误信息的 json, http code 400 + + +### 更换默认参考音频 + +endpoint: `/change_refer` + +key与推理端一样 + +- GET: + `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh` +- POST: +```json +{ + "refer_wav_path": "123.wav", + "prompt_text": "一二三。", + "prompt_language": "zh" +} +``` + +RESP: +成功: json, http code 200 +失败: json, 400 + + +### 命令控制 + +endpoint: `/control` + +command: +"restart": 重新运行 +"exit": 结束运行 + +- GET: + `http://127.0.0.1:9880/control?command=restart` +- POST: +```json +{ + "command": "restart" +} +``` + +RESP: 无 \ No newline at end of file diff --git a/api.py b/api.py index 041fa349..7b6b07f3 100644 --- a/api.py +++ b/api.py @@ -1,129 +1,10 @@ -""" -# api.py usage - -` python api.py -dr "123.wav" -dt "一二三。" -dl "zh" ` - -## 执行参数: - -`-s` - `SoVITS模型路径, 可在 config.py 中指定` -`-g` - `GPT模型路径, 可在 config.py 中指定` - -调用请求缺少参考音频时使用 -`-dr` - `默认参考音频路径` -`-dt` - `默认参考音频文本` -`-dl` - `默认参考音频语种, "中文","英文","日文","zh","en","ja"` - -`-d` - `推理设备, "cuda","cpu"` -`-a` - `绑定地址, 默认"127.0.0.1"` -`-p` - `绑定端口, 默认9880, 可在 config.py 中指定` -`-fp` - `覆盖 config.py 使用全精度` -`-hp` - `覆盖 config.py 使用半精度` -`-sm` - `流式返回模式, 默认不启用, 
"close","c", "normal","n", "keepalive","k"` -·-mt` - `返回的音频编码格式, 流式默认ogg, 非流式默认wav, "wav", "ogg", "aac"` -·-cp` - `文本切分符号设定, 默认为空, 以",.,。"字符串的方式传入` - -`-hb` - `cnhubert路径` -`-b` - `bert路径` - -## 调用: - -### 推理 - -endpoint: `/` - -使用执行参数指定的参考音频: -GET: - `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh` -POST: -```json -{ - "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", - "text_language": "zh" -} -``` - -使用执行参数指定的参考音频并设定分割符号: -GET: - `http://127.0.0.1:9880?text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh&cut_punc=,。` -POST: -```json -{ - "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", - "text_language": "zh", - "cut_punc": ",。", -} -``` - -手动指定当次推理所使用的参考音频: -GET: - `http://127.0.0.1:9880?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh&text=先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。&text_language=zh` -POST: -```json -{ - "refer_wav_path": "123.wav", - "prompt_text": "一二三。", - "prompt_language": "zh", - "text": "先帝创业未半而中道崩殂,今天下三分,益州疲弊,此诚危急存亡之秋也。", - "text_language": "zh" -} -``` - -RESP: -成功: 直接返回 wav 音频流, http code 200 -失败: 返回包含错误信息的 json, http code 400 - - -### 更换默认参考音频 - -endpoint: `/change_refer` - -key与推理端一样 - -GET: - `http://127.0.0.1:9880/change_refer?refer_wav_path=123.wav&prompt_text=一二三。&prompt_language=zh` -POST: -```json -{ - "refer_wav_path": "123.wav", - "prompt_text": "一二三。", - "prompt_language": "zh" -} -``` - -RESP: -成功: json, http code 200 -失败: json, 400 - - -### 命令控制 - -endpoint: `/control` - -command: -"restart": 重新运行 -"exit": 结束运行 - -GET: - `http://127.0.0.1:9880/control?command=restart` -POST: -```json -{ - "command": "restart" -} -``` - -RESP: 无 - -""" - - import argparse import os,re import sys -now_dir = os.getcwd() -sys.path.append(now_dir) -sys.path.append("%s/GPT_SoVITS" % (now_dir)) +current_project_dir = os.getcwd() +sys.path.append(current_project_dir) +sys.path.append("%s/GPT_SoVITS" % (current_project_dir)) import signal import LangSegment @@ -131,7 +12,7 @@ from time import time as ttime import torch 
import librosa import soundfile as sf -from fastapi import FastAPI, Request, HTTPException +from fastapi import FastAPI, Request from fastapi.responses import StreamingResponse, JSONResponse import uvicorn from transformers import AutoModelForMaskedLM, AutoTokenizer @@ -143,7 +24,7 @@ from AR.models.t2s_lightning_module import Text2SemanticLightningModule from text import cleaned_text_to_sequence from text.cleaner import clean_text from module.mel_processing import spectrogram_torch -from my_utils import load_audio +from pyutils.np_utils import load_audio import config as global_config import logging import subprocess @@ -159,7 +40,7 @@ class DefaultRefer: return is_full(self.path, self.text, self.language) -def is_empty(*items): # 任意一项不为空返回False +def is_not_empty(*items): # 任意一项不为空返回False for item in items: if item is not None and item != "": return False @@ -496,7 +377,7 @@ def handle_control(command): def handle_change(path, text, language): - if is_empty(path, text, language): + if is_not_empty(path, text, language): return JSONResponse({"code": 400, "message": '缺少任意一项以下参数: "path", "text", "language"'}, status_code=400) if path != "" or path is not None: