turn it into a package

Jarod Mica 2024-11-16 02:30:20 -08:00
parent a70e1ad30c
commit 71b2fe69ea
84 changed files with 643 additions and 563 deletions

View File

@@ -1,60 +1,66 @@
from copy import deepcopy
import math
import os, sys, gc
import random
import traceback
from tqdm import tqdm
now_dir = os.getcwd()
sys.path.append(now_dir)
import ffmpeg
import os
from typing import Generator, List, Tuple, Union
import librosa
import numpy as np
import random
import torch
import torch.nn.functional as F
import traceback
import yaml
from transformers import AutoModelForMaskedLM, AutoTokenizer
from huggingface_hub import snapshot_download, hf_hub_download
from importlib.resources import files
from time import time as ttime
from typing import Generator, List, Tuple, Union
from tqdm import tqdm
from AR.models.t2s_lightning_module import Text2SemanticLightningModule
from feature_extractor.cnhubert import CNHubert
from module.models import SynthesizerTrn
import librosa
from time import time as ttime
from module.mel_processing import spectrogram_torch
from tools.i18n.i18n import I18nAuto, scan_language_list
from tools.my_utils import load_audio
from module.mel_processing import spectrogram_torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
from TTS_infer_pack.text_segmentation_method import splits
from TTS_infer_pack.TextPreprocessor import TextPreprocessor
language=os.environ.get("language","Auto")
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
i18n = I18nAuto(language=language)
LIBRARY_NAME = "GPT_SoVITS"
# configs/tts_infer.yaml
"""
custom:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: pretrained_models/chinese-hubert-base
device: cpu
is_half: false
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
version: v2
default:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: pretrained_models/chinese-hubert-base
device: cpu
is_half: false
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
vits_weights_path: pretrained_models/s2G488k.pth
version: v1
default_v2:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: pretrained_models/chinese-hubert-base
device: cpu
is_half: false
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
version: v2
"""
@@ -86,19 +92,19 @@ class TTS_Config:
"device": "cpu",
"is_half": False,
"version": "v1",
"t2s_weights_path": "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"vits_weights_path": "GPT_SoVITS/pretrained_models/s2G488k.pth",
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
"t2s_weights_path": "pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt",
"vits_weights_path": "pretrained_models/s2G488k.pth",
"cnhuhbert_base_path": "pretrained_models/chinese-hubert-base",
"bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large",
},
"default_v2":{
"device": "cpu",
"is_half": False,
"version": "v2",
"t2s_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"vits_weights_path": "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"cnhuhbert_base_path": "GPT_SoVITS/pretrained_models/chinese-hubert-base",
"bert_base_path": "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large",
"t2s_weights_path": "pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt",
"vits_weights_path": "pretrained_models/gsv-v2final-pretrained/s2G2333k.pth",
"cnhuhbert_base_path": "pretrained_models/chinese-hubert-base",
"bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large",
},
}
configs:dict = None
@@ -120,7 +126,7 @@ class TTS_Config:
def __init__(self, configs: Union[dict, str]=None):
# set the default config file path
configs_base_path:str = "GPT_SoVITS/configs/"
configs_base_path:str = "configs/"
os.makedirs(configs_base_path, exist_ok=True)
self.configs_path:str = os.path.join(configs_base_path, "tts_infer.yaml")
@@ -152,23 +158,24 @@ class TTS_Config:
self.bert_base_path = self.configs.get("bert_base_path", None)
self.cnhuhbert_base_path = self.configs.get("cnhuhbert_base_path", None)
self.languages = self.v2_languages if self.version=="v2" else self.v1_languages
if (self.t2s_weights_path in [None, ""]) or (not os.path.exists(self.t2s_weights_path)):
self.t2s_weights_path = self.default_configs[default_config_key]['t2s_weights_path']
self.t2s_weights_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['t2s_weights_path']))
print(f"fall back to default t2s_weights_path: {self.t2s_weights_path}")
if (self.vits_weights_path in [None, ""]) or (not os.path.exists(self.vits_weights_path)):
self.vits_weights_path = self.default_configs[default_config_key]['vits_weights_path']
self.vits_weights_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['vits_weights_path']))
print(f"fall back to default vits_weights_path: {self.vits_weights_path}")
if (self.bert_base_path in [None, ""]) or (not os.path.exists(self.bert_base_path)):
self.bert_base_path = self.default_configs[default_config_key]['bert_base_path']
self.bert_base_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['bert_base_path']))
print(f"fall back to default bert_base_path: {self.bert_base_path}")
if (self.cnhuhbert_base_path in [None, ""]) or (not os.path.exists(self.cnhuhbert_base_path)):
self.cnhuhbert_base_path = self.default_configs[default_config_key]['cnhuhbert_base_path']
self.cnhuhbert_base_path = str(files(LIBRARY_NAME).joinpath(self.default_configs[default_config_key]['cnhuhbert_base_path']))
print(f"fall back to default cnhuhbert_base_path: {self.cnhuhbert_base_path}")
repo_name="lj1995/GPT-SoVITS"
snapshot_download(repo_id=repo_name, local_dir=os.path.dirname(self.bert_base_path))
self.update_configs()
self.max_sec = None
self.hz:int = 50
self.semantic_frame_rate:str = "25hz"
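For orientation, the fallback logic above leans on importlib.resources.files to resolve the default model paths inside the installed GPT_SoVITS package (rather than relative to the current working directory), and on snapshot_download to fetch missing pretrained weights. A minimal sketch of that pattern, assuming the package is installed as a normal (non-zipped) distribution and using the default paths and repo id shown in the config above:

from importlib.resources import files
from huggingface_hub import snapshot_download
import os

LIBRARY_NAME = "GPT_SoVITS"

# Resolve a default path relative to the installed package directory
# instead of the current working directory.
bert_base_path = str(files(LIBRARY_NAME).joinpath("pretrained_models/chinese-roberta-wwm-ext-large"))

# If the weights are not present yet, download them into the package's
# pretrained_models folder (same repo id used in TTS_Config above).
if not os.path.exists(bert_base_path):
    snapshot_download(repo_id="lj1995/GPT-SoVITS", local_dir=os.path.dirname(bert_base_path))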

View File

@@ -114,6 +114,8 @@ from fastapi import FastAPI, Request, HTTPException, Response
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi import FastAPI, UploadFile, File
import uvicorn
from importlib.resources import files
from io import BytesIO
from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
@@ -125,7 +127,7 @@ i18n = I18nAuto()
cut_method_names = get_cut_method_names()
parser = argparse.ArgumentParser(description="GPT-SoVITS api")
parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="tts_infer路径")
parser.add_argument("-c", "--tts_config", type=str, default=None, help="tts_infer路径")
parser.add_argument("-a", "--bind_addr", type=str, default="127.0.0.1", help="default: 127.0.0.1")
parser.add_argument("-p", "--port", type=int, default="9880", help="default: 9880")
args = parser.parse_args()
@@ -136,7 +138,7 @@ host = args.bind_addr
argv = sys.argv
if config_path in [None, ""]:
config_path = "GPT-SoVITS/configs/tts_infer.yaml"
config_path = str(files("GPT_SoVITS").joinpath("configs/tts_infer.yaml"))
tts_config = TTS_Config(config_path)
print(tts_config)
@@ -394,7 +396,7 @@ async def tts_get_endpoint(
@APP.post("/tts")
async def tts_post_endpoint(request: TTS_Request):
req = request.dict()
req = request.model_dump()
return await tts_handle(req)
@@ -449,7 +451,8 @@ async def set_sovits_weights(weights_path: str = None):
if __name__ == "__main__":
def main():
global port, host, argv
try:
if host == 'None': # passing -a None on the command line lets the API listen dual-stack (IPv4 + IPv6)
host = None
@@ -458,3 +461,6 @@ if __name__ == "__main__":
traceback.print_exc()
os.kill(os.getpid(), signal.SIGTERM)
exit(0)
if __name__ == "__main__":
main()
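Two packaging-related patterns show up in the api_v2.py changes above: the default tts_infer.yaml is now located inside the installed package via importlib.resources, and request objects use Pydantic v2's model_dump() in place of the deprecated dict(). A hedged sketch of both (the request fields are illustrative only, not the real TTS_Request schema):

from importlib.resources import files
from pydantic import BaseModel

# Default tts_infer.yaml located inside the installed package rather than
# a repository-relative path.
config_path = str(files("GPT_SoVITS").joinpath("configs/tts_infer.yaml"))

class ExampleRequest(BaseModel):  # illustrative stand-in; see TTS_Request in api_v2.py
    text: str = ""
    media_type: str = "wav"

req = ExampleRequest(text="hello").model_dump()  # Pydantic v2 replacement for .dict()
print(config_path, req)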

View File

@@ -1,24 +1,24 @@
custom:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: pretrained_models/chinese-hubert-base
device: cuda
is_half: true
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
version: v2
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
default:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: pretrained_models/chinese-hubert-base
device: cpu
is_half: false
t2s_weights_path: GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
t2s_weights_path: pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt
version: v1
vits_weights_path: GPT_SoVITS/pretrained_models/s2G488k.pth
vits_weights_path: pretrained_models/s2G488k.pth
default_v2:
bert_base_path: GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: GPT_SoVITS/pretrained_models/chinese-hubert-base
bert_base_path: pretrained_models/chinese-roberta-wwm-ext-large
cnhuhbert_base_path: pretrained_models/chinese-hubert-base
device: cpu
is_half: false
t2s_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
t2s_weights_path: pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt
version: v2
vits_weights_path: GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth
vits_weights_path: pretrained_models/gsv-v2final-pretrained/s2G2333k.pth

View File

@@ -1,33 +1,33 @@
import os,argparse
import traceback
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from tqdm import tqdm
path_denoise = 'tools/denoise-model/speech_frcrn_ans_cirm_16k'
path_denoise = path_denoise if os.path.exists(path_denoise) else "damo/speech_frcrn_ans_cirm_16k"
ans = pipeline(Tasks.acoustic_noise_suppression,model=path_denoise)
def execute_denoise(input_folder,output_folder):
os.makedirs(output_folder,exist_ok=True)
# print(input_folder)
# print(list(os.listdir(input_folder).sort()))
for name in tqdm(os.listdir(input_folder)):
try:
ans("%s/%s"%(input_folder,name),output_path='%s/%s'%(output_folder,name))
except:
traceback.print_exc()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_folder", type=str, required=True,
help="Path to the folder containing WAV files.")
parser.add_argument("-o", "--output_folder", type=str, required=True,
help="Output folder to store transcriptions.")
parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16','float32'],
help="fp16 or fp32")#还没接入
cmd = parser.parse_args()
execute_denoise(
input_folder = cmd.input_folder,
output_folder = cmd.output_folder,
)

View File

@@ -1,115 +1,115 @@
import platform,os,traceback
import ffmpeg
import numpy as np
import gradio as gr
from tools.i18n.i18n import I18nAuto
import pandas as pd
i18n = I18nAuto(language=os.environ.get('language','Auto'))
def load_audio(file, sr):
try:
# https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
file = clean_path(file) # guard against copied paths carrying stray leading/trailing spaces, quotes, or newlines
if os.path.exists(file) == False:
raise RuntimeError(
"You input a wrong audio path that does not exists, please fix it!"
)
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except Exception as e:
traceback.print_exc()
raise RuntimeError(i18n("音频加载失败"))
return np.frombuffer(out, np.float32).flatten()
def clean_path(path_str:str):
if path_str.endswith(('\\','/')):
return clean_path(path_str[0:-1])
path_str = path_str.replace('/', os.sep).replace('\\', os.sep)
return path_str.strip(" ").strip('\'').strip("\n").strip('"').strip(" ").strip("\u202a")
def check_for_existance(file_list:list=None,is_train=False,is_dataset_processing=False):
files_status=[]
if is_train == True and file_list:
file_list.append(os.path.join(file_list[0],'2-name2text.txt'))
file_list.append(os.path.join(file_list[0],'3-bert'))
file_list.append(os.path.join(file_list[0],'4-cnhubert'))
file_list.append(os.path.join(file_list[0],'5-wav32k'))
file_list.append(os.path.join(file_list[0],'6-name2semantic.tsv'))
for file in file_list:
if os.path.exists(file):files_status.append(True)
else:files_status.append(False)
if sum(files_status)!=len(files_status):
if is_train:
for file,status in zip(file_list,files_status):
if status:pass
else:gr.Warning(file)
gr.Warning(i18n('以下文件或文件夹不存在'))
return False
elif is_dataset_processing:
if files_status[0]:
return True
elif not files_status[0]:
gr.Warning(file_list[0])
elif not files_status[1] and file_list[1]:
gr.Warning(file_list[1])
gr.Warning(i18n('以下文件或文件夹不存在'))
return False
else:
if file_list[0]:
gr.Warning(file_list[0])
gr.Warning(i18n('以下文件或文件夹不存在'))
else:
gr.Warning(i18n('路径不能为空'))
return False
return True
def check_details(path_list=None,is_train=False,is_dataset_processing=False):
if is_dataset_processing:
list_path, audio_path = path_list
if (not list_path.endswith('.list')):
gr.Warning(i18n('请填入正确的List路径'))
return
if audio_path:
if not os.path.isdir(audio_path):
gr.Warning(i18n('请填入正确的音频文件夹路径'))
return
with open(list_path,"r",encoding="utf8")as f:
line=f.readline().strip("\n").split("\n")
wav_name, _, __, ___ = line[0].split("|")
wav_name=clean_path(wav_name)
if (audio_path != "" and audio_path != None):
wav_name = os.path.basename(wav_name)
wav_path = "%s/%s"%(audio_path, wav_name)
else:
wav_path=wav_name
if os.path.exists(wav_path):
...
else:
gr.Warning(i18n('路径错误'))
return
if is_train:
path_list.append(os.path.join(path_list[0],'2-name2text.txt'))
path_list.append(os.path.join(path_list[0],'4-cnhubert'))
path_list.append(os.path.join(path_list[0],'5-wav32k'))
path_list.append(os.path.join(path_list[0],'6-name2semantic.tsv'))
phone_path, hubert_path, wav_path, semantic_path = path_list[1:]
with open(phone_path,'r',encoding='utf-8') as f:
if f.read(1):...
else:gr.Warning(i18n('缺少音素数据集'))
if os.listdir(hubert_path):...
else:gr.Warning(i18n('缺少Hubert数据集'))
if os.listdir(wav_path):...
else:gr.Warning(i18n('缺少音频数据集'))
df = pd.read_csv(
semantic_path, delimiter="\t", encoding="utf-8"
)
if len(df) >= 1:...
else:gr.Warning(i18n('缺少语义数据集'))

View File

@@ -1,48 +1,48 @@
import os,sys,numpy as np
import traceback
from scipy.io import wavfile
# parent_directory = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(parent_directory)
from tools.my_utils import load_audio
from slicer2 import Slicer
def slice(inp,opt_root,threshold,min_length,min_interval,hop_size,max_sil_kept,_max,alpha,i_part,all_part):
os.makedirs(opt_root,exist_ok=True)
if os.path.isfile(inp):
input=[inp]
elif os.path.isdir(inp):
input=[os.path.join(inp, name) for name in sorted(list(os.listdir(inp)))]
else:
return "输入路径存在但既不是文件也不是文件夹"
slicer = Slicer(
sr=32000, # sample rate for the long audio
threshold= int(threshold), # volume below this value is treated as a candidate silence cut point
min_length= int(min_length), # minimum length of each segment; a too-short first segment keeps merging with the following ones until it exceeds this
min_interval= int(min_interval), # minimum slicing interval
hop_size= int(hop_size), # hop size for the volume curve; smaller values give finer resolution at higher cost (finer is not always better)
max_sil_kept= int(max_sil_kept), # maximum silence length kept after slicing
)
_max=float(_max)
alpha=float(alpha)
for inp_path in input[int(i_part)::int(all_part)]:
# print(inp_path)
try:
name = os.path.basename(inp_path)
audio = load_audio(inp_path, 32000)
# print(audio.shape)
for chunk, start, end in slicer.slice(audio): # start and end are frame indices
tmp_max = np.abs(chunk).max()
if(tmp_max>1):chunk/=tmp_max
chunk = (chunk / tmp_max * (_max * alpha)) + (1 - alpha) * chunk
wavfile.write(
"%s/%s_%010d_%010d.wav" % (opt_root, name, start, end),
32000,
# chunk.astype(np.float32),
(chunk * 32767).astype(np.int16),
)
except:
print(inp_path,"->fail->",traceback.format_exc())
return "执行完毕,请检查输出文件"
print(slice(*sys.argv[1:]))

View File

@@ -1,261 +1,261 @@
import numpy as np
# This function is obtained from librosa.
def get_rms(
y,
frame_length=2048,
hop_length=512,
pad_mode="constant",
):
padding = (int(frame_length // 2), int(frame_length // 2))
y = np.pad(y, padding, mode=pad_mode)
axis = -1
# put our new within-frame axis at the end for now
out_strides = y.strides + tuple([y.strides[axis]])
# Reduce the shape on the framing axis
x_shape_trimmed = list(y.shape)
x_shape_trimmed[axis] -= frame_length - 1
out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
if axis < 0:
target_axis = axis - 1
else:
target_axis = axis + 1
xw = np.moveaxis(xw, -1, target_axis)
# Downsample along the target axis
slices = [slice(None)] * xw.ndim
slices[axis] = slice(0, None, hop_length)
x = xw[tuple(slices)]
# Calculate power
power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
return np.sqrt(power)
class Slicer:
def __init__(
self,
sr: int,
threshold: float = -40.0,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000,
):
if not min_length >= min_interval >= hop_size:
raise ValueError(
"The following condition must be satisfied: min_length >= min_interval >= hop_size"
)
if not max_sil_kept >= hop_size:
raise ValueError(
"The following condition must be satisfied: max_sil_kept >= hop_size"
)
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.0)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1:
return waveform[
:, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
]
else:
return waveform[
begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
]
# @timeit
def slice(self, waveform):
if len(waveform.shape) > 1:
samples = waveform.mean(axis=0)
else:
samples = waveform
if samples.shape[0] <= self.min_length:
return [waveform]
rms_list = get_rms(
y=samples, frame_length=self.win_size, hop_length=self.hop_size
).squeeze(0)
sil_tags = []
silence_start = None
clip_start = 0
for i, rms in enumerate(rms_list):
# Keep looping while frame is silent.
if rms < self.threshold:
# Record start of silent frames.
if silence_start is None:
silence_start = i
continue
# Keep looping while frame is not silent and silence start has not been recorded.
if silence_start is None:
continue
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = (
i - silence_start >= self.min_interval
and i - clip_start >= self.min_length
)
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Need slicing. Record the range of silent frames to be removed.
if i - silence_start <= self.max_sil_kept:
pos = rms_list[silence_start : i + 1].argmin() + silence_start
if silence_start == 0:
sil_tags.append((0, pos))
else:
sil_tags.append((pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[
i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
].argmin()
pos += i - self.max_sil_kept
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = (
rms_list[
silence_start : silence_start + self.max_sil_kept + 1
].argmin()
+ silence_start
)
pos_r = (
rms_list[i - self.max_sil_kept : i + 1].argmin()
+ i
- self.max_sil_kept
)
if silence_start == 0:
sil_tags.append((0, pos_r))
else:
sil_tags.append((pos_l, pos_r))
clip_start = pos_r
silence_start = None
# Deal with trailing silence.
total_frames = rms_list.shape[0]
if (
silence_start is not None
and total_frames - silence_start >= self.min_interval
):
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
# Apply and return slices.
#### audio + start time + end time
if len(sil_tags) == 0:
return [[waveform,0,int(total_frames*self.hop_size)]]
else:
chunks = []
if sil_tags[0][0] > 0:
chunks.append([self._apply_slice(waveform, 0, sil_tags[0][0]),0,int(sil_tags[0][0]*self.hop_size)])
for i in range(len(sil_tags) - 1):
chunks.append(
[self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]),int(sil_tags[i][1]*self.hop_size),int(sil_tags[i + 1][0]*self.hop_size)]
)
if sil_tags[-1][1] < total_frames:
chunks.append(
[self._apply_slice(waveform, sil_tags[-1][1], total_frames),int(sil_tags[-1][1]*self.hop_size),int(total_frames*self.hop_size)]
)
return chunks
def main():
import os.path
from argparse import ArgumentParser
import librosa
import soundfile
parser = ArgumentParser()
parser.add_argument("audio", type=str, help="The audio to be sliced")
parser.add_argument(
"--out", type=str, help="Output directory of the sliced audio clips"
)
parser.add_argument(
"--db_thresh",
type=float,
required=False,
default=-40,
help="The dB threshold for silence detection",
)
parser.add_argument(
"--min_length",
type=int,
required=False,
default=5000,
help="The minimum milliseconds required for each sliced audio clip",
)
parser.add_argument(
"--min_interval",
type=int,
required=False,
default=300,
help="The minimum milliseconds for a silence part to be sliced",
)
parser.add_argument(
"--hop_size",
type=int,
required=False,
default=10,
help="Frame length in milliseconds",
)
parser.add_argument(
"--max_sil_kept",
type=int,
required=False,
default=500,
help="The maximum silence length kept around the sliced clip, presented in milliseconds",
)
args = parser.parse_args()
out = args.out
if out is None:
out = os.path.dirname(os.path.abspath(args.audio))
audio, sr = librosa.load(args.audio, sr=None, mono=False)
slicer = Slicer(
sr=sr,
threshold=args.db_thresh,
min_length=args.min_length,
min_interval=args.min_interval,
hop_size=args.hop_size,
max_sil_kept=args.max_sil_kept,
)
chunks = slicer.slice(audio)
if not os.path.exists(out):
os.makedirs(out)
for i, chunk in enumerate(chunks):
if len(chunk.shape) > 1:
chunk = chunk.T
soundfile.write(
os.path.join(
out,
f"%s_%d.wav"
% (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
),
chunk,
sr,
)
if __name__ == "__main__":
main()

View File

@@ -1,54 +1,54 @@
{
"bins": 672,
"unstable_bins": 8,
"reduction_bins": 530,
"band": {
"1": {
"sr": 7350,
"hl": 80,
"n_fft": 640,
"crop_start": 0,
"crop_stop": 85,
"lpf_start": 25,
"lpf_stop": 53,
"res_type": "polyphase"
},
"2": {
"sr": 7350,
"hl": 80,
"n_fft": 320,
"crop_start": 4,
"crop_stop": 87,
"hpf_start": 25,
"hpf_stop": 12,
"lpf_start": 31,
"lpf_stop": 62,
"res_type": "polyphase"
},
"3": {
"sr": 14700,
"hl": 160,
"n_fft": 512,
"crop_start": 17,
"crop_stop": 216,
"hpf_start": 48,
"hpf_stop": 24,
"lpf_start": 139,
"lpf_stop": 210,
"res_type": "polyphase"
},
"4": {
"sr": 44100,
"hl": 480,
"n_fft": 960,
"crop_start": 78,
"crop_stop": 383,
"hpf_start": 130,
"hpf_stop": 86,
"res_type": "kaiser_fast"
}
},
"sr": 44100,
"pre_filter_start": 668,
"pre_filter_stop": 672
}

MANIFEST.in Normal file
View File

@@ -0,0 +1 @@
include GPT_SoVITS/configs/*
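This MANIFEST.in entry (together with setuptools' default include-package-data behaviour under pyproject.toml) is what lets the bundled tts_infer.yaml ship with the distribution, so the importlib.resources lookups above can find it after installation. A quick, hedged way to verify that from an installed environment:

from importlib.resources import files

cfg = files("GPT_SoVITS").joinpath("configs/tts_infer.yaml")
print(cfg, cfg.is_file())  # should resolve inside site-packages and exist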

View File

@@ -1,3 +1,6 @@
# Jarod's NOTE
Working on turning this into a package. Right now, the package can be installed, and the API *does in fact* work for making requests. Proper instructions will be written up sometime in the future.
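Until then, a rough sketch of the intended flow: install with `pip install .`, start the server with the `gpt_sovits_api` console script, then query the /tts endpoint over HTTP. The request fields below are assumptions; the authoritative schema is the TTS_Request model in GPT_SoVITS/api_v2.py.

import requests

payload = {
    "text": "Hello from the packaged API.",              # assumed field name
    "text_lang": "en",                                    # assumed field name
    "ref_audio_path": "reference.wav",                    # assumed field name
    "prompt_text": "Transcript of the reference audio.",  # assumed field name
    "prompt_lang": "en",                                   # assumed field name
}
resp = requests.post("http://127.0.0.1:9880/tts", json=payload)
with open("output.wav", "wb") as f:
    f.write(resp.content)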
<div align="center">

pyproject.toml Normal file
View File

@@ -0,0 +1,63 @@
[build-system]
requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
build-backend = "setuptools.build_meta"
[project]
name = "GPT_SoVITS"
dynamic = ["version"]
description = ""
readme = "README.md"
license = {text = "MIT License"}
classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
]
dependencies = [
"numpy==1.26.4",
"scipy",
"tensorboard",
"librosa==0.9.2",
"numba",
"pytorch-lightning",
"gradio>=4.0,<=4.24.0",
"ffmpeg-python",
"onnxruntime; sys_platform == 'darwin'",
"onnxruntime-gpu; sys_platform != 'darwin'",
"tqdm",
"funasr==1.0.27",
"cn2an",
"pypinyin",
"pyopenjtalk>=0.3.4",
"g2p_en",
"torchaudio",
"modelscope==1.10.0",
"sentencepiece",
"transformers",
"chardet",
"PyYAML",
"psutil",
"jieba_fast",
"jieba",
"LangSegment>=0.2.0",
"Faster_Whisper",
"wordsegment",
"rotary_embedding_torch",
"pyjyutping",
"g2pk2",
"ko_pron",
"opencc; sys_platform != 'linux'",
"opencc==1.1.1; sys_platform == 'linux'",
"python_mecab_ko; sys_platform != 'win32'",
"fastapi<0.112.2",
]
[project.urls]
Homepage = "https://github.com/RVC-Boss/GPT-SoVITS"
[project.scripts]
"gpt_sovits_api" = "GPT_SoVITS.api_v2:main"
[tool.setuptools.packages.find]
include = ["GPT_SoVITS"]

View File

@@ -1,8 +1,8 @@
numpy==1.23.4
numpy==1.26.4
scipy
tensorboard
librosa==0.9.2
numba==0.56.4
numba
pytorch-lightning
gradio>=4.0,<=4.24.0
ffmpeg-python