mirror of
https://github.com/RVC-Boss/GPT-SoVITS.git
synced 2025-10-07 23:48:48 +08:00
.
This commit is contained in:
parent
8d0cf0f8a9
commit
18af05b5f4
@ -1,31 +0,0 @@
|
|||||||
import os
|
|
||||||
|
|
||||||
def check_fw_local_models():
    """Return the Faster Whisper model sizes, tagging the locally stored ones.

    A size whose path ``tools/asr/models/faster-whisper-<size>`` exists
    (relative to the current working directory) is reported with a
    ``-local`` suffix; every other size is returned unchanged. A fresh
    list is built on each call.
    """
    known_sizes = (
        "tiny", "tiny.en",
        "base", "base.en",
        "small", "small.en",
        "medium", "medium.en",
        "large", "large-v1",
        "large-v2", "large-v3",
    )
    return [
        name + '-local'
        if os.path.exists(f'tools/asr/models/faster-whisper-{name}')
        else name
        for name in known_sizes
    ]
|
|
||||||
|
|
||||||
# Registry of the available ASR back ends, keyed by display name. Each entry
# lists the selectable languages ('lang'), the selectable model sizes
# ('size') and the script file name ('path') that implements the back end.
asr_dict = {
    "达摩 ASR (中文)": dict(
        lang=['zh'],
        size=['large'],
        path='funasr_asr.py',
    ),
    "Faster Whisper (多语种)": dict(
        lang=['auto', 'zh', 'en', 'ja'],
        size=check_fw_local_models(),
        path='fasterwhisper_asr.py',
    ),
}
|
|
||||||
|
|
@ -1,114 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import os
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
# Set before torch / faster_whisper are imported below: HF_ENDPOINT points
# Hugging Face traffic at hf-mirror.com, and KMP_DUPLICATE_LIB_OK is the
# OpenMP switch for tolerating a duplicated runtime library.
os.environ.update({
    "HF_ENDPOINT": "https://hf-mirror.com",
    "KMP_DUPLICATE_LIB_OK": "TRUE",
})
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from faster_whisper import WhisperModel
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from tools.asr.config import check_fw_local_models
|
|
||||||
|
|
||||||
# Language codes accepted by the --language option, plus 'auto', which
# execute_asr turns into "let the model pick the language".
language_code_list = (
    "af am ar as az "
    "ba be bg bn bo "
    "br bs ca cs cy "
    "da de el en es "
    "et eu fa fi fo "
    "fr gl gu ha haw "
    "he hi hr ht hu "
    "hy id is it ja "
    "jw ka kk km kn "
    "ko la lb ln lo "
    "lt lv mg mi mk "
    "ml mn mr ms mt "
    "my ne nl nn no "
    "oc pa pl ps pt "
    "ro ru sa sd si "
    "sk sl sn so sq "
    "sr su sv sw ta "
    "te tg th tk tl "
    "tr tt uk ur uz "
    "vi yi yo zh yue "
    "auto"
).split()
|
|
||||||
|
|
||||||
def execute_asr(input_folder, output_folder, model_size, language, precision):
    """Transcribe every entry of ``input_folder`` with Faster Whisper.

    Args:
        input_folder: Directory whose entries are transcribed in sorted order.
        output_folder: Directory receiving the ``.list`` file; a falsy value
            selects ``output/asr_opt``.
        model_size: Faster Whisper model name. A trailing ``-local`` selects
            the copy stored under ``tools/asr/models/``.
        language: Language code handed to the model, or ``'auto'`` to leave
            the choice to the model.
        precision: Value passed to ``WhisperModel`` as ``compute_type``.

    Returns:
        Absolute path of the written ``.list`` file, or ``None`` when the
        model could not be loaded (the traceback is printed).
    """
    local_suffix = '-local'
    if model_size.endswith(local_suffix):
        # Strip exactly the marker that check_fw_local_models() appended.
        model_size = model_size[:-len(local_suffix)]
        model_path = f'tools/asr/models/faster-whisper-{model_size}'
    else:
        model_path = model_size
    if language == 'auto':
        # No language set: the model outputs its most probable language.
        language = None
    print("loading faster whisper model:",model_size,model_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model = WhisperModel(model_path, device=device, compute_type=precision)
    except Exception:
        print(traceback.format_exc())
        return None

    # normpath removes a trailing separator; without it basename() would be
    # empty and the result would be written to a file named just ".list".
    output_file_name = os.path.basename(os.path.normpath(input_folder))
    output = []

    for file_name in tqdm(sorted(os.listdir(input_folder))):
        try:
            file_path = os.path.join(input_folder, file_name)
            segments, info = model.transcribe(
                audio=file_path,
                beam_size=5,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=700),
                language=language,
            )
            text = ''

            if info.language == "zh":
                print("检测为中文文本, 转 FunASR 处理")
                # Deferred import: runs without Chinese audio never need to
                # import (and download) the FunASR model.
                from tools.asr.funasr_asr import only_asr
                text = only_asr(file_path)

            if text == '':
                # Not Chinese, or FunASR returned nothing: use Whisper's text.
                text = ''.join(segment.text for segment in segments)
            output.append(f"{file_path}|{output_file_name}|{info.language.upper()}|{text}")
        except Exception:
            # Best effort: one failing file must not abort the whole batch.
            print(traceback.format_exc())

    output_folder = output_folder or "output/asr_opt"
    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')

    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Command-line entry point: declare the options as a table, parse them
    # and forward the values to execute_asr.
    option_specs = [
        (("-i", "--input_folder"),
         dict(type=str, required=True,
              help="Path to the folder containing WAV files.")),
        (("-o", "--output_folder"),
         dict(type=str, required=True,
              help="Output folder to store transcriptions.")),
        (("-s", "--model_size"),
         dict(type=str, default='large-v3',
              choices=check_fw_local_models(),
              help="Model Size of Faster Whisper")),
        (("-l", "--language"),
         dict(type=str, default='ja',
              choices=language_code_list,
              help="Language of the audio files.")),
        (("-p", "--precision"),
         dict(type=str, default='float16', choices=['float16','float32'],
              help="fp16 or fp32")),
    ]
    parser = argparse.ArgumentParser()
    for flags, spec in option_specs:
        parser.add_argument(*flags, **spec)

    cmd = parser.parse_args()
    output_file_path = execute_asr(
        cmd.input_folder,
        cmd.output_folder,
        cmd.model_size,
        cmd.language,
        cmd.precision,
    )
|
|
@ -1,77 +0,0 @@
|
|||||||
# -*- coding:utf-8 -*-
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import traceback
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from funasr import AutoModel
|
|
||||||
|
|
||||||
# For each FunASR component use the copy under tools/asr/models/ when that
# path exists, otherwise fall back to the corresponding "iic/..." model id.
path_asr, path_vad, path_punc = (
    local if os.path.exists(local) else fallback
    for local, fallback in (
        ('tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
         "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"),
        ('tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch',
         "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"),
        ('tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
         "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"),
    )
)

# Built once at import time and shared by only_asr() and execute_asr().
model = AutoModel(
    model=path_asr, model_revision="v2.0.4",
    vad_model=path_vad, vad_model_revision="v2.0.4",
    punc_model=path_punc, punc_model_revision="v2.0.4",
)
|
|
||||||
|
|
||||||
def only_asr(input_file):
    """Transcribe one audio file with the module-level FunASR ``model``.

    Args:
        input_file: Path of the audio file passed to ``model.generate``.

    Returns:
        The ``"text"`` field of the first result, or ``''`` when recognition
        raised (the traceback is printed instead of propagating).
    """
    try:
        return model.generate(input=input_file)[0]["text"]
    except Exception:
        # Best effort: report the failure and signal "no text" to the caller.
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        print(traceback.format_exc())
        return ''
|
|
||||||
|
|
||||||
def execute_asr(input_folder, output_folder, model_size, language):
    """Transcribe every entry of ``input_folder`` with the FunASR ``model``.

    Args:
        input_folder: Directory whose entries are transcribed in sorted order.
        output_folder: Directory receiving the ``.list`` file; a falsy value
            selects ``output/asr_opt``.
        model_size: Accepted but not used by this function.
        language: Language code; written upper-cased into every output line.

    Returns:
        Absolute path of the written ``.list`` file. Each line has the form
        ``<file path>|<input folder name>|<LANGUAGE>|<text>``.
    """
    # normpath removes a trailing separator; without it basename() would be
    # empty and the result would be written to a file named just ".list".
    output_file_name = os.path.basename(os.path.normpath(input_folder))
    output = []

    for file_name in tqdm(sorted(os.listdir(input_folder))):
        try:
            file_path = os.path.join(input_folder, file_name)
            text = model.generate(input=file_path)[0]["text"]
            output.append(f"{file_path}|{output_file_name}|{language.upper()}|{text}")
        except Exception:
            # Best effort: one failing file must not abort the whole batch.
            print(traceback.format_exc())

    output_folder = output_folder or "output/asr_opt"
    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')

    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Command-line entry point: declare the options as a table, parse them
    # and forward the values to execute_asr.
    option_specs = [
        (("-i", "--input_folder"),
         dict(type=str, required=True,
              help="Path to the folder containing WAV files.")),
        (("-o", "--output_folder"),
         dict(type=str, required=True,
              help="Output folder to store transcriptions.")),
        (("-s", "--model_size"),
         dict(type=str, default='large',
              help="Model Size of FunASR is Large")),
        (("-l", "--language"),
         dict(type=str, default='zh', choices=['zh'],
              help="Language of the audio files.")),
        # Parsed for CLI compatibility only; not wired into execute_asr yet.
        (("-p", "--precision"),
         dict(type=str, default='float16', choices=['float16','float32'],
              help="fp16 or fp32")),
    ]
    parser = argparse.ArgumentParser()
    for flags, spec in option_specs:
        parser.add_argument(*flags, **spec)

    cmd = parser.parse_args()
    execute_asr(
        cmd.input_folder,
        cmd.output_folder,
        cmd.model_size,
        cmd.language,
    )
|
|
71
tools/asr/sensevoice.py
Normal file
71
tools/asr/sensevoice.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import traceback
|
||||||
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
from funasr import AutoModel
|
||||||
|
|
||||||
|
# Use the SenseVoiceSmall copy under tools/asr/models/ when it exists;
# otherwise fall back to the "iic/SenseVoiceSmall" model id.
model_dir = "tools/asr/models/SenseVoiceSmall"
if not os.path.exists(model_dir):
    model_dir = "iic/SenseVoiceSmall"
|
||||||
|
|
||||||
|
def execute_asr(input_folder, output_folder, language, device):
    """Transcribe every entry of ``input_folder`` with SenseVoice (FunASR).

    Args:
        input_folder: Directory whose entries are transcribed in sorted order.
        output_folder: Directory receiving the ``.list`` file; a falsy value
            selects ``output/asr_opt``.
        language: Language code, matched case-insensitively. ``'auto'`` takes
            the language from the first ``<...>`` tag of the model output.
            (The original inline note lists "zh", "en", "yue", "ja", "ko",
            "nospeech" as model-side values.)
        device: ``'cpu'`` / ``'cuda'``, or ``None`` to use CUDA when
            ``torch.cuda.is_available()`` and CPU otherwise.

    Returns:
        Absolute path of the written ``.list`` file, or ``None`` when the
        model could not be created (the traceback is printed). Each line has
        the form ``<file path>|<input folder name>|<LANGUAGE>|<text>``.
    """
    try:
        model = AutoModel(
            model=model_dir,
            vad_model="fsmn-vad",
            vad_kwargs={"max_single_segment_time": 30000},
        )
    except Exception:
        print(traceback.format_exc())
        return None

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): device is only passed to generate(), never to
    # AutoModel(); confirm that this is what FunASR expects.

    # Normalise once so the 'auto' check and the value sent to the model
    # always agree, whatever the caller's casing.
    language = language.lower()
    # normpath removes a trailing separator; without it basename() would be
    # empty and the result would be written to a file named just ".list".
    output_file_name = os.path.basename(os.path.normpath(input_folder))
    output = []

    for file_name in tqdm(sorted(os.listdir(input_folder))):
        try:
            file_path = os.path.join(input_folder, file_name)
            res = model.generate(
                input=file_path,
                cache={},
                language=language,
                use_itn=True,
                batch_size_s=0,
                device=device,
            )[0]['text']

            if language == 'auto':
                tag = re.search(r'<([^<>]+)>', res)
                if tag is None:
                    raise ValueError(f"no language tag in ASR output for {file_path}")
                # The tag content is wrapped in one extra character on each
                # side (e.g. "|zh|"); drop both to keep the bare code.
                text_language = tag.group(1)[1:-1].upper()
            else:
                text_language = language.upper()

            text = re.sub(r'<[^<>]*>', '', res)
            if text_language == 'EN':
                # English needs its word spacing; only trim the ends.
                text = text.strip()
            else:
                text = text.replace(' ', '')
            output.append(f"{file_path}|{output_file_name}|{text_language}|{text}")
        except Exception:
            # Best effort: one failing file must not abort the whole batch.
            print(traceback.format_exc())

    output_folder = output_folder or "output/asr_opt"
    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.abspath(f'{output_folder}/{output_file_name}.list')

    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output))
        print(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
    return output_file_path
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: declare the options as a table, parse them
    # and forward the values to execute_asr.
    option_specs = [
        (("-i", "--input_folder"),
         dict(type=str, required=True,
              help="Path to the folder containing WAV files.")),
        (("-o", "--output_folder"),
         dict(type=str, required=True,
              help="Output folder to store transcriptions.")),
        (("-l", "--language"),
         dict(type=str, default='auto',
              choices=['auto','zh','en','ja'],
              help="Language of the audio files.")),
        (("-d", "--device"),
         dict(type=str, default=None, choices=['cpu','cuda'],
              help="CPU or CUDA")),
    ]
    parser = argparse.ArgumentParser()
    for flags, spec in option_specs:
        parser.add_argument(*flags, **spec)

    cmd = parser.parse_args()
    output_file_path = execute_asr(
        cmd.input_folder,
        cmd.output_folder,
        cmd.language,
        cmd.device,
    )
|
26
webui.py
26
webui.py
@ -194,17 +194,15 @@ def change_tts_inference(if_tts,bert_path,cnhubert_base_path,gpu_number,gpt_path
|
|||||||
p_tts_inference=None
|
p_tts_inference=None
|
||||||
yield i18n("TTS推理进程已关闭")
|
yield i18n("TTS推理进程已关闭")
|
||||||
|
|
||||||
from tools.asr.config import asr_dict
|
|
||||||
def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
|
def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
|
||||||
global p_asr
|
global p_asr
|
||||||
if(p_asr==None):
|
if(p_asr==None):
|
||||||
asr_inp_dir=my_utils.clean_path(asr_inp_dir)
|
asr_inp_dir=my_utils.clean_path(asr_inp_dir)
|
||||||
cmd = f'"{python_exec}" tools/asr/{asr_dict[asr_model]["path"]}'
|
cmd = f'"{python_exec}" tools/asr/sensevoice.py'
|
||||||
cmd += f' -i "{asr_inp_dir}"'
|
cmd += f' -i "{asr_inp_dir}"'
|
||||||
cmd += f' -o "{asr_opt_dir}"'
|
cmd += f' -o "{asr_opt_dir}"'
|
||||||
cmd += f' -s {asr_model_size}'
|
|
||||||
cmd += f' -l {asr_lang}'
|
cmd += f' -l {asr_lang}'
|
||||||
cmd += " -p %s"%("float16"if is_half==True else "float32")
|
|
||||||
|
|
||||||
yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
yield "ASR任务开启:%s"%cmd,{"__type__":"update","visible":False},{"__type__":"update","visible":True}
|
||||||
print(cmd)
|
print(cmd)
|
||||||
@ -732,33 +730,25 @@ with gr.Blocks(title="GPT-SoVITS WebUI") as app:
|
|||||||
with gr.Row():
|
with gr.Row():
|
||||||
asr_model = gr.Dropdown(
|
asr_model = gr.Dropdown(
|
||||||
label = i18n("ASR 模型"),
|
label = i18n("ASR 模型"),
|
||||||
choices = list(asr_dict.keys()),
|
choices = ['SenseVoice'],
|
||||||
interactive = True,
|
interactive = True,
|
||||||
value="达摩 ASR (中文)"
|
value="SenseVoice"  # default must be one of the dropdown's choices
|
||||||
)
|
)
|
||||||
asr_size = gr.Dropdown(
|
asr_size = gr.Dropdown(
|
||||||
label = i18n("ASR 模型尺寸"),
|
label = i18n("ASR 模型尺寸"),
|
||||||
choices = ["large"],
|
choices = ["small"],
|
||||||
interactive = True,
|
interactive = True,
|
||||||
value="large"
|
value="small"
|
||||||
)
|
)
|
||||||
asr_lang = gr.Dropdown(
|
asr_lang = gr.Dropdown(
|
||||||
label = i18n("ASR 语言设置"),
|
label = i18n("ASR 语言设置"),
|
||||||
choices = ["zh"],
|
choices = ["auto","zh","en","ja"],
|
||||||
interactive = True,
|
interactive = True,
|
||||||
value="zh"
|
value="auto"
|
||||||
)
|
)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
|
asr_info = gr.Textbox(label=i18n("ASR进程输出信息"))
|
||||||
|
|
||||||
def change_lang_choices(key): #根据选择的模型修改可选的语言
|
|
||||||
# return gr.Dropdown(choices=asr_dict[key]['lang'])
|
|
||||||
return {"__type__": "update", "choices": asr_dict[key]['lang'],"value":asr_dict[key]['lang'][0]}
|
|
||||||
def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸
|
|
||||||
# return gr.Dropdown(choices=asr_dict[key]['size'])
|
|
||||||
return {"__type__": "update", "choices": asr_dict[key]['size']}
|
|
||||||
asr_model.change(change_lang_choices, [asr_model], [asr_lang])
|
|
||||||
asr_model.change(change_size_choices, [asr_model], [asr_size])
|
|
||||||
|
|
||||||
gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
|
gr.Markdown(value=i18n("0d-语音文本校对标注工具"))
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user