多进程优化转写效率,提高效率

多进程优化转写效率,提高效率
This commit is contained in:
刘悦 2024-01-26 14:13:22 +08:00 committed by GitHub
parent 813cf96e50
commit 2bbc37e5e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -2,33 +2,60 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.models import Model
import multiprocessing
import sys
import os
import traceback
from threading import Lock

# Default lock used around writes to the output list file.
# NOTE: a threading.Lock only synchronises threads inside one process; every
# worker process ends up with its own independent copy of this object.
lock = Lock()

# Number of worker processes used for transcription.
processes = 2

# Directory holding the audio files to transcribe (first command-line argument).
dir = sys.argv[1]
# Earlier manual implementation, kept for reference:
# opt_name=dir.split("\\")[-1].split("/")[-1]
# normpath drops a trailing separator first; without it basename() returns ""
# for "some/dir/" and the output file would be named ".list".
opt_name = os.path.basename(os.path.normpath(dir))

# FunASR transcription models for three languages, keyed by language code.
lang2model = {
    'zh': 'tools/damo_asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    'ja': "tools/damo_asr/models/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline",
    "en": "tools/damo_asr/models/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline",
}


def _local_or_hub(local_path, hub_id):
    """Return ``local_path`` if it exists on disk, otherwise ``hub_id``."""
    return local_path if os.path.exists(local_path) else hub_id


# Prefer the bundled model directories; fall back to the hub model ids when a
# directory is missing instead of failing on a non-existent path.
path_asr = _local_or_hub(
    lang2model["zh"],
    "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
)
path_vad = _local_or_hub(
    'tools/damo_asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
)
path_punc = _local_or_hub(
    'tools/damo_asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
)

# Only the Chinese model is loaded; the "ja" and "en" entries are not used here.
model = Model.from_pretrained(path_asr)

# Speech-recognition pipeline built from the ASR model plus the VAD and
# punctuation models configured above.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model=model,
    vad_model=path_vad,
    punc_model=path_punc,
)
def process_audio_file(dir, filename, name, opt_name):
    """Transcribe one audio file and append its result line to ``filename``.

    The appended line has the form ``<dir>/<name>|<opt_name>|ZH|<text>``.

    Args:
        dir: Directory that contains the audio file.
        filename: Path of the output list file; opened in append mode.
        name: File name of the audio file inside ``dir``.
        opt_name: Label written as the second ``|``-separated field.

    Any error raised while transcribing or writing is reported by printing
    the traceback; the file is then skipped and nothing is re-raised.
    """
    audio_path = "%s/%s" % (dir, name)
    try:
        text = inference_pipeline(audio_in=audio_path)["text"]
        line = "%s|%s|ZH|%s\n" % (audio_path, opt_name, text.strip())
        # Hold the lock for the whole open/write/close sequence.
        with lock:
            with open(filename, "a", encoding="utf-8") as f:
                f.write(line)
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit still propagate instead of being swallowed per file.
        print(traceback.format_exc())
def _init_worker(shared_lock):
    """Pool initializer: make ``shared_lock`` this worker's module-level ``lock``."""
    global lock
    lock = shared_lock


def run__process():  # main process
    """Transcribe every file in ``dir`` with a process pool.

    Results are appended by the workers to ``output/asr_opt/<opt_name>.list``.
    An existing list file with that name is removed first.
    """
    opt_dir = "output/asr_opt"
    os.makedirs(opt_dir, exist_ok=True)
    filename = "%s/%s.list" % (opt_dir, opt_name)
    # Workers append to the file, so drop any result left by an earlier run.
    if os.path.exists(filename):
        os.remove(filename)

    tasks = [(dir, filename, name, opt_name) for name in os.listdir(dir)]

    # A threading.Lock is not shared between processes. Create one
    # multiprocessing.Lock here and hand it to every worker at start-up so
    # that appends to the list file are serialised across the whole pool.
    shared_lock = multiprocessing.Lock()
    with multiprocessing.Pool(
        processes=processes,
        initializer=_init_worker,
        initargs=(shared_lock,),
    ) as pool:
        pool.starmap(process_audio_file, tasks)
# Run only when executed as a script. With the "spawn" start method each
# worker process imports this module again; the guard keeps those imports
# from starting another pool.
if __name__ == '__main__':
    run__process()