From d70d5a36a2caf4f63870f93495958edb480882bc Mon Sep 17 00:00:00 2001
From: Karl Kroening
Date: Sun, 7 Jan 2018 23:25:33 -0800
Subject: [PATCH] Add transcribe_many and make changes to other example scripts

---
Review notes (this section, between `---` and the first `diff --git`, is
not part of the commit message):

- split_silence.py: no debug `print ffmpeg_cmd` statement (it was Python 2
  only syntax and wrote to stdout for every chunk); the local stream
  variable no longer shadows the `input` builtin.
- transcribe.py: no `import IPython`; nothing in the script uses it and it
  is not listed in requirements.txt.
- transcribe_many.py: `--workers` is parsed as an int and is passed through
  to transcribe_many(); previously the option had no effect.
- NOTE(review): line breaks and indentation were reconstructed from a
  whitespace-collapsed copy of this patch, so context lines are a best
  guess; `git apply --ignore-whitespace` may be needed. The regex group
  names `<start>`/`<end>` in the split_silence.py context lines were
  restored from garbled text and should be confirmed against the target
  file. The `index` blob ids and the commit id on the first line predate
  these revisions.

 examples/requirements.txt   |  5 +++-
 examples/split_silence.py   | 33 ++++++++++++++++++-------
 examples/transcribe.py      | 33 +++++++++++++++++++------
 examples/transcribe_many.py | 49 +++++++++++++++++++++++++++++++++++++
 4 files changed, 102 insertions(+), 18 deletions(-)
 create mode 100755 examples/transcribe_many.py

diff --git a/examples/requirements.txt b/examples/requirements.txt
index 20645e6..ca8ae60 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1 +1,4 @@
-ffmpeg
+ffmpeg-python
+gevent
+google-cloud-speech
+tqdm
diff --git a/examples/split_silence.py b/examples/split_silence.py
index 9a8adbd..786e72d 100755
--- a/examples/split_silence.py
+++ b/examples/split_silence.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__file__)
 logger.setLevel(logging.INFO)
 
 DEFAULT_DURATION = 0.3
-DEFAULT_THRESHOLD = -60
+DEFAULT_THRESHOLD = -30
 
 parser = argparse.ArgumentParser(description='Split media into separate chunks wherever silence occurs')
 parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
@@ -26,6 +26,7 @@ parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float,
 parser.add_argument('--start-time', type=float, help='Start time (seconds)')
 parser.add_argument('--end-time', type=float, help='End time (seconds)')
 parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')
+parser.add_argument('--padding', type=float, default=0., help='Output silence padding (seconds)')
 
 silence_start_re = re.compile(' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
 silence_end_re = re.compile(' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
@@ -110,6 +111,7 @@ def split_audio(
     silence_duration=DEFAULT_DURATION,
     start_time=None,
     end_time=None,
+    padding=0.,
     verbose=False,
 ):
     chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)
@@ -121,16 +123,29 @@ def split_audio(
 
         logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, start_time, end_time,
             time))
-        _logged_popen(
-            (ffmpeg
-                .input(in_filename, ss=start_time, t=time)
-                .output(out_filename)
-                .overwrite_output()
-                .compile()
-            ),
+
+        stream = ffmpeg.input(in_filename, ss=start_time, t=time)
+
+        if padding > 0.:
+            silence = ffmpeg.input('aevalsrc=0:0::duration={}'.format(padding), format='lavfi')
+            stream = ffmpeg.concat(silence, stream, v=0, a=1)
+
+        ffmpeg_cmd = (stream
+            .output(out_filename)
+            .overwrite_output()
+            .compile()
+        )
+
+        p = _logged_popen(
+            ffmpeg_cmd,
             stdout=subprocess.PIPE if not verbose else None,
             stderr=subprocess.PIPE if not verbose else None,
-        ).communicate()
+        )
+        out = p.communicate()
+        if p.returncode != 0:
+            if not verbose:
+                sys.stderr.write(out[1])
+            sys.exit(1)
 
 
 if __name__ == '__main__':
diff --git a/examples/transcribe.py b/examples/transcribe.py
index fb484df..227aaff 100755
--- a/examples/transcribe.py
+++ b/examples/transcribe.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
 from __future__ import unicode_literals
 
 from google.cloud import speech
 from google.cloud.speech import enums
 from google.cloud.speech import types
+from google.protobuf.json_format import MessageToJson
 import argparse
 import ffmpeg
 import logging
@@ -17,7 +18,10 @@ logger.setLevel(logging.INFO)
 
 
 parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
-parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
+parser.add_argument('in_file', help='Input filename (`-` for stdin)')
+parser.add_argument('--out-file', type=argparse.FileType('w'), default='-',
+    help='Output filename (defaults to stdout)')
+parser.add_argument('--json', action='store_true', help='Output raw JSON response')
 
 
 def decode_audio(in_filename, **input_kwargs):
@@ -38,7 +42,7 @@ def decode_audio(in_filename, **input_kwargs):
     return out[0]
 
 
-def get_transcripts(audio_data):
+def transcribe_data(audio_data):
     client = speech.SpeechClient()
     audio = types.RecognitionAudio(content=audio_data)
     config = types.RecognitionConfig(
@@ -46,17 +50,30 @@ def get_transcripts(audio_data):
         sample_rate_hertz=16000,
         language_code='en-US'
     )
-    response = client.recognize(config, audio)
-    return [result.alternatives[0].transcript for result in response.results]
+    return client.recognize(config, audio)
 
 
 def transcribe(in_filename):
     audio_data = decode_audio(in_filename)
-    transcripts = get_transcripts(audio_data)
-    for transcript in transcripts:
-        print(repr(transcript.encode('utf-8')))
+    return transcribe_data(audio_data)
+
+
+def transcribe_to_file(in_filename, out_file=sys.stdout, as_json=False):
+    """Transcribe ``in_filename`` and write the result to ``out_file``.
+
+    Writes the raw recognition response as JSON when ``as_json`` is true,
+    otherwise one line per result holding its first alternative's transcript.
+    """
+    transcription = transcribe(in_filename)
+    if as_json:
+        out_file.write(MessageToJson(transcription).encode('utf-8'))
+    else:
+        transcripts = [result.alternatives[0].transcript for result in transcription.results]
+        for transcript in transcripts:
+            line = transcript + '\n'
+            out_file.write(line.encode('utf-8'))
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
-    transcribe(args.in_filename)
+    transcribe_to_file(args.in_file, args.out_file, as_json=args.json)
diff --git a/examples/transcribe_many.py b/examples/transcribe_many.py
new file mode 100755
index 0000000..4461cbe
--- /dev/null
+++ b/examples/transcribe_many.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+from functools import partial
+from multiprocessing import Pool
+from transcribe import transcribe_to_file
+import argparse
+import os
+import logging
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+logger = logging.getLogger(__file__)
+
+DEFAULT_WORKER_COUNT = 20
+
+parser = argparse.ArgumentParser(description='Transcribe multiple audio files in parallel using Google Speech API')
+parser.add_argument('in_filenames', nargs='+', help='Input filename(s)')
+parser.add_argument('--keep-suffix', action='store_true',
+    help='Don\'t strip filename suffix when generating metadata .json output filename')
+parser.add_argument('--workers', default=DEFAULT_WORKER_COUNT, type=int,
+    help='Number of workers (default {})'.format(DEFAULT_WORKER_COUNT))
+
+
+def transcribe_one(in_filename, keep_suffix=False):
+    """Transcribe one file and write the JSON response to a `.json` file.
+
+    The output name is ``in_filename`` with its extension replaced by
+    ``.json``, or with ``.json`` appended when ``keep_suffix`` is true.
+    """
+    if keep_suffix:
+        base_filename = in_filename
+    else:
+        base_filename = os.path.splitext(in_filename)[0]
+    out_filename = '{}.json'.format(base_filename)
+    logger.info('Starting: {} -> {}'.format(in_filename, out_filename))
+    with open(out_filename, 'w') as out_file:
+        transcribe_to_file(in_filename, out_file, as_json=True)
+    logger.info('Finished: {} -> {}'.format(in_filename, out_filename))
+
+
+def transcribe_many(in_filenames, keep_suffix=False, worker_count=DEFAULT_WORKER_COUNT):
+    """Transcribe ``in_filenames`` in parallel using ``worker_count`` processes."""
+    pool = Pool(processes=worker_count)
+    func = partial(transcribe_one, keep_suffix=keep_suffix)
+    pool.map_async(func, in_filenames).get(99999999)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    transcribe_many(args.in_filenames, args.keep_suffix, worker_count=args.workers)