From f5f7ee20730f2beca5db80933f11d880e1f22a0c Mon Sep 17 00:00:00 2001
From: Karl Kroening
Date: Sun, 7 Jan 2018 04:43:05 -0800
Subject: [PATCH] Improve logging in split_silence; add transcribe example

---
Review notes (ignored by `git am`):
 * split_silence.py: the verbose log format used `%(levels)`, which is not a
   valid LogRecord attribute; it is now `%(levelname)s`.
 * split_silence.py: the regex context lines carry their named groups again
   (`(?P<start>...)` etc.) so the hunks match the target file.
 * transcribe.py: ffmpeg's stderr is decoded before being written to
   sys.stderr, matching what split_silence.py already does.

 examples/split_silence.py | 38 ++++++++++++++++--------
 examples/transcribe.py    | 62 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 12 deletions(-)
 create mode 100755 examples/transcribe.py

diff --git a/examples/split_silence.py b/examples/split_silence.py
index 297083a..9a8adbd 100755
--- a/examples/split_silence.py
+++ b/examples/split_silence.py
@@ -10,6 +10,7 @@ import re
 import subprocess
 import sys

+
 logging.basicConfig(level=logging.INFO, format='%(message)s')
 logger = logging.getLogger(__file__)
 logger.setLevel(logging.INFO)
@@ -24,7 +25,7 @@ parser.add_argument('--silence-threshold', default=DEFAULT_THRESHOLD, type=int,
 parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float, help='Silence duration')
 parser.add_argument('--start-time', type=float, help='Start time (seconds)')
 parser.add_argument('--end-time', type=float, help='End time (seconds)')
-
+parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')

 silence_start_re = re.compile(' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
 silence_end_re = re.compile(' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
@@ -32,6 +33,11 @@ total_duration_re = re.compile(
     'size=[^ ]+ time=(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9\.]{5}) bitrate=')


+def _logged_popen(cmd_line, *args, **kwargs):
+    logger.debug('Running command: {}'.format(subprocess.list2cmdline(cmd_line)))
+    return subprocess.Popen(cmd_line, *args, **kwargs)
+
+
 def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time=None, end_time=None):
     input_kwargs = {}
     if start_time is not None:
@@ -41,17 +47,20 @@ def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time
     if end_time is not None:
         input_kwargs['t'] = end_time - start_time

-    args = (ffmpeg
-        .input(in_filename, **input_kwargs)
-        .filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
-        .output('-', format='null')
-        .get_args()
+    p = _logged_popen(
+        (ffmpeg
+            .input(in_filename, **input_kwargs)
+            .filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
+            .output('-', format='null')
+            .compile()
+        ) + ['-nostats'],  # FIXME: use .nostats() once it's implemented in ffmpeg-python.
+        stderr=subprocess.PIPE
     )
-    p = subprocess.Popen(['ffmpeg'] + args, stderr=subprocess.PIPE)
     output = p.communicate()[1].decode('utf-8')
     if p.returncode != 0:
         sys.stderr.write(output)
         sys.exit(1)
+    logger.debug(output)
     lines = output.splitlines()

     # Chunks start when silence ends, and chunks end when silence starts.
@@ -93,6 +102,7 @@ def _makedirs(path):
         if exc.errno != errno.EEXIST or not os.path.isdir(path):
             raise

+
 def split_audio(
     in_filename,
     out_pattern,
@@ -100,6 +110,7 @@ def split_audio(
     silence_duration=DEFAULT_DURATION,
     start_time=None,
     end_time=None,
+    verbose=False,
 ):
     chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)

@@ -110,18 +121,21 @@ def split_audio(

         logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, start_time, end_time,
             time))
-        subprocess.Popen(
+        _logged_popen(
             (ffmpeg
                 .input(in_filename, ss=start_time, t=time)
                 .output(out_filename)
                 .overwrite_output()
                 .compile()
             ),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=subprocess.PIPE if not verbose else None,
+            stderr=subprocess.PIPE if not verbose else None,
         ).communicate()


 if __name__ == '__main__':
-    args = parser.parse_args()
-    split_audio(**vars(args))
+    kwargs = vars(parser.parse_args())
+    if kwargs['verbose']:
+        logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')
+        logger.setLevel(logging.DEBUG)
+    split_audio(**kwargs)
diff --git a/examples/transcribe.py b/examples/transcribe.py
new file mode 100755
index 0000000..fb484df
--- /dev/null
+++ b/examples/transcribe.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+from google.cloud import speech
+from google.cloud.speech import enums
+from google.cloud.speech import types
+import argparse
+import ffmpeg
+import logging
+import subprocess
+import sys
+
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+logger = logging.getLogger(__file__)
+logger.setLevel(logging.INFO)
+
+
+parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
+parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
+
+
+def decode_audio(in_filename, **input_kwargs):
+    p = subprocess.Popen(
+        (ffmpeg
+            .input(in_filename, **input_kwargs)
+            .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
+            .overwrite_output()
+            .compile()
+        ),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    out = p.communicate()
+    if p.returncode != 0:
+        sys.stderr.write(out[1].decode('utf-8'))
+        sys.exit(1)
+    return out[0]
+
+
+def get_transcripts(audio_data):
+    client = speech.SpeechClient()
+    audio = types.RecognitionAudio(content=audio_data)
+    config = types.RecognitionConfig(
+        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+        sample_rate_hertz=16000,
+        language_code='en-US'
+    )
+    response = client.recognize(config, audio)
+    return [result.alternatives[0].transcript for result in response.results]
+
+
+def transcribe(in_filename):
+    audio_data = decode_audio(in_filename)
+    transcripts = get_transcripts(audio_data)
+    for transcript in transcripts:
+        print(repr(transcript.encode('utf-8')))
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    transcribe(args.in_filename)