Add transcribe_many and make changes to other example scripts

This commit is contained in:
Karl Kroening 2018-01-07 23:25:33 -08:00
parent f5f7ee2073
commit d70d5a36a2
4 changed files with 93 additions and 18 deletions

View File

@ -1 +1,4 @@
ffmpeg
ffmpeg-python
gevent
google-cloud-speech
tqdm

View File

@ -16,7 +16,7 @@ logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)
DEFAULT_DURATION = 0.3
DEFAULT_THRESHOLD = -60
DEFAULT_THRESHOLD = -30
parser = argparse.ArgumentParser(description='Split media into separate chunks wherever silence occurs')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
@ -26,6 +26,7 @@ parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float,
parser.add_argument('--start-time', type=float, help='Start time (seconds)')
parser.add_argument('--end-time', type=float, help='End time (seconds)')
parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')
parser.add_argument('--padding', type=float, default=0., help='Output silence padding (seconds)')
silence_start_re = re.compile(' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
silence_end_re = re.compile(' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
@ -110,6 +111,7 @@ def split_audio(
silence_duration=DEFAULT_DURATION,
start_time=None,
end_time=None,
padding=0.,
verbose=False,
):
chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)
@ -121,16 +123,30 @@ def split_audio(
logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, start_time, end_time,
time))
_logged_popen(
(ffmpeg
.input(in_filename, ss=start_time, t=time)
input = ffmpeg.input(in_filename, ss=start_time, t=time)
if padding > 0.:
silence = ffmpeg.input('aevalsrc=0:0::duration={}'.format(padding), format='lavfi')
input = ffmpeg.concat(silence, input, v=0, a=1)
ffmpeg_cmd = (input
.output(out_filename)
.overwrite_output()
.compile()
),
)
print ffmpeg_cmd
p = _logged_popen(
ffmpeg_cmd,
stdout=subprocess.PIPE if not verbose else None,
stderr=subprocess.PIPE if not verbose else None,
).communicate()
)
out = p.communicate()
if p.returncode != 0:
if not verbose:
sys.stderr.write(out[1])
sys.exit(1)
if __name__ == '__main__':

View File

@ -1,9 +1,11 @@
#!/usr/bin/env python
from __future__ import unicode_literals
import IPython
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from google.protobuf.json_format import MessageToJson
import argparse
import ffmpeg
import logging
@ -17,7 +19,10 @@ logger.setLevel(logging.INFO)
parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
parser.add_argument('in_file', help='Input filename (`-` for stdin)')
parser.add_argument('--out-file', type=argparse.FileType('w'), default='-',
help='Output filename (defaults to stdout)')
parser.add_argument('--json', action='store_true', help='Output raw JSON response')
def decode_audio(in_filename, **input_kwargs):
@ -38,7 +43,7 @@ def decode_audio(in_filename, **input_kwargs):
return out[0]
def get_transcripts(audio_data):
def transcribe_data(audio_data):
client = speech.SpeechClient()
audio = types.RecognitionAudio(content=audio_data)
config = types.RecognitionConfig(
@ -46,17 +51,25 @@ def get_transcripts(audio_data):
sample_rate_hertz=16000,
language_code='en-US'
)
response = client.recognize(config, audio)
return [result.alternatives[0].transcript for result in response.results]
return client.recognize(config, audio)
def transcribe(in_filename):
audio_data = decode_audio(in_filename)
transcripts = get_transcripts(audio_data)
return transcribe_data(audio_data)
def transcribe_to_file(in_filename, out_file=sys.stdout, as_json=False):
transcription = transcribe(in_filename)
if as_json:
out_file.write(MessageToJson(transcription).encode('utf-8'))
else:
transcripts = [result.alternatives[0].transcript for result in transcription.results]
for transcript in transcripts:
print(repr(transcript.encode('utf-8')))
line = transcript + '\n'
out_file.write(line.encode('utf-8'))
if __name__ == '__main__':
args = parser.parse_args()
transcribe(args.in_filename)
transcribe_to_file(args.in_file, args.out_file, as_json=args.json)

43
examples/transcribe_many.py Executable file
View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
from functools import partial
from multiprocessing import Pool
from transcribe import transcribe_to_file
import argparse
import os
import logging
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__file__)
# Default degree of parallelism for the worker pool.
DEFAULT_WORKER_COUNT = 20

# Command-line interface: one or more input files, plus options controlling
# output naming and parallelism.
parser = argparse.ArgumentParser(description='Transcribe multiple audio files in parallel using Google Speech API')
parser.add_argument('in_filenames', nargs='+', help='Input filename(s)')
parser.add_argument('--keep-suffix', action='store_true',
    help='Don\'t strip filename suffix when generating metadata .json output filename')
# ``type=int`` is required here: without it argparse passes the raw string
# through and ``multiprocessing.Pool(processes=...)`` rejects it.
parser.add_argument('--workers', default=DEFAULT_WORKER_COUNT, type=int,
    help='Number of workers (default {})'.format(DEFAULT_WORKER_COUNT))
def transcribe_one(in_filename, keep_suffix=False):
    """Transcribe a single audio file, writing JSON to ``<base>.json``.

    ``base`` is the input filename with its extension stripped, unless
    ``keep_suffix`` is set, in which case the full input filename is used.
    """
    root, _ext = os.path.splitext(in_filename)
    base_filename = in_filename if keep_suffix else root
    out_filename = '{}.json'.format(base_filename)
    logger.info('Starting: {} -> {}'.format(in_filename, out_filename))
    with open(out_filename, 'w') as out_file:
        transcribe_to_file(in_filename, out_file, as_json=True)
    logger.info('Finished: {} -> {}'.format(in_filename, out_filename))
def transcribe_many(in_filenames, keep_suffix=False, worker_count=DEFAULT_WORKER_COUNT):
    """Transcribe ``in_filenames`` in parallel using ``worker_count`` processes.

    :param in_filenames: list of input audio filenames.
    :param keep_suffix: forwarded to :func:`transcribe_one`.
    :param worker_count: number of worker processes in the pool.
    """
    pool = Pool(processes=worker_count)
    try:
        func = partial(transcribe_one, keep_suffix=keep_suffix)
        # ``.get`` with a huge timeout (rather than a plain blocking ``.map``)
        # keeps the parent responsive to KeyboardInterrupt while waiting.
        pool.map_async(func, in_filenames).get(99999999)
    finally:
        # Previously the pool was never shut down; always release the worker
        # processes, even when a task raises.
        pool.close()
        pool.join()
if __name__ == '__main__':
    args = parser.parse_args()
    # Forward the parsed ``--workers`` value; previously it was accepted on
    # the command line but silently ignored.
    transcribe_many(args.in_filenames, args.keep_suffix, args.workers)