mirror of
https://github.com/kkroening/ffmpeg-python.git
synced 2025-04-06 04:15:44 +08:00
Add transcribe_many and make changes to other example scripts
This commit is contained in:
parent
f5f7ee2073
commit
d70d5a36a2
@ -1 +1,4 @@
|
||||
ffmpeg
|
||||
ffmpeg-python
|
||||
gevent
|
||||
google-cloud-speech
|
||||
tqdm
|
||||
|
@ -16,7 +16,7 @@ logger = logging.getLogger(__file__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
DEFAULT_DURATION = 0.3
|
||||
DEFAULT_THRESHOLD = -60
|
||||
DEFAULT_THRESHOLD = -30
|
||||
|
||||
parser = argparse.ArgumentParser(description='Split media into separate chunks wherever silence occurs')
|
||||
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
|
||||
@ -26,6 +26,7 @@ parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float,
|
||||
parser.add_argument('--start-time', type=float, help='Start time (seconds)')
|
||||
parser.add_argument('--end-time', type=float, help='End time (seconds)')
|
||||
parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')
|
||||
parser.add_argument('--padding', type=float, default=0., help='Output silence padding (seconds)')
|
||||
|
||||
# Patterns for the `silence_start:` / `silence_end:` markers; the numeric value
# is captured in the named group.  Raw strings keep `\.` a regex escape instead
# of an (invalid) Python string escape.
silence_start_re = re.compile(r' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
silence_end_re = re.compile(r' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
|
||||
@ -110,6 +111,7 @@ def split_audio(
|
||||
silence_duration=DEFAULT_DURATION,
|
||||
start_time=None,
|
||||
end_time=None,
|
||||
padding=0.,
|
||||
verbose=False,
|
||||
):
|
||||
chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)
|
||||
@ -121,16 +123,30 @@ def split_audio(
|
||||
|
||||
logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, start_time, end_time,
|
||||
time))
|
||||
_logged_popen(
|
||||
(ffmpeg
|
||||
.input(in_filename, ss=start_time, t=time)
|
||||
|
||||
input = ffmpeg.input(in_filename, ss=start_time, t=time)
|
||||
|
||||
if padding > 0.:
|
||||
silence = ffmpeg.input('aevalsrc=0:0::duration={}'.format(padding), format='lavfi')
|
||||
input = ffmpeg.concat(silence, input, v=0, a=1)
|
||||
|
||||
ffmpeg_cmd = (input
|
||||
.output(out_filename)
|
||||
.overwrite_output()
|
||||
.compile()
|
||||
),
|
||||
)
|
||||
print ffmpeg_cmd
|
||||
|
||||
p = _logged_popen(
|
||||
ffmpeg_cmd,
|
||||
stdout=subprocess.PIPE if not verbose else None,
|
||||
stderr=subprocess.PIPE if not verbose else None,
|
||||
).communicate()
|
||||
)
|
||||
out = p.communicate()
|
||||
if p.returncode != 0:
|
||||
if not verbose:
|
||||
sys.stderr.write(out[1])
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -1,9 +1,11 @@
|
||||
#!/usr/bin/env python
|
||||
from __future__ import unicode_literals
|
||||
import IPython
|
||||
|
||||
from google.cloud import speech
|
||||
from google.cloud.speech import enums
|
||||
from google.cloud.speech import types
|
||||
from google.protobuf.json_format import MessageToJson
|
||||
import argparse
|
||||
import ffmpeg
|
||||
import logging
|
||||
@ -17,7 +19,10 @@ logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
|
||||
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
|
||||
parser.add_argument('in_file', help='Input filename (`-` for stdin)')
|
||||
parser.add_argument('--out-file', type=argparse.FileType('w'), default='-',
|
||||
help='Output filename (defaults to stdout)')
|
||||
parser.add_argument('--json', action='store_true', help='Output raw JSON response')
|
||||
|
||||
|
||||
def decode_audio(in_filename, **input_kwargs):
|
||||
@ -38,7 +43,7 @@ def decode_audio(in_filename, **input_kwargs):
|
||||
return out[0]
|
||||
|
||||
|
||||
def get_transcripts(audio_data):
|
||||
def transcribe_data(audio_data):
|
||||
client = speech.SpeechClient()
|
||||
audio = types.RecognitionAudio(content=audio_data)
|
||||
config = types.RecognitionConfig(
|
||||
@ -46,17 +51,25 @@ def get_transcripts(audio_data):
|
||||
sample_rate_hertz=16000,
|
||||
language_code='en-US'
|
||||
)
|
||||
response = client.recognize(config, audio)
|
||||
return [result.alternatives[0].transcript for result in response.results]
|
||||
return client.recognize(config, audio)
|
||||
|
||||
|
||||
def transcribe(in_filename):
|
||||
audio_data = decode_audio(in_filename)
|
||||
transcripts = get_transcripts(audio_data)
|
||||
return transcribe_data(audio_data)
|
||||
|
||||
|
||||
def transcribe_to_file(in_filename, out_file=sys.stdout, as_json=False):
    """Transcribe `in_filename` and write the result to `out_file`.

    Args:
        in_filename: Input filename, passed straight through to `transcribe`.
        out_file: Writable file object that receives the output (defaults to
            stdout).
        as_json: If true, write the whole recognition response serialized
            with `MessageToJson`; otherwise write the first alternative's
            transcript of every result, one per line.

    All output is UTF-8 encoded before being written.
    """
    transcription = transcribe(in_filename)
    if as_json:
        out_file.write(MessageToJson(transcription).encode('utf-8'))
    else:
        transcripts = [result.alternatives[0].transcript for result in transcription.results]
        for transcript in transcripts:
            # Write only to `out_file`; an extra debug print here would
            # duplicate every transcript on stdout.
            line = transcript + '\n'
            out_file.write(line.encode('utf-8'))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parser.parse_args()
|
||||
transcribe(args.in_filename)
|
||||
transcribe_to_file(args.in_file, args.out_file, as_json=args.json)
|
||||
|
43
examples/transcribe_many.py
Executable file
43
examples/transcribe_many.py
Executable file
@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python
|
||||
from functools import partial
|
||||
from multiprocessing import Pool
|
||||
from transcribe import transcribe_to_file
|
||||
import argparse
|
||||
import os
|
||||
import logging
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__file__)

# Size of the worker process pool when `--workers` is not given.
DEFAULT_WORKER_COUNT = 20

parser = argparse.ArgumentParser(description='Transcribe multiple audio files in parallel using Google Speech API')
parser.add_argument('in_filenames', nargs='+', help='Input filename(s)')
parser.add_argument('--keep-suffix', action='store_true',
                    help='Don\'t strip filename suffix when generating metadata .json output filename')
# type=int: without it a value supplied on the command line stays a string,
# which is not a valid process count.
parser.add_argument('--workers', type=int, default=DEFAULT_WORKER_COUNT,
                    help='Number of workers (default {})'.format(DEFAULT_WORKER_COUNT))
|
||||
|
||||
|
||||
def transcribe_one(in_filename, keep_suffix=False):
    """Transcribe a single file into a sibling `.json` file.

    The output name is `in_filename` with `.json` appended; unless
    `keep_suffix` is true, the input's extension is stripped first.
    Start and finish are logged at INFO level.
    """
    stem = in_filename if keep_suffix else os.path.splitext(in_filename)[0]
    out_filename = '{}.json'.format(stem)

    logger.info('Starting: {} -> {}'.format(in_filename, out_filename))
    with open(out_filename, 'w') as json_file:
        transcribe_to_file(in_filename, json_file, as_json=True)
    logger.info('Finished: {} -> {}'.format(in_filename, out_filename))
|
||||
|
||||
|
||||
def transcribe_many(in_filenames, keep_suffix=False, worker_count=DEFAULT_WORKER_COUNT):
    """Transcribe several files in parallel using a pool of worker processes.

    Args:
        in_filenames: Input filenames; each one is handed to `transcribe_one`.
        keep_suffix: Forwarded to `transcribe_one` for every file.
        worker_count: Number of worker processes in the pool.

    Blocks until every file has been processed.  The pool is always shut
    down before returning: closed on success, terminated if a task raised
    or the wait was interrupted.  The exception is re-raised afterwards.
    """
    pool = Pool(processes=worker_count)
    func = partial(transcribe_one, keep_suffix=keep_suffix)
    try:
        # NOTE(review): map_async().get() with a huge timeout instead of
        # map() is presumably the usual workaround that keeps the wait
        # interruptible by Ctrl-C on Python 2 -- confirm before simplifying.
        pool.map_async(func, in_filenames).get(99999999)
        pool.close()
    except BaseException:
        # Includes KeyboardInterrupt: stop outstanding work immediately.
        pool.terminate()
        raise
    finally:
        # Reap the worker processes so none are left behind.
        pool.join()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    args = parser.parse_args()
    # Forward --workers, which would otherwise be parsed and ignored.  int()
    # covers a parser that declares the option without a numeric type.
    transcribe_many(args.in_filenames, args.keep_suffix, worker_count=int(args.workers))
|
Loading…
x
Reference in New Issue
Block a user