Merge remote-tracking branch 'origin/master' into stream_selectors

2025-08-30 02:30:02 +08:00 · 2018-03-10 19:01:57 -08:00 · 2018-03-10 19:01:57 -08:00 · 809ab6cd17
commit 809ab6cd17
parent ef9b102676 a029d7aacc
8 changed files with 249 additions and 3 deletions
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@ -0,0 +1,2 @@
 ffmpeg-python
 google-cloud-speech
--- a/examples/split_silence.py
+++ b/examples/split_silence.py
@ -0,0 +1,141 @@
 #!/usr/bin/env python
 from __future__ import unicode_literals
 import argparse
 import errno
 import ffmpeg
 import logging
 import os
 import re
 import subprocess
 import sys
 # Log bare messages at INFO level by default; the ``-v`` flag raises the level to DEBUG.
 logging.basicConfig(level=logging.INFO, format='%(message)s')
 logger = logging.getLogger(__file__)
 logger.setLevel(logging.INFO)
 # Defaults for the ``silencedetect`` filter parameters (duration and threshold in dB).
 DEFAULT_DURATION = 0.3
 DEFAULT_THRESHOLD = -60
 # Command-line interface; the argument names match the keyword arguments of ``split_audio``.
 parser = argparse.ArgumentParser(description='Split media into separate chunks wherever silence occurs')
 parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
 parser.add_argument('out_pattern', help='Output filename pattern (e.g. `out/chunk_{:04d}.wav`)')
 parser.add_argument('--silence-threshold', default=DEFAULT_THRESHOLD, type=int, help='Silence threshold (in dB)')
 parser.add_argument('--silence-duration', default=DEFAULT_DURATION, type=float, help='Silence duration')
 parser.add_argument('--start-time', type=float, help='Start time (seconds)')
 parser.add_argument('--end-time', type=float, help='End time (seconds)')
 parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')
 # Patterns for the lines parsed out of ffmpeg's stderr.  Raw strings are used so that ``\.`` reaches the
 # regex engine verbatim instead of being an invalid string escape sequence.
 silence_start_re = re.compile(r' silence_start: (?P<start>[0-9]+(\.?[0-9]*))$')
 silence_end_re = re.compile(r' silence_end: (?P<end>[0-9]+(\.?[0-9]*)) ')
 total_duration_re = re.compile(
    r'size=[^ ]+ time=(?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9\.]{5}) bitrate=')
 def _logged_popen(cmd_line, *args, **kwargs):
    """Start ``cmd_line`` with ``subprocess.Popen`` after logging it at debug level.

    Any extra positional and keyword arguments are forwarded to ``subprocess.Popen`` unchanged,
    and the resulting ``Popen`` object is returned.
    """
    rendered_cmd = subprocess.list2cmdline(cmd_line)
    logger.debug('Running command: {}'.format(rendered_cmd))
    process = subprocess.Popen(cmd_line, *args, **kwargs)
    return process
 def get_chunk_times(in_filename, silence_threshold, silence_duration, start_time=None, end_time=None):
    """Run ffmpeg's ``silencedetect`` filter over ``in_filename`` and return the non-silent chunks.

    Args:
        in_filename: Input passed to ``ffmpeg.input``.
        silence_threshold: Noise threshold in dB, passed to ``silencedetect`` as ``n``.
        silence_duration: Silence duration passed to ``silencedetect`` as ``d``.
        start_time: Optional start offset in seconds (passed to ffmpeg as ``ss``); treated as 0 when omitted.
        end_time: Optional end offset in seconds; ffmpeg's ``t`` option is set to ``end_time - start_time``.
            If ffmpeg prints a final ``size=... time=...`` line, the time parsed from it replaces this value.

    Returns:
        A list of ``(chunk_start, chunk_end)`` tuples.

    If ffmpeg exits with a non-zero status, its stderr is echoed and the process exits with status 1.
    """
    input_kwargs = {}
    if start_time is not None:
        input_kwargs['ss'] = start_time
    else:
        start_time = 0.
    if end_time is not None:
        input_kwargs['t'] = end_time - start_time
    p = _logged_popen(
        (ffmpeg
            .input(in_filename, **input_kwargs)
            .filter_('silencedetect', n='{}dB'.format(silence_threshold), d=silence_duration)
            .output('-', format='null')
            .compile()
        ) + ['-nostats'],  # FIXME: use .nostats() once it's implemented in ffmpeg-python.
        stderr=subprocess.PIPE
    )
    # ffmpeg echoes file metadata on stderr, which is not guaranteed to be valid UTF-8; replace
    # undecodable bytes rather than failing with UnicodeDecodeError.
    output = p.communicate()[1].decode('utf-8', 'replace')
    if p.returncode != 0:
        sys.stderr.write(output)
        sys.exit(1)
    logger.debug(output)
    lines = output.splitlines()
    # Chunks start when silence ends, and chunks end when silence starts.
    chunk_starts = []
    chunk_ends = []
    for line in lines:
        silence_start_match = silence_start_re.search(line)
        if silence_start_match:
            chunk_ends.append(float(silence_start_match.group('start')))
            if not chunk_starts:
                # Started with non-silence.
                chunk_starts.append(start_time)
            continue
        silence_end_match = silence_end_re.search(line)
        if silence_end_match:
            chunk_starts.append(float(silence_end_match.group('end')))
            continue
        total_duration_match = total_duration_re.search(line)
        if total_duration_match:
            hours = int(total_duration_match.group('hours'))
            minutes = int(total_duration_match.group('minutes'))
            seconds = float(total_duration_match.group('seconds'))
            end_time = hours * 3600 + minutes * 60 + seconds
    if not chunk_starts:
        # No silence found.
        chunk_starts.append(start_time)
    if len(chunk_starts) > len(chunk_ends):
        # Finished with non-silence; fall back to a very large end time when none is known.
        chunk_ends.append(end_time or 10000000.)
    return list(zip(chunk_starts, chunk_ends))
 def _makedirs(path):
    """Python2-compatible version of ``os.makedirs(path, exist_ok=True)``."""
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
 def split_audio(
    in_filename,
    out_pattern,
    silence_threshold=DEFAULT_THRESHOLD,
    silence_duration=DEFAULT_DURATION,
    start_time=None,
    end_time=None,
    verbose=False,
 ):
    """Split ``in_filename`` into one output file per non-silent chunk.

    Args:
        in_filename: Input passed to ffmpeg.
        out_pattern: ``str.format`` pattern for output filenames; the chunk index is available both
            positionally and as ``i`` (e.g. ``out/chunk_{:04d}.wav``).
        silence_threshold: Silence threshold in dB, forwarded to ``get_chunk_times``.
        silence_duration: Silence duration, forwarded to ``get_chunk_times``.
        start_time: Optional start time in seconds, forwarded to ``get_chunk_times``.
        end_time: Optional end time in seconds, forwarded to ``get_chunk_times``.
        verbose: When true, ffmpeg's stdout/stderr are inherited instead of being captured.
    """
    chunk_times = get_chunk_times(in_filename, silence_threshold, silence_duration, start_time, end_time)
    for i, (chunk_start, chunk_end) in enumerate(chunk_times):
        duration = chunk_end - chunk_start
        out_filename = out_pattern.format(i, i=i)
        out_dir = os.path.dirname(out_filename)
        if out_dir:
            # A pattern without a directory part yields '' here, which ``os.makedirs`` rejects;
            # in that case the output goes to the current directory and nothing needs creating.
            _makedirs(out_dir)
        logger.info('{}: start={:.02f}, end={:.02f}, duration={:.02f}'.format(out_filename, chunk_start, chunk_end,
            duration))
        p = _logged_popen(
            (ffmpeg
                .input(in_filename, ss=chunk_start, t=duration)
                .output(out_filename)
                .overwrite_output()
                .compile()
            ),
            stdout=subprocess.PIPE if not verbose else None,
            stderr=subprocess.PIPE if not verbose else None,
        )
        p.communicate()
        if p.returncode != 0:
            # Report the failure instead of silently dropping it, but keep processing later chunks.
            logger.error('ffmpeg exited with status {} while writing {}'.format(p.returncode, out_filename))
 if __name__ == '__main__':
    kwargs = vars(parser.parse_args())
    if kwargs['verbose']:
        # ``logging.basicConfig`` does nothing once the root logger has handlers, and it was already
        # configured at import time, so switch the existing root handlers to the verbose format here.
        # The format uses ``%(levelname)s``; the previous ``%(levels)`` was not a valid specifier.
        root_logger = logging.getLogger()
        verbose_formatter = logging.Formatter('%(levelname)s: %(message)s')
        for handler in root_logger.handlers:
            handler.setFormatter(verbose_formatter)
        root_logger.setLevel(logging.DEBUG)
        logger.setLevel(logging.DEBUG)
    split_audio(**kwargs)
--- a/examples/transcribe.py
+++ b/examples/transcribe.py
@ -0,0 +1,62 @@
 #!/usr/bin/env python
 from __future__ import unicode_literals
 from google.cloud import speech
 from google.cloud.speech import enums
 from google.cloud.speech import types
 import argparse
 import ffmpeg
 import logging
 import subprocess
 import sys
 # Emit bare log messages at INFO level.
 logging.basicConfig(format='%(message)s', level=logging.INFO)
 logger = logging.getLogger(__file__)
 logger.setLevel(logging.INFO)
 # Command-line interface: a single positional input filename.
 parser = argparse.ArgumentParser(
    description='Convert speech audio to text using Google Speech API',
 )
 parser.add_argument(
    'in_filename',
    help='Input filename (`-` for stdin)',
 )
 def decode_audio(in_filename, **input_kwargs):
    """Decode ``in_filename`` with ffmpeg and return the raw audio bytes read from its stdout.

    The output is requested as mono (``ac=1``), 16k sample rate (``ar='16k'``), signed 16-bit
    little-endian PCM (``s16le``).  Extra keyword arguments are forwarded to ``ffmpeg.input``.

    If ffmpeg exits with a non-zero status, its stderr is echoed and the process exits with status 1.
    """
    p = subprocess.Popen(
        (ffmpeg
            .input(in_filename, **input_kwargs)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .compile()
        ),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    audio_data, err = p.communicate()
    if p.returncode != 0:
        # The pipe yields bytes, but ``sys.stderr`` is a text stream on Python 3; decode first and
        # replace undecodable bytes so error reporting itself cannot fail.
        sys.stderr.write(err.decode('utf-8', 'replace'))
        sys.exit(1)
    return audio_data
 def get_transcripts(audio_data):
    """Send ``audio_data`` to the Google Speech API and return one transcript per result.

    The request declares LINEAR16 encoding at 16000 Hz with language ``en-US``; for every result
    in the response, the transcript of its first alternative is returned.
    """
    client = speech.SpeechClient()
    recognition_audio = types.RecognitionAudio(content=audio_data)
    linear16 = enums.RecognitionConfig.AudioEncoding.LINEAR16
    recognition_config = types.RecognitionConfig(
        encoding=linear16,
        sample_rate_hertz=16000,
        language_code='en-US'
    )
    response = client.recognize(recognition_config, recognition_audio)
    transcripts = []
    for result in response.results:
        transcripts.append(result.alternatives[0].transcript)
    return transcripts
 def transcribe(in_filename):
    """Decode ``in_filename``, transcribe it, and print the repr of each UTF-8 encoded transcript."""
    raw_audio = decode_audio(in_filename)
    for text in get_transcripts(raw_audio):
        encoded = text.encode('utf-8')
        print(repr(encoded))
 if __name__ == '__main__':
    # Parse the command line and transcribe the requested input file.
    cli_args = parser.parse_args()
    transcribe(in_filename=cli_args.in_filename)
--- a/ffmpeg/_filters.py
+++ b/ffmpeg/_filters.py
@ -52,6 +52,11 @@ def split(stream):
    return FilterNode(stream, split.__name__)
@filter_operator()
 def asplit(stream):
    """Create a filter node named ``asplit`` on ``stream``.

    NOTE(review): presumably the audio counterpart of ``split`` — confirm against ffmpeg's
    ``asplit`` filter documentation.
    """
    return FilterNode(stream, asplit.__name__)
@filter_operator()
 def setpts(stream, expr):
    """Change the PTS (presentation timestamp) of the input frames.
@ -421,6 +426,7 @@ __all__ = [
    'drawbox',
    'drawtext',
    'filter_',
    'filter_multi_output',
    'hflip',
    'hue',
    'overlay',
--- a/ffmpeg/_view.py
+++ b/ffmpeg/_view.py
@ -78,7 +78,7 @@ def view(stream_spec, **kwargs):
            downstream_node_id = str(hash(edge.downstream_node))
            graph.edge(upstream_node_id, downstream_node_id, **kwargs)
-    graph.view(filename)
+    graph.view(filename, cleanup=True)
    return stream_spec
--- a/ffmpeg/nodes.py
+++ b/ffmpeg/nodes.py
@ -280,7 +280,7 @@ class FilterNode(Node):
    def _get_filter(self, outgoing_edges):
        args = self.args
        kwargs = self.kwargs
-        if self.name == 'split':
+        if self.name in ('split', 'asplit'):
            args = [len(outgoing_edges)]
        out_args = [escape_chars(x, '\\\'=:') for x in args]
--- a/ffmpeg/tests/test_ffmpeg.py
+++ b/ffmpeg/tests/test_ffmpeg.py
@ -180,6 +180,41 @@ def test_map_same_effect_as_output():
                                                      TEST_OUTPUT_FILE1]
 def _get_complex_filter_asplit_example():
    """Build a graph that applies ``vflip`` then ``asplit``, trims each branch with ``atrim``, and concats them.

    Returns the output stream (writing to ``TEST_OUTPUT_FILE1`` with overwrite enabled).
    """
    split = ffmpeg.input(TEST_INPUT_FILE1).vflip().asplit()
    first_branch, second_branch = split[0], split[1]
    first_trimmed = first_branch.filter_("atrim", start=10, end=20)
    second_trimmed = second_branch.filter_("atrim", start=30, end=40)
    joined = ffmpeg.concat(first_trimmed, second_trimmed)
    return joined.output(TEST_OUTPUT_FILE1).overwrite_output()
 def test_filter_asplit():
    """The asplit example must compile to ``asplit=2`` with one labelled output per downstream branch."""
    expected_filter_graph = (
        '[0]vflip[s0];[s0]asplit=2[s1][s2];[s1]atrim=end=20:start=10[s3];[s2]atrim=end=40:start=30[s4];[s3]'
        '[s4]concat=n=2[s5]'
    )
    expected_args = [
        '-i',
        TEST_INPUT_FILE1,
        '-filter_complex',
        expected_filter_graph,
        '-map',
        '[s5]',
        TEST_OUTPUT_FILE1,
        '-y'
    ]
    actual_args = _get_complex_filter_asplit_example().get_args()
    assert actual_args == expected_args
 def test_filter_normal_arg_escape():
    """Test string escaping of normal filter args (e.g. ``font`` param of ``drawtext`` filter)."""
    def _get_drawtext_font_repr(font):
--- a/setup.py
+++ b/setup.py
@ -2,7 +2,7 @@ from setuptools import setup
 from textwrap import dedent
 import subprocess
-version = '0.1.9'
+version = '0.1.10'
 download_url = 'https://github.com/kkroening/ffmpeg-python/archive/v{}.zip'.format(version)
 long_description = dedent("""\