ffmpeg-python/examples/transcribe.py
2023-03-06 17:57:57 +01:00

57 lines
1.7 KiB
Python
Executable File

#!/usr/bin/env python
from __future__ import unicode_literals, print_function
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
import argparse
import ffmpeg
import logging
import sys
# Log bare messages (no level/name prefix) at INFO and above.
logging.basicConfig(
    format='%(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

# Command-line interface: a single positional input filename.
parser = argparse.ArgumentParser(
    description='Convert speech audio to text using Google Speech API',
)
parser.add_argument(
    'in_filename',
    help='Input filename (`-` for stdin)',
)
def decode_audio(in_filename, **input_kwargs):
    """Decode an input into raw mono, 16 kHz, signed 16-bit little-endian PCM.

    Runs ffmpeg on ``in_filename`` with the output directed to stdout and
    captures that stream.

    Args:
        in_filename: Input passed to ``ffmpeg.input``.
        **input_kwargs: Extra keyword arguments forwarded to ``ffmpeg.input``.

    Returns:
        The captured stdout of the ffmpeg run (the raw PCM data).

    On ``ffmpeg.Error`` the command line and ffmpeg's stderr are printed to
    ``sys.stderr`` and the process exits with status 1.
    """
    try:
        # Only stdout (the PCM stream) is needed on success; stderr is
        # captured solely so it can be reported if ffmpeg fails.
        out, _ = (
            ffmpeg
            .input(in_filename, **input_kwargs)
            .output('-', format='s16le', acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # ffmpeg's diagnostics may contain bytes that are not valid UTF-8
        # (e.g. echoed file names); decode leniently so a UnicodeDecodeError
        # never masks the real ffmpeg failure.
        stderr_text = e.stderr.decode('utf-8', errors='replace')
        print(f"$ {e.cmdline}\n{stderr_text}", file=sys.stderr)
        sys.exit(1)
    return out
def get_transcripts(audio_data):
    """Send audio to the Google Speech API and collect the transcripts.

    The request describes ``audio_data`` as LINEAR16-encoded audio sampled at
    16000 Hz with language code ``en-US``.

    Returns:
        A list holding, for each result in the response, the transcript of
        that result's first alternative.
    """
    speech_client = speech.SpeechClient()
    recognition_audio = types.RecognitionAudio(content=audio_data)
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
    )
    response = speech_client.recognize(recognition_config, recognition_audio)

    transcripts = []
    for result in response.results:
        # Keep only the first alternative of each result.
        first_alternative = result.alternatives[0]
        transcripts.append(first_alternative.transcript)
    return transcripts
def transcribe(in_filename):
    """Decode ``in_filename``, transcribe it, and print every transcript.

    Each transcript is printed on its own line as the ``repr`` of its UTF-8
    encoded form.
    """
    pcm_data = decode_audio(in_filename)
    for text in get_transcripts(pcm_data):
        encoded = text.encode('utf-8')
        print(repr(encoded))
if __name__ == '__main__':
    # Script entry point: transcribe the file named on the command line.
    transcribe(parser.parse_args().in_filename)