Mirror of https://github.com/kkroening/ffmpeg-python.git
Add --timing option to transcribe.py; output json
This commit is contained in:
parent de1ec94be9
commit 87f85000ac
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 from __future__ import unicode_literals
 
+from google.protobuf.json_format import MessageToJson
 from google.cloud import speech
 from google.cloud.speech import enums
 from google.cloud.speech import types
@@ -9,6 +10,7 @@ import ffmpeg
 import logging
 import subprocess
 import sys
+import IPython
 
 
 logging.basicConfig(level=logging.INFO, format='%(message)s')
@@ -18,6 +20,7 @@ logger.setLevel(logging.INFO)
 
 parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
 parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
+parser.add_argument('--timing', action='store_true', help='Include timing info')
 
 
 def decode_audio(in_filename, **input_kwargs):
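The new flag is a plain store_true boolean, so args.timing stays False unless --timing is passed on the command line. A minimal standalone sketch of the resulting parser behavior; the filename 'in.wav' is only an illustrative example, not part of the commit:

import argparse

parser = argparse.ArgumentParser(description='Convert speech audio to text using Google Speech API')
parser.add_argument('in_filename', help='Input filename (`-` for stdin)')
parser.add_argument('--timing', action='store_true', help='Include timing info')

# 'in.wav' is a made-up example filename.
print(parser.parse_args(['in.wav']).timing)              # False
print(parser.parse_args(['in.wav', '--timing']).timing)  # True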
@@ -38,25 +41,24 @@ def decode_audio(in_filename, **input_kwargs):
     return out[0]
 
 
-def get_transcripts(audio_data):
+def get_transcripts(audio_data, include_timing_info=False):
     client = speech.SpeechClient()
     audio = types.RecognitionAudio(content=audio_data)
     config = types.RecognitionConfig(
         encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=16000,
-        language_code='en-US'
+        language_code='en-US',
+        enable_word_time_offsets=include_timing_info,
     )
-    response = client.recognize(config, audio)
-    return [result.alternatives[0].transcript for result in response.results]
+    return client.recognize(config, audio)
 
 
-def transcribe(in_filename):
+def transcribe(in_filename, include_timing_info=False):
     audio_data = decode_audio(in_filename)
-    transcripts = get_transcripts(audio_data)
-    for transcript in transcripts:
-        print(repr(transcript.encode('utf-8')))
+    response = get_transcripts(audio_data, include_timing_info)
+    print(MessageToJson(response, sort_keys=True))
 
 
 if __name__ == '__main__':
     args = parser.parse_args()
-    transcribe(args.in_filename)
+    transcribe(args.in_filename, args.timing)
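With --timing, enable_word_time_offsets is set on the RecognitionConfig and the script now prints the whole recognize response as JSON via MessageToJson instead of bare transcript strings. A hedged sketch of consuming that output for word-level timings, assuming the Cloud Speech v1 JSON layout (results -> alternatives -> words, camelCase keys, duration strings such as "1.300s") and an illustrative input file 'in.wav':

import json
import subprocess

# Run the script and parse its printed JSON; 'in.wav' is a hypothetical file.
raw = subprocess.check_output(['python', 'transcribe.py', 'in.wav', '--timing'])
response = json.loads(raw)

for result in response.get('results', []):
    alternative = result['alternatives'][0]
    print(alternative.get('transcript', ''))
    for word_info in alternative.get('words', []):
        # startTime/endTime are only populated when --timing was requested.
        print(' ', word_info['word'], word_info.get('startTime'), word_info.get('endTime'))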