|
""" |
|
Script for extracting DeepSpeech features from audio file. |
|
""" |
|
|
|
import os |
|
import argparse |
|
import numpy as np |
|
import pandas as pd |
|
from deepspeech_store import get_deepspeech_model_file |
|
from deepspeech_features import conv_audios_to_deepspeech |
|
|
|
|
|
def parse_args():
    """
    Build the command-line interface and parse the script arguments.

    Returns
    -------
    argparse.Namespace
        Parsed arguments with the attributes `input`, `output`, `deepspeech`
        and `metainfo`.
    """
    cli = argparse.ArgumentParser(
        description="Extract DeepSpeech features from audio file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # The input path is the only mandatory option.
    cli.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to input audio file or directory")

    # All remaining options are optional string paths (None when omitted).
    optional_path_options = (
        ("--output", "path to output file with DeepSpeech features"),
        ("--deepspeech", "path to DeepSpeech 0.1.0 frozen model"),
        ("--metainfo", "path to file with meta-information"),
    )
    for option_name, option_help in optional_path_options:
        cli.add_argument(option_name, type=str, help=option_help)

    return cli.parse_args()
|
|
|
|
|
def extract_features(in_audios,
                     out_files,
                     deepspeech_pb_path,
                     metainfo_file_path=None):
    """
    Extract DeepSpeech features from audio files and store them in files.

    Parameters
    ----------
    in_audios : list of str
        Paths to input audio files.
    out_files : list of str
        Paths to output files with DeepSpeech features. Empty (or None)
        entries are replaced IN PLACE with `<input path without extension>_ds.npy`.
    deepspeech_pb_path : str
        Path to DeepSpeech 0.1.0 frozen model.
    metainfo_file_path : str, default None
        Path to a tab-separated file with meta-information. Its `Count`
        column must hold one value per input audio file; the values are
        forwarded to the converter as `num_frames_info`. When omitted,
        `None` is forwarded for every audio file.

    Raises
    ------
    ValueError
        If the number of rows in the meta-information file differs from the
        number of input audio files.
    """
    if metainfo_file_path is None:
        num_frames_info = [None] * len(in_audios)
    else:
        # Builtin `int`/`str` replace the `np.int`/`np.unicode` aliases, which
        # were removed from NumPy and raise AttributeError on current versions.
        train_df = pd.read_csv(
            metainfo_file_path,
            sep="\t",
            index_col=False,
            dtype={"Id": int, "File": str, "Count": int})
        num_frames_info = train_df["Count"].values
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(num_frames_info) != len(in_audios):
            raise ValueError(
                "Meta-information file has {} rows, but {} audio files were given".format(
                    len(num_frames_info), len(in_audios)))

    # Derive a default output path next to each input that has none.
    for i, in_audio in enumerate(in_audios):
        if not out_files[i]:
            file_stem, _ = os.path.splitext(in_audio)
            out_files[i] = file_stem + "_ds.npy"

    conv_audios_to_deepspeech(
        audios=in_audios,
        out_files=out_files,
        num_frames_info=num_frames_info,
        deepspeech_pb_path=deepspeech_pb_path)
|
|
|
|
|
def main():
    """
    Main body of script.

    Parses the command line, resolves the DeepSpeech model file and runs
    feature extraction either for a single audio file or for every `.wav`
    file found directly inside the input directory (non-recursive, sorted by
    path). In directory mode the `--output` option is not used: output paths
    are derived from the input file names.

    Raises
    ------
    FileNotFoundError
        If the input file/directory does not exist.
    """
    args = parse_args()

    in_audio = os.path.expanduser(args.input)
    if not os.path.exists(in_audio):
        raise FileNotFoundError(
            "Input file/directory doesn't exist: {}".format(in_audio))

    # Honour an explicit --deepspeech path; previously it was unconditionally
    # overwritten by this hard-coded location, which is now only the default.
    default_model_path = "~/.tensorflow/models/deepspeech-0_1_0-b90017e8.pb"
    deepspeech_pb_path = os.path.expanduser(args.deepspeech or default_model_path)
    if not os.path.exists(deepspeech_pb_path):
        # Fall back to whatever model file the store helper provides.
        deepspeech_pb_path = get_deepspeech_model_file()

    if os.path.isfile(in_audio):
        in_audios = [in_audio]
        out_files = [args.output]
    else:
        # Collect regular files with a (case-insensitive) .wav extension.
        in_audios = sorted(
            os.path.join(in_audio, file_name)
            for file_name in os.listdir(in_audio)
            if os.path.isfile(os.path.join(in_audio, file_name))
            and os.path.splitext(file_name)[1].lower() == ".wav")
        # Empty entries make extract_features derive the output names.
        out_files = [""] * len(in_audios)

    extract_features(
        in_audios=in_audios,
        out_files=out_files,
        deepspeech_pb_path=deepspeech_pb_path,
        metainfo_file_path=args.metainfo)
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|