Spaces:
Running
on
Zero
Running
on
Zero
Upload ./RepCodec/examples/dump_feature.py with huggingface_hub
Browse files
RepCodec/examples/dump_feature.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) ByteDance, Inc. and its affiliates.
|
| 2 |
+
# Copyright (c) Chutong Meng
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the MIT license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
# Based on fairseq (https://github.com/facebookresearch/fairseq)
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
from feature_utils import get_path_iterator, dump_feature
|
| 13 |
+
|
| 14 |
+
# Configure the root logger once for the whole script: records go to stdout,
# and the verbosity is taken from the LOGLEVEL environment variable
# (upper-cased; "INFO" when the variable is unset).
logging.basicConfig(
    stream=sys.stdout,
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
)
# Named logger used by this script.
logger = logging.getLogger("dump_feature")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def main(
        model_type: str,
        tsv_path: str,
        ckpt_path: str,
        whisper_root: str,
        whisper_name: str,
        layer: int,
        nshard: int,
        rank: int,
        feat_dir: str,
        max_chunk: int,
        use_cpu: bool = False
):
    """Build the feature reader for ``model_type`` and dump features for one shard.

    The arguments are validated first, then the matching reader class is
    imported lazily and constructed, and finally the reader is handed to
    ``dump_feature`` together with the iterator returned by
    ``get_path_iterator``.

    Args:
        model_type: One of ``"hubert"``, ``"data2vec"`` or ``"whisper"``.
        tsv_path: Path to the tsv file; forwarded to ``get_path_iterator``.
        ckpt_path: Path to the speech model checkpoint. Required (and must
            exist on disk) for ``"hubert"`` and ``"data2vec"``.
        whisper_root: Root dir for the whisper model. Required for ``"whisper"``.
        whisper_name: Name of the whisper model. Required for ``"whisper"``.
        layer: Layer index forwarded to the reader constructor.
        nshard: Total number of shards; forwarded to ``get_path_iterator``
            and ``dump_feature``.
        rank: Shard id of this process; forwarded alongside ``nshard``.
        feat_dir: Output directory; forwarded to ``dump_feature``.
        max_chunk: Forwarded to the HuBERT / data2vec readers. Not used for
            whisper.
        use_cpu: When true the reader is created with ``device="cpu"``,
            otherwise with ``device="cuda"``.

    Raises:
        ValueError: If ``model_type`` is unsupported, or an argument required
            by the selected model type is missing.
        FileNotFoundError: If ``ckpt_path`` is given but does not exist.
    """
    device = "cpu" if use_cpu else "cuda"

    # Validate with explicit exceptions rather than ``assert``: assertions are
    # removed when Python runs with -O, which would let bad input through.
    if model_type in ("hubert", "data2vec"):
        if not ckpt_path:
            raise ValueError(f"ckpt_path must be provided for model type {model_type}")
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(f"checkpoint does not exist: {ckpt_path}")
    elif model_type == "whisper":
        if not (whisper_name and whisper_root):
            raise ValueError("whisper_root and whisper_name must be provided for model type whisper")
    else:
        raise ValueError(f"Unsupported model type {model_type}")

    # Reader modules are imported lazily so that only the selected backend's
    # module needs to be importable.
    if model_type == "hubert":
        from hubert_feature_reader import HubertFeatureReader
        reader = HubertFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
    elif model_type == "data2vec":
        from data2vec_feature_reader import Data2vecFeatureReader
        reader = Data2vecFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
    else:
        # Only "whisper" can reach this branch after the validation above.
        from whisper_feature_reader import WhisperFeatureReader
        reader = WhisperFeatureReader(whisper_root, whisper_name, layer, device=device)

    generator, num = get_path_iterator(tsv_path, nshard, rank)
    dump_feature(reader, generator, num, nshard, rank, feat_dir)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
if __name__ == "__main__":
    import argparse

    # Command-line options as (flag, add_argument keyword options) pairs,
    # registered in exactly this order.
    cli_options = [
        ("--model_type", {
            "required": True,
            "type": str,
            "choices": ["data2vec", "hubert", "whisper"],
            "help": "the type of the speech encoder.",
        }),
        ("--tsv_path", {
            "required": True,
            "type": str,
            "help": "the path to the tsv file.",
        }),
        ("--ckpt_path", {
            "required": False,
            "type": str,
            "default": None,
            "help": "path to the speech model. must provide for HuBERT and data2vec",
        }),
        ("--whisper_root", {
            "required": False,
            "type": str,
            "default": None,
            "help": "root dir to download/store whisper model. must provide for whisper model.",
        }),
        ("--whisper_name", {
            "required": False,
            "type": str,
            "default": None,
            "help": "name of whisper model. e.g., large-v2. must provide for whisper model.",
        }),
        ("--layer", {
            "required": True,
            "type": int,
            "help": "which layer of the model. this is 1-based.",
        }),
        ("--feat_dir", {
            "required": True,
            "type": str,
            "help": "the output dir to save the representations.",
        }),
        ("--nshard", {
            "required": False,
            "type": int,
            "default": 1,
            "help": "total number of shards.",
        }),
        ("--rank", {
            "required": False,
            "type": int,
            "default": 0,
            "help": "shard id of this process.",
        }),
        ("--max_chunk", {
            "type": int,
            "default": 1600000,
            "help": "max number of frames of each batch.",
        }),
        ("--use_cpu", {
            "default": False,
            "action": "store_true",
            "help": "whether use cpu instead of gpu.",
        }),
    ]

    arg_parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        arg_parser.add_argument(flag, **options)

    args = arg_parser.parse_args()
    logger.info(args)

    # Option names match main()'s parameter names, so the parsed namespace
    # can be expanded straight into keyword arguments.
    main(**vars(args))
|