"""
Load prediction file and GT file to calculate TVR metrics:
- recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7]
"""
import json
import numpy as np
from tqdm import tqdm
from collections import OrderedDict, defaultdict
def load_json(filename):
with open(filename, "r") as f:
return json.load(f)
def load_jsonl(filename):
with open(filename, "r") as f:
return [json.loads(l.strip("\n")) for l in f.readlines()]


def pad_sequences_1d_np(sequences, dtype=np.float32):
    """ Pad a single-nested list or a sequence of n-d np.ndarray
    into a (n+1)-d array, only allowing the first dim to have variable lengths.
    Args:
        sequences: list(n-d np.ndarray or list)
        dtype: np.dtype
    Returns:
        padded_seqs: ((n+1)-d array) padded with zeros
        mask: (2d array) of the same shape as the first two dims of padded_seqs,
            1 indicates valid, 0 otherwise
    Examples:
        >>> test_data_list = [[1, 2, 3], [1, 2], [3, 4, 7, 9]]
        >>> pad_sequences_1d_np(test_data_list, dtype=np.float32)
        >>> test_data_3d = [np.random.randn(2, 3, 4), np.random.randn(4, 3, 4), np.random.randn(1, 3, 4)]
        >>> pad_sequences_1d_np(test_data_3d, dtype=np.float32)
    """
    if isinstance(sequences[0], list):
        sequences = [np.asarray(s, dtype=dtype) for s in sequences]

    extra_dims = sequences[0].shape[1:]  # the extra dims should be the same for all elements
    lengths = [len(seq) for seq in sequences]
    assert "numpy" in str(dtype), "dtype and input type do not match"
    padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype)
    mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32)
    for idx, seq in enumerate(sequences):
        end = lengths[idx]
        padded_seqs[idx, :end] = seq
        mask[idx, :end] = 1
    return padded_seqs, mask


def compute_temporal_iou_batch(preds, gt):
    """ compute intersection-over-union along the temporal axis
    This function is significantly faster than `compute_temporal_iou`,
    and the result should be the same.
    Args:
        preds: np.ndarray, (N, 2), [st (float), ed (float)] * N
        gt: [st (float), ed (float)]
    Returns:
        iou: np.ndarray, (N, )
    References:
        for np.divide with zeros, see https://stackoverflow.com/a/37977222
    """
    intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0]))
    # the hull of the two segments, not the exact union; the two agree whenever the segments overlap,
    # and when they are disjoint the intersection is 0, so the IoU is 0 either way.
    union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0])
    return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0)
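
# A quick sanity check for compute_temporal_iou_batch (illustrative, not part of
# the evaluation flow): prediction [3, 5] vs GT [2, 4] overlaps by 1 over a hull
# of 3 (IoU 1/3), while prediction [6, 8] is disjoint from the GT (IoU 0).
# >>> compute_temporal_iou_batch(np.array([[3., 5.], [6., 8.]]), np.array([2., 4.]))
# array([0.33333333, 0.        ])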


def get_rounded_percentage(float_number, n_floats=2):
    return round(float_number * 100, n_floats)


TASK_TYPES = OrderedDict([
    ("VCMR", "Video Corpus Moment Retrieval"),
    ("SVMR", "Single Video Moment Retrieval"),
    ("VR", "regular Video Retrieval")
])


def eval_by_task_type(moment_predictions, video2idx, ground_truth,
                      iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100),
                      task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True):
    """ A predicted triplet is positive only if:
    1) its vid_name matches the GT vid_name
    2) IoU between its timestamp and the GT timestamp is higher than the given threshold

    moment_predictions w.r.t. different task_type:
    For each query, evaluated on the top max_pred_per_query [vid_name, st, ed] triplets. (score entry ignored)
    VCMR: vid_name might be repeating.
    SVMR: vid_name is fixed to be the GT vid_name.
    VR: vid_name is not repeating, st and ed will not be used.

    Args:
        video2idx: {vid_name (str): index (int), ...}
        moment_predictions: list(dict), each dict is {
            "desc": str,
            "query_id": int,
            "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred,
                sorted predictions, n_pred could be different for all dicts. For each prediction,
                only the first 3 elements [vid_name_idx (int), st (float), ed (float)] are used,
                any other following elements are ignored. We leave score here for record.
        }
        ground_truth: list(dict), each dict is {
            "desc": str,
            "query_id": int,
            "type": str, one of [v, t, vt]
            "vid_name": str
            "ts": a single [st (float), ed (float)] pair, or a list of such pairs (len >= 4, for DiDeMo).
            ...
        }
        iou_thds: temporal IoU thresholds
        recall_topks: recall at different top k
        task_type: str, one of ["VCMR", "SVMR", "VR"], see TASK_TYPES for definitions.
        max_pred_per_query: int, only the top max_pred_per_query predictions for each query are used.
        match_number: bool, must be set to True for real evaluation; False is only used for debugging.
        verbose: bool, print evaluation info.
        use_desc_type: bool, additionally report metrics by description type; only TVR has desc type.
    Returns:
        metrics: OrderedDict, overall metrics
        metrics_by_type: OrderedDict, metrics broken down by desc type (empty if use_desc_type is False)
    """
    assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys()))
    if verbose:
        print("Running evaluation with task_type {}, n results {}; n gt {}"
              .format(task_type, len(moment_predictions), len(ground_truth)))

    predictions_by_query_id = {e["query_id"]: e for e in moment_predictions}
    gt_by_query_id = {e["query_id"]: e for e in ground_truth}
    desc_type2idx = {"v": 0, "t": 1, "vt": 2}
    desc_types = []  # n_desc

    if match_number:
        assert set(gt_by_query_id.keys()) == set(predictions_by_query_id.keys()), \
            "query_ids in predictions and ground_truth must match"
        # assert len(set([len(e["predictions"]) for e in predictions_by_query_id.values()])) == 1, \
        #     "all queries must have the same number of predictions"

    pred_info_matrix_collection = []
    for k, gt_item in tqdm(gt_by_query_id.items(), desc="Loop over moments", leave=False):
        if not match_number and k not in predictions_by_query_id:
            continue
        pred_info_matrix = np.array(
            [e[:3] for e in predictions_by_query_id[k]["predictions"]][:max_pred_per_query],
            dtype=np.float32)  # (n_pred, 3)
        if use_desc_type:
            desc_types.append(desc_type2idx[gt_item["type"]])
        vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]]  # bool, (n_pred, )
        pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1)  # (n_pred, 4)

        # add len(iou_thds) columns, one iou_corrects column for each iou_thd.
        iou_thd_corrects_columns = []
        if len(gt_item["ts"]) >= 4:  # DiDeMo: for all 3 splits, each query has at least 4 GT ts; < 0.5% have more than 4.
            least_n_overlap = 2  # True if overlapped with at least least_n_overlap GT ts.
            iou_corrects_dict = defaultdict(list)
            for single_gt_ts in gt_item["ts"]:
                single_gt_ts = np.array(single_gt_ts, dtype=np.float32)  # (2, )
                # iou scores of the predictions that have the wrong vid_name are set to 0.
                iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred
                for iou_thd in iou_thds:
                    iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd)
            for iou_thd in iou_thds:
                iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap  # bool, (n_pred, )
                iou_thd_corrects_columns.append(iou_corrects[:, None])
        else:  # should be 2, len([st, ed]) == 2
            single_gt_ts = np.array(gt_item["ts"], dtype=np.float32)  # (2, )
            # iou scores of the predictions that have the wrong vid_name are set to 0.
            iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred
            for iou_thd in iou_thds:
                iou_corrects = iou_scores >= iou_thd  # bool, (n_pred, )
                iou_thd_corrects_columns.append(iou_corrects[:, None])

        pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1)  # (n_pred, 4 + len(iou_thds)), 6 by default
        pred_info_matrix_collection.append(pred_info_matrix)

    # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool),
    #  iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)]
    pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0]  # (n_desc, n_pred, 6)
    if use_desc_type:
        desc_types = np.array(desc_types)  # (n_desc)

    # results wrapper
    metrics = OrderedDict()
    metrics_by_type = OrderedDict()

    iou_c_offset = 4  # iou_corrects column index starts here
    if task_type == "VCMR":
        for iou_idx, iou_thd in enumerate(iou_thds):
            iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool)  # (n_desc, n_pred)
            # there might be more than one positive clip, so use `>= 1`
            for k in recall_topks:
                metrics["{}-r{}".format(iou_thd, k)] = \
                    get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1))
        if use_desc_type:
            for desc_type in desc_type2idx:
                type_corrects = desc_types == desc_type2idx[desc_type]  # (n_desc)
                n_desc_in_type = np.sum(type_corrects)  # scalar
                for iou_idx, iou_thd in enumerate(iou_thds):
                    # (n_desc, n_pred)
                    iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool)
                    for k in recall_topks:
                        metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage(
                            1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects))
                            / n_desc_in_type
                        )
elif task_type == "SVMR":
vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred)
n_desc = len(vid_name_matched)
for iou_idx, iou_thd in enumerate(iou_thds):
iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool) # (n_desc, n_pred)
# 1) there might be more than one positive clip, so use `>= 1`
for k in recall_topks:
metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean(
[np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)]
))
if use_desc_type:
for desc_type in desc_type2idx:
type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc)
n_desc_in_type = np.sum(type_corrects) # (n_desc)
for iou_idx, iou_thd in enumerate(iou_thds):
# (n_desc, n_pred)
iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(bool)
# 1) there might be more than one positive clip, so use `>= 1`
for k in recall_topks:
metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage(
1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx]
for idx in range(n_desc)])
/ n_desc_in_type)
elif task_type == "VR":
vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(bool) # (n_desc, n_pred)
for k in recall_topks:
metrics["r{}".format(k)] = \
get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1))
if use_desc_type:
for desc_type in desc_type2idx:
type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc)
n_desc_in_type = np.sum(type_corrects) # (n_desc)
for k in recall_topks:
metrics_by_type["{}-r{}".format(desc_type, k)] = get_rounded_percentage(
1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects))
/ n_desc_in_type)
else:
raise ValueError("task_type wrong.")
if use_desc_type:
metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\
.format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types))
for k in ["v", "t", "vt"]])
return metrics, metrics_by_type
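
# With the default iou_thds and recall_topks, `metrics` keys look like
# "0.5-r1", "0.5-r5", ..., "0.7-r100" for VCMR/SVMR, and "r1", ..., "r100" for VR;
# `metrics_by_type` keys are additionally prefixed with the desc type, e.g., "vt-0.5-r1".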


def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True):
    video2idx = submission["video2idx"]
    submitted_task_types = [k for k in TASK_TYPES if k in submission]
    if verbose:
        print("Evaluating for tasks {}".format(submitted_task_types))

    eval_metrics = OrderedDict()
    metrics_raw_dict = {}
    for task_type in submitted_task_types:
        metrics, metrics_by_type = eval_by_task_type(
            submission[task_type], video2idx, ground_truth,
            iou_thds=iou_thds, recall_topks=(1, 5, 10, 100),
            task_type=task_type, max_pred_per_query=100,
            match_number=match_number, verbose=verbose, use_desc_type=use_desc_type)
        metrics_raw_dict[task_type] = metrics
        metrics_raw_dict[task_type + "_by_type"] = metrics_by_type

    # order the output so all overall metrics come before the by-type breakdowns
    for task_type in submitted_task_types:
        eval_metrics[task_type] = metrics_raw_dict[task_type]
    if use_desc_type:
        for task_type in submitted_task_types:
            eval_metrics[task_type + "_by_type"] = metrics_raw_dict[task_type + "_by_type"]
    return eval_metrics
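
# A minimal sketch of calling eval_retrieval programmatically; all names and
# numbers below are hypothetical, made up purely to illustrate the expected
# input structure (see the eval_by_task_type docstring for the full schema):
#
#   video2idx = {"vid_a": 0, "vid_b": 1}
#   submission = {
#       "video2idx": video2idx,
#       "VCMR": [{"desc": "...", "query_id": 0,
#                 "predictions": [[0, 1.5, 6.0, 0.92], [1, 0.0, 3.0, 0.57]]}],
#   }
#   ground_truth = [{"desc": "...", "query_id": 0, "type": "v",
#                    "vid_name": "vid_a", "ts": [2.0, 5.5]}]
#   metrics = eval_retrieval(submission, ground_truth)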


def eval_main():
    import argparse
    parser = argparse.ArgumentParser(description="TVR Evaluation Script")
    parser.add_argument("--submission_path", type=str, help="path to generated prediction file")
    parser.add_argument("--gt_path", type=str, help="path to GT file")
    parser.add_argument("--save_path", type=str, help="path to save the results")
    parser.add_argument("--not_verbose", action="store_true")
    args = parser.parse_args()
    verbose = not args.not_verbose

    submission = load_json(args.submission_path)
    gt = load_jsonl(args.gt_path)
    results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose)
    if verbose:
        print(json.dumps(results, indent=4))

    with open(args.save_path, "w") as f:
        f.write(json.dumps(results, indent=4))


if __name__ == '__main__':
    eval_main()
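
# Example invocation (file names are illustrative):
#   python eval.py --submission_path submission.json --gt_path gt.jsonl --save_path metrics.json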