### A simpler rendition of reconstructions.
### Lists only xorbs and a "dedupe factor" for the xorb,
### where dedupe-factor 1 == no dedupe,
###       2 == 1 chunk shared,
###       3 == 2 chunks shared,
###       etc.
### (See the worked example at the bottom of this file.)

import json
import sys

import list_reconstructions
import list_repos


def list_xorbs(repos):
    # first build up a mapping of
    # {xorb_id: [(start, end, repo), (start, end, repo), ...]}
    xorbs = {}
    reconstructions = list_reconstructions.list_reconstructions(repos)
    for term in reconstructions:
        if term["xorb_id"] not in xorbs:
            xorbs[term["xorb_id"]] = []
        path_parts = term["file_path"].split("/")
        if path_parts[0] != "datasets" and path_parts[0] != "spaces":
            # models omit the "models" part from the file path
            path_parts.insert(0, "models")
        repo = "/".join(path_parts[:3])
        xorbs[term["xorb_id"]].append((term["start"], term["end"], repo))

    # then walk the lists and compute the dedupe factor
    output = []
    for xorb_id, chunks in xorbs.items():
        min_chunk_idx = float("inf")
        max_chunk_idx = float("-inf")
        xorb_repos = set()
        dedupe_factor = 0
        for chunk in chunks:
            min_chunk_idx = min(min_chunk_idx, chunk[0])
            max_chunk_idx = max(max_chunk_idx, chunk[1])
            xorb_repos.add(chunk[2])
        xorb_repos = list(xorb_repos)
        # count how many terms reference each chunk index in the xorb's span
        for i in range(min_chunk_idx, max_chunk_idx):
            ref_count = 0
            for chunk in chunks:
                if i >= chunk[0] and i < chunk[1]:
                    ref_count += 1
            dedupe_factor += ref_count
        # normalize by the xorb's chunk count (chunk indices start at 0, so
        # max_chunk_idx is the chunk count) to get average references per chunk
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)
        for repo in xorb_repos:
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo
            })
    return output


if __name__ == "__main__":
    json.dump(list_xorbs(list_repos.list_repos()),
              sys.stdout, sort_keys=True, indent=4)
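
### Worked example (hypothetical numbers, not from any real repo): suppose a
### xorb's chunks span indices [0, 4) and two file terms reference it, one
### covering (0, 4) and one covering (2, 4). The per-index reference counts
### are 1, 1, 2, 2, so dedupe_factor = (1 + 1 + 2 + 2) / 4 = 1.5: on average
### each chunk in the xorb is referenced 1.5 times.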