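"""Compute per-repo dedupe statistics for every xorb the given repos reference.

For each xorb, the dedupe factor approximates the average number of
reconstruction terms referencing each of its chunks; each repo that shares
the xorb gets one output record.
"""
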
import json
import sys

import list_reconstructions
import list_repos


def list_xorbs(repos):
    """Return per-repo dedupe statistics for every xorb the repos reference."""
    xorbs = {}
    reconstructions = list_reconstructions.list_reconstructions(repos)
    for term in reconstructions:
        if term["xorb_id"] not in xorbs:
            xorbs[term["xorb_id"]] = []
        # Normalize the repo id to "<type>/<namespace>/<name>"; model repos
        # carry no explicit type prefix in their file paths.
        path_parts = term["file_path"].split("/")
        if path_parts[0] not in ("datasets", "spaces"):
            path_parts.insert(0, "models")
        repo = "/".join(path_parts[:3])
        xorbs[term["xorb_id"]].append((term["start"], term["end"], repo))
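
    # For each xorb, estimate how heavily its chunks are shared: sum the
    # number of reconstruction terms covering each chunk index, then
    # normalize by the highest referenced chunk index.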
    output = []
    for xorb_id, chunks in xorbs.items():
        min_chunk_idx = float("inf")
        max_chunk_idx = float("-inf")
        xorb_repos = set()
        dedupe_factor = 0
        for start, end, repo in chunks:
            min_chunk_idx = min(min_chunk_idx, start)
            max_chunk_idx = max(max_chunk_idx, end)
            xorb_repos.add(repo)
        for i in range(min_chunk_idx, max_chunk_idx):
            ref_count = sum(1 for start, end, _ in chunks if start <= i < end)
            dedupe_factor += ref_count
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)
        # Every repo referencing the xorb gets a record carrying the shared
        # dedupe factor.
        for repo in xorb_repos:
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo,
            })
    return output


if __name__ == "__main__":
    json.dump(list_xorbs(list_repos.list_repos()), sys.stdout, sort_keys=True, indent=4)
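
# Example (the script filename here is hypothetical):
#     python list_xorbs.py > xorbs.json
# Each record in the output looks like:
#     {"dedupe_factor": 1.5, "repo": "models/org/name", "xorb_id": "..."}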