Spaces:

znation
/

xet-repo-data-collection

Sleeping

File size: 2,083 Bytes

f624d68

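### A simpler rendition of reconstructions.
### Lists only xorbs and a "dedupe factor" for each xorb, emitted once per
### repo that references the xorb. The factor is the total number of chunk
### references divided by the largest chunk end index referenced in the xorb,
### i.e. roughly the average number of references per chunk.
### Where dedupe-factor 1 == no dedupe (each chunk referenced once),
###                     2 == each chunk shared once (referenced twice),
###                     3 == each chunk shared twice (referenced three times),
###                     etc.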

import json
import sys

import list_reconstructions
import list_repos

def list_xorbs(repos):
    """Summarize how heavily each xorb is referenced by the given repos.

    Reconstruction terms are fetched with
    ``list_reconstructions.list_reconstructions(repos)``. Each term is read
    for its ``"xorb_id"``, ``"file_path"``, ``"start"`` and ``"end"`` keys,
    and ``[start, end)`` is treated as a half-open range of chunk indices
    inside that xorb.

    The dedupe factor of a xorb is the total number of chunk references
    (the summed length of all its ranges) divided by the largest ``end``
    seen for that xorb. When that largest ``end`` is 0 the factor is left
    as the undivided integer total.

    Args:
        repos: Passed through unchanged to
            ``list_reconstructions.list_reconstructions``.

    Returns:
        A list of ``{"xorb_id", "dedupe_factor", "repo"}`` dicts, one per
        (xorb, referencing repo) pair. Within a xorb the repos are emitted
        in sorted order so the output is reproducible between runs.
    """
    # First group the referenced ranges by xorb:
    # {xorb_id: [(start, end, repo), (start, end, repo), ...]}
    refs_by_xorb = {}
    for term in list_reconstructions.list_reconstructions(repos):
        refs_by_xorb.setdefault(term["xorb_id"], []).append(
            (term["start"], term["end"], _repo_of(term["file_path"]))
        )

    # Then compute the dedupe factor of each xorb and fan it out per repo.
    output = []
    for xorb_id, refs in refs_by_xorb.items():
        max_chunk_idx = max(end for _start, end, _repo in refs)

        # Every range lies inside [min start, max end), so counting, for each
        # index in that span, how many ranges cover it is the same as summing
        # the range lengths. Empty or inverted ranges cover no index.
        dedupe_factor = sum(max(0, end - start) for start, end, _repo in refs)

        # NOTE(review): the denominator is the largest end index, not the
        # width of the referenced span, so a xorb whose low-index chunks are
        # never referenced scores below 1 -- confirm this is intended.
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)

        # Sorted: set iteration order of strings varies between interpreter
        # runs, which would otherwise shuffle the emitted records.
        for repo in sorted({repo for _start, _end, repo in refs}):
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo,
            })
    return output


def _repo_of(file_path):
    """Return the first three ``/``-separated parts of *file_path* as a repo id."""
    path_parts = file_path.split("/")
    if path_parts[0] not in ("datasets", "spaces"):
        # models omit the "models" part from file path
        path_parts.insert(0, "models")
    return "/".join(path_parts[:3])


if __name__ == "__main__":
    # Script entry point: compute the xorb records for the repos returned by
    # list_repos.list_repos() and write them to stdout as indented JSON with
    # sorted object keys.
    xorb_records = list_xorbs(list_repos.list_repos())
    json.dump(xorb_records, sys.stdout, sort_keys=True, indent=4)