# Merge input files

### ✅ Prerequisites

[Python 3.10](https://www.python.org/downloads/)


> [!CAUTION]
> Please make sure all input files are valid before trying to merge them. You can check the validity of each file with the [`validate_input_file`](./validate_input_file.ipynb) notebook.


> [!IMPORTANT]
> Only `tasks` common to all input files, along with their associated documents, models, and evaluations, are preserved in the resulting file.

### Merge function

In [13]:
from typing import Dict, Set
import json


# =========================================================
# HELPER FUNCTIONS
# =========================================================
def read_json(filename: str, encoding="utf-8"):
    """Load and return the JSON content stored in ``filename``.

    Args:
        filename: Path of the JSON file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(filename, mode="r", encoding=encoding) as source:
        parsed = json.load(source)
    return parsed


def write_json(filename: str, content: dict, encoding="utf-8"):
    """Serialize ``content`` as JSON into ``filename``.

    Args:
        filename: Path of the file to (over)write.
        content: JSON-serializable object to store.
        encoding: Text encoding used to open the file.

    Returns:
        None (the result of ``json.dump``).
    """
    with open(filename, mode="w", encoding=encoding) as sink:
        outcome = json.dump(content, sink)
    return outcome


# =========================================================
# MAIN FUNCTION
# =========================================================
def merge(inputs: list[dict]) -> dict:
 # Step 1: Return, if single JSON
 if len(inputs) == 1:
 return inputs[0]

 # Step 2: When multiple input JSONs
 # Step 2.a: Initialize necessary variables
 merged_tasks: Dict[str, dict] = {}
 tasks_to_models: Dict[str, Set[str]] = {}
 evaluations: Dict[str, dict] = {}
 all_models = {}
 all_filters = set()

 # Step 2.b: Iterate over each input JSON
 for entry in inputs:
 # Step 2.b.i: Add model to dictionary of all models, if not present already
 for model in entry["models"]:
 if model["model_id"] in all_models:
 if model["name"] != all_models[model["model_id"]]["name"]:
 print(
 f"Mismatched model information for model with id: ${model['model_id']}"
 )
 else:
 all_models[model["model_id"]] = model

 # Step 2.b.ii: Add filters to set of all filter
 if "filters" in entry and entry["filters"]:
 for filter in entry["filters"]:
 all_filters.add(filter)

 # Step 2.b.iii: Iterate over each evaluation
 for evaluation in entry["evaluations"]:
 # Step 2.b.iii.*: Extend map of task IDs to model IDs based on evaluations
 try:
 tasks_to_models[evaluation["task_id"]].add(evaluation["model_id"])
 except KeyError:
 tasks_to_models[evaluation["task_id"]] = set([evaluation["model_id"]])

 # Step 2.b.iii.*: Extend evaluations map, if necessary
 if (
 f"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}"
 not in evaluations
 ):
 evaluations[
 f"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}"
 ] = evaluation

 # Step 2.b.iv: Create merged tasks as follows
 # 1. Merge comments for same task from different input JSONs
 # 2. Merge flagged status for same task from different input JSONs (preserved flagged=True, if any of the input JSONs has it to be 'True')
 for task in entry["tasks"]:
 if task["task_id"] in merged_tasks:
 if "comments" in task and task["comments"]:
 try:
 merged_tasks[task["task_id"]]["comments"].extend(
 task["comments"]
 )
 except KeyError:
 merged_tasks[task["task_id"]]["comments"] = [task["comments"]]

 if "flagged" in task:
 try:
 merged_tasks[task["task_id"]]["flagged"] = (
 merged_tasks[task["task_id"]]["flagged"] or task["flagged"]
 )
 except KeyError:
 merged_tasks[task["task_id"]]["flagged"] = task["flagged"]
 else:
 merged_tasks[task["task_id"]] = task

 # Step 3: Find candidate models
 # Criterion: A group of models which has evaluations for all tasks
 candidate_models = {
 model_id: all_models[model_id]
 for model_id in set.intersection(*list(tasks_to_models.values()))
 }

 # Step 4: Create potential filters
 candidate_filters = all_filters
 for task in merged_tasks.values():
 candidate_filters = candidate_filters.intersection(task.keys())

 # Step 4: Return
 if candidate_models:
 return {
 "name": f"Merged from ${len(inputs)} files",
 "filters": list(candidate_filters),
 "models": list(candidate_models.values()),
 "metrics": inputs[0]["metrics"],
 "documents": inputs[0]["documents"],
 "tasks": inputs[0]["tasks"],
 "evaluations": [
 evaluations[f"{task['task_id']}<:SEP:>{model_id}"]
 for task in inputs[0]["tasks"]
 for model_id in candidate_models
 ],
 }
 else:
 print("Failed to find models with evaluations for all tasks.")
 return None


# =========================================================
# EXECUTE
# =========================================================
# Step 1: Load every input file that should be merged
# (fill in one path per file; add more entries as needed)
inputs = [
    read_json(filename=path)
    for path in (
        "",
        "",
    )
]

# Step 2: Merge the loaded inputs
output = merge(inputs=inputs)

# Step 3: Write the merged result, unless merging produced nothing
if output:
    write_json(filename="", content=output)