Spaces:
Running
Running
File size: 8,097 Bytes
4594d15 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Merge input files\n",
"\n",
"### ✅ Prerequisites\n",
"\n",
"[Python 3.10](https://www.python.org/downloads/)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> [!CAUTION]\n",
"> Please make sure all input files are valid before trying to merge them. You can check validity of each file with a [`validate_input_file`](./validate_input_file.ipynb) notebook.\n",
"\n",
"\n",
"> [!IMPORTANT]\n",
"> Only common `tasks` across all input files and associated documents, models and evaluations are preserved in the resultant file.\n",
"\n",
"### Merge function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import Dict, Set\n",
"import json\n",
"\n",
"\n",
"# =========================================================\n",
"# HELPER FUNCTIONS\n",
"# =========================================================\n",
"def read_json(filename: str, encoding=\"utf-8\"):\n",
"    \"\"\"Read `filename` and return its parsed JSON content.\"\"\"\n",
"    with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
"        parsed = json.load(fp)\n",
"    return parsed\n",
"\n",
"\n",
"def write_json(filename: str, content: dict, encoding=\"utf-8\"):\n",
"    \"\"\"Serialize `content` into `filename` as JSON (returns None, like json.dump).\"\"\"\n",
"    with open(filename, mode=\"w\", encoding=encoding) as fp:\n",
"        json.dump(content, fp)\n",
"\n",
"\n",
"# =========================================================\n",
"# MAIN FUNCTION\n",
"# =========================================================\n",
"def merge(inputs: list[dict]) -> dict | None:\n",
"    \"\"\"Merge multiple input JSONs into a single one.\n",
"\n",
"    Keeps only the models that have evaluations for every task, together\n",
"    with the metrics/documents/tasks of the first input.\n",
"\n",
"    Returns the merged dictionary, or None when no model has evaluations\n",
"    for all tasks (or when no evaluations exist at all).\n",
"    \"\"\"\n",
"    # Step 1: Return, if single JSON\n",
"    if len(inputs) == 1:\n",
"        return inputs[0]\n",
"\n",
"    # Step 2: When multiple input JSONs\n",
"    # Step 2.a: Initialize necessary variables\n",
"    merged_tasks: Dict[str, dict] = {}\n",
"    tasks_to_models: Dict[str, Set[str]] = {}\n",
"    evaluations: Dict[str, dict] = {}\n",
"    all_models: Dict[str, dict] = {}\n",
"    all_filters: Set[str] = set()\n",
"\n",
"    # Step 2.b: Iterate over each input JSON\n",
"    for entry in inputs:\n",
"        # Step 2.b.i: Add model to dictionary of all models, if not present already\n",
"        for model in entry[\"models\"]:\n",
"            if model[\"model_id\"] in all_models:\n",
"                if model[\"name\"] != all_models[model[\"model_id\"]][\"name\"]:\n",
"                    # BUGFIX: removed stray '$' before the f-string placeholder\n",
"                    print(\n",
"                        f\"Mismatched model information for model with id: {model['model_id']}\"\n",
"                    )\n",
"            else:\n",
"                all_models[model[\"model_id\"]] = model\n",
"\n",
"        # Step 2.b.ii: Add filters to the set of all filters\n",
"        # ('filter_name' avoids shadowing the built-in 'filter')\n",
"        for filter_name in entry.get(\"filters\") or []:\n",
"            all_filters.add(filter_name)\n",
"\n",
"        # Step 2.b.iii: Iterate over each evaluation\n",
"        for evaluation in entry[\"evaluations\"]:\n",
"            # Step 2.b.iii.*: Extend map of task IDs to model IDs based on evaluations\n",
"            tasks_to_models.setdefault(evaluation[\"task_id\"], set()).add(\n",
"                evaluation[\"model_id\"]\n",
"            )\n",
"\n",
"            # Step 2.b.iii.*: Extend evaluations map, if necessary (first one wins)\n",
"            evaluations.setdefault(\n",
"                f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\", evaluation\n",
"            )\n",
"\n",
"        # Step 2.b.iv: Create merged tasks as follows\n",
"        # 1. Merge comments for same task from different input JSONs\n",
"        # 2. Merge flagged status for same task from different input JSONs\n",
"        #    (preserve flagged=True, if any of the input JSONs has it to be 'True')\n",
"        # NOTE: tasks are stored by reference, so merging mutates the task dicts\n",
"        # of the first input that contains them; Step 5 relies on this aliasing.\n",
"        for task in entry[\"tasks\"]:\n",
"            if task[\"task_id\"] in merged_tasks:\n",
"                existing = merged_tasks[task[\"task_id\"]]\n",
"                if task.get(\"comments\"):\n",
"                    # BUGFIX: extend with the comments themselves instead of\n",
"                    # wrapping the whole list inside another list\n",
"                    existing.setdefault(\"comments\", []).extend(task[\"comments\"])\n",
"\n",
"                if \"flagged\" in task:\n",
"                    existing[\"flagged\"] = existing.get(\"flagged\", False) or task[\"flagged\"]\n",
"            else:\n",
"                merged_tasks[task[\"task_id\"]] = task\n",
"\n",
"    # Step 3: Find candidate models\n",
"    # Criterion: A group of models which has evaluations for all tasks\n",
"    if not tasks_to_models:\n",
"        # Guard: set.intersection() with no arguments raises TypeError\n",
"        print(\"No evaluations found in the input files.\")\n",
"        return None\n",
"    candidate_models = {\n",
"        model_id: all_models[model_id]\n",
"        for model_id in set.intersection(*tasks_to_models.values())\n",
"    }\n",
"\n",
"    # Step 4: Create potential filters\n",
"    # Keep only the filters that every merged task actually carries as a key\n",
"    candidate_filters = all_filters\n",
"    for task in merged_tasks.values():\n",
"        candidate_filters = candidate_filters.intersection(task.keys())\n",
"\n",
"    # Step 5: Return\n",
"    if candidate_models:\n",
"        return {\n",
"            # BUGFIX: removed stray '$' before the f-string placeholder\n",
"            \"name\": f\"Merged from {len(inputs)} files\",\n",
"            \"filters\": list(candidate_filters),\n",
"            \"models\": list(candidate_models.values()),\n",
"            \"metrics\": inputs[0][\"metrics\"],\n",
"            \"documents\": inputs[0][\"documents\"],\n",
"            \"tasks\": inputs[0][\"tasks\"],\n",
"            \"evaluations\": [\n",
"                evaluations[f\"{task['task_id']}<:SEP:>{model_id}\"]\n",
"                for task in inputs[0][\"tasks\"]\n",
"                for model_id in candidate_models\n",
"            ],\n",
"        }\n",
"    else:\n",
"        print(\"Failed to find models with evaluations for all tasks.\")\n",
"        return None\n",
"\n",
"\n",
"# =========================================================\n",
"# EXECUTE\n",
"# =========================================================\n",
"# Step 1: Load input files to be merged\n",
"input_filenames = [\n",
"    \"<PATH TO INPUT JSON 1>\",\n",
"    \"<PATH TO INPUT JSON 2>\",\n",
"]\n",
"inputs = [read_json(filename=name) for name in input_filenames]\n",
"\n",
"# Step 2: Run merging function\n",
"output = merge(inputs=inputs)\n",
"\n",
"# Step 3: Save merged output (merge returns None on failure)\n",
"if output:\n",
"    write_json(filename=\"<PATH TO MERGED FILE>\", content=output)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
|