{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "# Merge input files\n",
    "\n",
    "### ✅ Prerequisites\n",
    "\n",
    "[Python 3.10](https://www.python.org/downloads/)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> [!CAUTION]\n",
    "> Please make sure all input files are valid before trying to merge them. You can check validity of each file with a [`validate_input_file`](./validate_input_file.ipynb) notebook.\n",
    "\n",
    "\n",
    "> [!IMPORTANT]\n",
    "> Only common `tasks` across all input files and associated documents, models and evaluations are preserved in the resultant file.\n",
    "\n",
    "### Merge function"
   ]
  },
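  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The function below expects each input JSON to look roughly like the sketch here. This is only inferred from the merge logic (the code reads `model_id`, `name`, `task_id`, `comments`, `flagged`, and the filter fields directly); all other names and values are placeholders.\n",
    "\n",
    "```python\n",
    "{\n",
    "    \"filters\": [\"category\"],  # names of task fields usable as filters\n",
    "    \"models\": [{\"model_id\": \"model_1\", \"name\": \"Model 1\"}],\n",
    "    \"metrics\": [],  # copied as-is from the first input file\n",
    "    \"documents\": [],  # copied as-is from the first input file\n",
    "    \"tasks\": [\n",
    "        {\"task_id\": \"task_1\", \"category\": \"qa\", \"comments\": [], \"flagged\": False},\n",
    "    ],\n",
    "    \"evaluations\": [\n",
    "        {\"task_id\": \"task_1\", \"model_id\": \"model_1\"},  # plus per-metric scores\n",
    "    ],\n",
    "}\n",
    "```"
   ]
  },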
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Dict, Set\n",
    "import json\n",
    "\n",
    "\n",
    "# =========================================================\n",
    "#                   HELPER FUNCTIONS\n",
    "# =========================================================\n",
    "def read_json(filename: str, encoding=\"utf-8\"):\n",
    "    with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
    "        return json.load(fp)\n",
    "\n",
    "\n",
    "def write_json(filename: str, content: dict, encoding=\"utf-8\"):\n",
    "    with open(filename, mode=\"w\", encoding=encoding) as fp:\n",
    "        return json.dump(content, fp)\n",
    "\n",
    "\n",
    "# =========================================================\n",
    "#                   MAIN FUNCTION\n",
    "# =========================================================\n",
    "def merge(inputs: list[dict]) -> dict:\n",
    "    # Step 1: Return, if single JSON\n",
    "    if len(inputs) == 1:\n",
    "        return inputs[0]\n",
    "\n",
    "    # Step 2: When multiple input JSONs\n",
    "    # Step 2.a: Initialize necessary variables\n",
    "    merged_tasks: Dict[str, dict] = {}\n",
    "    tasks_to_models: Dict[str, Set[str]] = {}\n",
    "    evaluations: Dict[str, dict] = {}\n",
    "    all_models = {}\n",
    "    all_filters = set()\n",
    "\n",
    "    # Step 2.b: Iterate over each input JSON\n",
    "    for entry in inputs:\n",
    "        # Step 2.b.i: Add model to dictionary of all models, if not present already\n",
    "        for model in entry[\"models\"]:\n",
    "            if model[\"model_id\"] in all_models:\n",
    "                if model[\"name\"] != all_models[model[\"model_id\"]][\"name\"]:\n",
    "                    print(\n",
    "                        f\"Mismatched model information for model with id: ${model['model_id']}\"\n",
    "                    )\n",
    "            else:\n",
    "                all_models[model[\"model_id\"]] = model\n",
    "\n",
    "        # Step 2.b.ii: Add filters to set of all filter\n",
    "        if \"filters\" in entry and entry[\"filters\"]:\n",
    "            for filter in entry[\"filters\"]:\n",
    "                all_filters.add(filter)\n",
    "\n",
    "        # Step 2.b.iii: Iterate over each evaluation\n",
    "        for evaluation in entry[\"evaluations\"]:\n",
    "            # Step 2.b.iii.*: Extend map of task IDs to model IDs based on evaluations\n",
    "            try:\n",
    "                tasks_to_models[evaluation[\"task_id\"]].add(evaluation[\"model_id\"])\n",
    "            except KeyError:\n",
    "                tasks_to_models[evaluation[\"task_id\"]] = set([evaluation[\"model_id\"]])\n",
    "\n",
    "            # Step 2.b.iii.*: Extend evaluations map, if necessary\n",
    "            if (\n",
    "                f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\"\n",
    "                not in evaluations\n",
    "            ):\n",
    "                evaluations[\n",
    "                    f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\"\n",
    "                ] = evaluation\n",
    "\n",
    "        # Step 2.b.iv: Create merged tasks as follows\n",
    "        # 1. Merge comments for same task from different input JSONs\n",
    "        # 2. Merge flagged status for same task from different input JSONs (preserved flagged=True, if any of the input JSONs has it to be 'True')\n",
    "        for task in entry[\"tasks\"]:\n",
    "            if task[\"task_id\"] in merged_tasks:\n",
    "                if \"comments\" in task and task[\"comments\"]:\n",
    "                    try:\n",
    "                        merged_tasks[task[\"task_id\"]][\"comments\"].extend(\n",
    "                            task[\"comments\"]\n",
    "                        )\n",
    "                    except KeyError:\n",
    "                        merged_tasks[task[\"task_id\"]][\"comments\"] = [task[\"comments\"]]\n",
    "\n",
    "                if \"flagged\" in task:\n",
    "                    try:\n",
    "                        merged_tasks[task[\"task_id\"]][\"flagged\"] = (\n",
    "                            merged_tasks[task[\"task_id\"]][\"flagged\"] or task[\"flagged\"]\n",
    "                        )\n",
    "                    except KeyError:\n",
    "                        merged_tasks[task[\"task_id\"]][\"flagged\"] = task[\"flagged\"]\n",
    "            else:\n",
    "                merged_tasks[task[\"task_id\"]] = task\n",
    "\n",
    "    # Step 3: Find candidate models\n",
    "    # Criterion: A group of models which has evaluations for all tasks\n",
    "    candidate_models = {\n",
    "        model_id: all_models[model_id]\n",
    "        for model_id in set.intersection(*list(tasks_to_models.values()))\n",
    "    }\n",
    "\n",
    "    # Step 4: Create potential filters\n",
    "    candidate_filters = all_filters\n",
    "    for task in merged_tasks.values():\n",
    "        candidate_filters  = candidate_filters.intersection(task.keys())\n",
    "\n",
    "    # Step 4: Return\n",
    "    if candidate_models:\n",
    "        return {\n",
    "            \"name\": f\"Merged from ${len(inputs)} files\",\n",
    "            \"filters\": list(candidate_filters),\n",
    "            \"models\": list(candidate_models.values()),\n",
    "            \"metrics\": inputs[0][\"metrics\"],\n",
    "            \"documents\": inputs[0][\"documents\"],\n",
    "            \"tasks\": inputs[0][\"tasks\"],\n",
    "            \"evaluations\": [\n",
    "                evaluations[f\"{task['task_id']}<:SEP:>{model_id}\"]\n",
    "                for task in inputs[0][\"tasks\"]\n",
    "                for model_id in candidate_models\n",
    "            ],\n",
    "        }\n",
    "    else:\n",
    "        print(\"Failed to find models with evaluations for all tasks.\")\n",
    "        return None\n",
    "\n",
    "\n",
    "# =========================================================\n",
    "#                   EXECUTE\n",
    "# =========================================================\n",
    "# Step 1: Load input files to be merged\n",
    "inputs = [\n",
    "    read_json(\n",
    "        filename=\"<PATH TO INPUT JSON 1>\"\n",
    "    ),\n",
    "    read_json(\n",
    "        filename=\"<PATH TO INPUT JSON 2>\"\n",
    "    ),\n",
    "]\n",
    "\n",
    "# Step 2: Run merging function\n",
    "output = merge(inputs=inputs)\n",
    "\n",
    "# Step 3: Save merged output\n",
    "if output:\n",
    "    write_json(\n",
    "        filename=\"<PATH TO MERGED FILE>\",\n",
    "        content=output,\n",
    "    )"
   ]
  },
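  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Example with in-memory inputs\n",
    "\n",
    "A minimal sketch that exercises `merge` on two small hypothetical inputs instead of files. All identifiers and field values here are placeholders; real input files carry richer task, document, metric, and evaluation data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two tiny hypothetical inputs: both contain task \"t1\", evaluated by model \"m1\".\n",
    "example_inputs = [\n",
    "    {\n",
    "        \"filters\": [\"category\"],\n",
    "        \"models\": [{\"model_id\": \"m1\", \"name\": \"Model 1\"}],\n",
    "        \"metrics\": [],\n",
    "        \"documents\": [],\n",
    "        \"tasks\": [\n",
    "            {\"task_id\": \"t1\", \"category\": \"qa\", \"comments\": [\"ok\"], \"flagged\": False}\n",
    "        ],\n",
    "        \"evaluations\": [{\"task_id\": \"t1\", \"model_id\": \"m1\"}],\n",
    "    },\n",
    "    {\n",
    "        \"filters\": [\"category\"],\n",
    "        \"models\": [{\"model_id\": \"m1\", \"name\": \"Model 1\"}],\n",
    "        \"metrics\": [],\n",
    "        \"documents\": [],\n",
    "        \"tasks\": [\n",
    "            {\"task_id\": \"t1\", \"category\": \"qa\", \"comments\": [\"needs review\"], \"flagged\": True}\n",
    "        ],\n",
    "        \"evaluations\": [{\"task_id\": \"t1\", \"model_id\": \"m1\"}],\n",
    "    },\n",
    "]\n",
    "\n",
    "# Comments are concatenated and flagged=True wins; \"m1\" is the only model with\n",
    "# evaluations for every task, so it is the single surviving model.\n",
    "example_output = merge(inputs=example_inputs)\n",
    "print(json.dumps(example_output, indent=2))"
   ]
  }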
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}