Spaces:
Running
Running
File size: 8,097 Bytes
4594d15 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Merge input files\n",
"\n",
"### ✅ Prerequisites\n",
"\n",
"[Python 3.10](https://www.python.org/downloads/)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> [!CAUTION]\n",
"> Please make sure all input files are valid before trying to merge them. You can check validity of each file with a [`validate_input_file`](./validate_input_file.ipynb) notebook.\n",
"\n",
"\n",
"> [!IMPORTANT]\n",
"> Only common `tasks` across all input files and associated documents, models and evaluations are preserved in the resultant file.\n",
"\n",
"### Merge function"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import Dict, Set\n",
"import json\n",
"\n",
"\n",
"# =========================================================\n",
"# HELPER FUNCTIONS\n",
"# =========================================================\n",
"def read_json(filename: str, encoding=\"utf-8\"):\n",
"    \"\"\"Read `filename` and return its parsed JSON content.\"\"\"\n",
"    with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
"        parsed = json.load(fp)\n",
"    return parsed\n",
"\n",
"\n",
"def write_json(filename: str, content: dict, encoding=\"utf-8\"):\n",
"    \"\"\"Serialize `content` into `filename` as JSON (returns None, like json.dump).\"\"\"\n",
"    with open(filename, mode=\"w\", encoding=encoding) as fp:\n",
"        json.dump(content, fp)\n",
"\n",
"\n",
"# =========================================================\n",
"# MAIN FUNCTION\n",
"# =========================================================\n",
"def merge(inputs: list[dict]) -> dict | None:\n",
"    \"\"\"Merge multiple input JSONs into a single one.\n",
"\n",
"    Keeps only the models that have evaluations for every task, together\n",
"    with the metrics/documents/tasks of the first input.\n",
"\n",
"    Returns the merged dictionary, or None when no model has evaluations\n",
"    for all tasks (or when no evaluations exist at all).\n",
"    \"\"\"\n",
"    # Step 1: Return, if single JSON\n",
"    if len(inputs) == 1:\n",
"        return inputs[0]\n",
"\n",
"    # Step 2: When multiple input JSONs\n",
"    # Step 2.a: Initialize necessary variables\n",
"    merged_tasks: Dict[str, dict] = {}\n",
"    tasks_to_models: Dict[str, Set[str]] = {}\n",
"    evaluations: Dict[str, dict] = {}\n",
"    all_models: Dict[str, dict] = {}\n",
"    all_filters: Set[str] = set()\n",
"\n",
"    # Step 2.b: Iterate over each input JSON\n",
"    for entry in inputs:\n",
"        # Step 2.b.i: Add model to dictionary of all models, if not present already\n",
"        for model in entry[\"models\"]:\n",
"            if model[\"model_id\"] in all_models:\n",
"                if model[\"name\"] != all_models[model[\"model_id\"]][\"name\"]:\n",
"                    # BUGFIX: removed stray '$' before the f-string placeholder\n",
"                    print(\n",
"                        f\"Mismatched model information for model with id: {model['model_id']}\"\n",
"                    )\n",
"            else:\n",
"                all_models[model[\"model_id\"]] = model\n",
"\n",
"        # Step 2.b.ii: Add filters to the set of all filters\n",
"        # ('filter_name' avoids shadowing the built-in 'filter')\n",
"        for filter_name in entry.get(\"filters\") or []:\n",
"            all_filters.add(filter_name)\n",
"\n",
"        # Step 2.b.iii: Iterate over each evaluation\n",
"        for evaluation in entry[\"evaluations\"]:\n",
"            # Step 2.b.iii.*: Extend map of task IDs to model IDs based on evaluations\n",
"            tasks_to_models.setdefault(evaluation[\"task_id\"], set()).add(\n",
"                evaluation[\"model_id\"]\n",
"            )\n",
"\n",
"            # Step 2.b.iii.*: Extend evaluations map, if necessary (first one wins)\n",
"            evaluations.setdefault(\n",
"                f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\", evaluation\n",
"            )\n",
"\n",
"        # Step 2.b.iv: Create merged tasks as follows\n",
"        # 1. Merge comments for same task from different input JSONs\n",
"        # 2. Merge flagged status for same task from different input JSONs\n",
"        #    (preserve flagged=True, if any of the input JSONs has it to be 'True')\n",
"        # NOTE: tasks are stored by reference, so merging mutates the task dicts\n",
"        # of the first input that contains them; Step 5 relies on this aliasing.\n",
"        for task in entry[\"tasks\"]:\n",
"            if task[\"task_id\"] in merged_tasks:\n",
"                existing = merged_tasks[task[\"task_id\"]]\n",
"                if task.get(\"comments\"):\n",
"                    # BUGFIX: extend with the comments themselves instead of\n",
"                    # wrapping the whole list inside another list\n",
"                    existing.setdefault(\"comments\", []).extend(task[\"comments\"])\n",
"\n",
"                if \"flagged\" in task:\n",
"                    existing[\"flagged\"] = existing.get(\"flagged\", False) or task[\"flagged\"]\n",
"            else:\n",
"                merged_tasks[task[\"task_id\"]] = task\n",
"\n",
"    # Step 3: Find candidate models\n",
"    # Criterion: A group of models which has evaluations for all tasks\n",
"    if not tasks_to_models:\n",
"        # Guard: set.intersection() with no arguments raises TypeError\n",
"        print(\"No evaluations found in the input files.\")\n",
"        return None\n",
"    candidate_models = {\n",
"        model_id: all_models[model_id]\n",
"        for model_id in set.intersection(*tasks_to_models.values())\n",
"    }\n",
"\n",
"    # Step 4: Create potential filters\n",
"    # Keep only the filters that every merged task actually carries as a key\n",
"    candidate_filters = all_filters\n",
"    for task in merged_tasks.values():\n",
"        candidate_filters = candidate_filters.intersection(task.keys())\n",
"\n",
"    # Step 5: Return\n",
"    if candidate_models:\n",
"        return {\n",
"            # BUGFIX: removed stray '$' before the f-string placeholder\n",
"            \"name\": f\"Merged from {len(inputs)} files\",\n",
"            \"filters\": list(candidate_filters),\n",
"            \"models\": list(candidate_models.values()),\n",
"            \"metrics\": inputs[0][\"metrics\"],\n",
"            \"documents\": inputs[0][\"documents\"],\n",
"            \"tasks\": inputs[0][\"tasks\"],\n",
"            \"evaluations\": [\n",
"                evaluations[f\"{task['task_id']}<:SEP:>{model_id}\"]\n",
"                for task in inputs[0][\"tasks\"]\n",
"                for model_id in candidate_models\n",
"            ],\n",
"        }\n",
"    else:\n",
"        print(\"Failed to find models with evaluations for all tasks.\")\n",
"        return None\n",
"\n",
"\n",
"# =========================================================\n",
"# EXECUTE\n",
"# =========================================================\n",
"# Step 1: Load input files to be merged\n",
"input_filenames = [\n",
"    \"<PATH TO INPUT JSON 1>\",\n",
"    \"<PATH TO INPUT JSON 2>\",\n",
"]\n",
"inputs = [read_json(filename=name) for name in input_filenames]\n",
"\n",
"# Step 2: Run merging function\n",
"output = merge(inputs=inputs)\n",
"\n",
"# Step 3: Save merged output (merge returns None on failure)\n",
"if output:\n",
"    write_json(filename=\"<PATH TO MERGED FILE>\", content=output)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
|