Spaces:

ailab-bio
/

PROTAC-Degradation-Predictor

Sleeping

App Files Files Community

ribesstefano committed on Aug 19, 2024

Commit

2ca0a98

1 Parent(s): 9f58169

Added updated version of PROTAC-DB and starting applying data curation to it

Browse files

Files changed (2) hide show

data/PROTAC-DB-v2.csv +0 -0
notebooks/data_curation_v2.ipynb +1020 -0

data/PROTAC-DB-v2.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/data_curation_v2.ipynb ADDED Viewed

	@@ -0,0 +1,1020 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# from IPython.display import display_html\n",
+    "\n",
+    "import logging\n",
+    "import warnings\n",
+    "import re\n",
+    "import os\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pickle\n",
+    "import pickle\n",
+    "import requests\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from rdkit import Chem\n",
+    "from rdkit.Chem import AllChem\n",
+    "from typing import Literal, Union, List, Dict, Any, Callable\n",
+    "from collections import defaultdict\n",
+    "from tqdm.auto import tqdm\n",
+    "from rdkit import RDLogger\n",
+    "\n",
+    "RDLogger.DisableLog('rdApp.*')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_global_logging_level(level=logging.ERROR, prefices=[\"\"]):\n",
+    "    \"\"\"\n",
+    "    Override logging levels of different modules based on their name as a prefix.\n",
+    "    It needs to be invoked after the modules have been loaded so that their loggers have been initialized.\n",
+    "\n",
+    "    Args:\n",
+    "        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR\n",
+    "        - prefices: list of one or more str prefices to match (e.g. [\"transformers\", \"torch\"]). Optional.\n",
+    "          Default is `[\"\"]` to match all active loggers.\n",
+    "          The match is a case-sensitive `module_name.startswith(prefix)`\n",
+    "    \"\"\"\n",
+    "    prefix_re = re.compile(fr'^(?:{ \"|\".join(prefices) })')\n",
+    "    for name in logging.root.manager.loggerDict:\n",
+    "        if re.match(prefix_re, name):\n",
+    "            logging.getLogger(name).setLevel(level)\n",
+    "\n",
+    "\n",
+    "# Silence all Python warnings (the blanket 'ignore' below already covers the two message-specific filters)\n",
+    "warnings.filterwarnings('ignore')\n",
+    "warnings.filterwarnings(\n",
+    "    'ignore', '.*Covariance of the parameters could not be estimated.*')\n",
+    "warnings.filterwarnings(\n",
+    "    'ignore', '.*You seem to be using the pipelines sequentially on GPU.*')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Data directory, resolved relative to the current working directory.\n",
+    "# Alternative kept for reference: os.path.join(os.getcwd(), '..', 'data')\n",
+    "data_dir = os.path.join(os.getcwd(), 'data')\n",
+    "dirs_to_make = [\n",
+    "    data_dir,\n",
+    "    # os.path.join(data_dir, 'raw'),\n",
+    "    # os.path.join(data_dir, 'processed'),\n",
+    "]\n",
+    "for d in dirs_to_make:\n",
+    "    # exist_ok=True replaces the separate os.path.exists check and its check-then-create race\n",
+    "    os.makedirs(d, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded protac.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
+    "protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
+    "\n",
+    "protacdb_file = os.path.join(data_dir, 'PROTAC-DB-v2.csv')\n",
+    "protac_v2_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
+    "\n",
+    "print(f'Loaded protac.csv')\n",
+    "\n",
+    "old2new = {\n",
+    "    'E3 ligase': 'E3 Ligase',\n",
+    "}\n",
+    "protac_df = protac_df.rename(columns=old2new)\n",
+    "protac_v2_df = protac_v2_df.rename(columns=old2new)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(9380, 5388)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(protac_v2_df), len(protac_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "PROTAC-DB\n",
+      "Number of rows with all 3: 344\n",
+      "Number of rows with Assay: 1008\n",
+      "Number of rows with both DC50 and Dmax: 344\n",
+      "Number of rows with DC50: 905\n",
+      "Number of rows with Dmax: 726\n",
+      "Number of rows with Percent degradation: 362\n",
+      "\n",
+      "PROTAC-DB-v2\n",
+      "Number of rows with all 3: 909\n",
+      "Number of rows with Assay: 1892\n",
+      "Number of rows with both DC50 and Dmax: 909\n",
+      "Number of rows with DC50: 1762\n",
+      "Number of rows with Dmax: 1317\n",
+      "Number of rows with Percent degradation: 1422\n"
+     ]
+    }
+   ],
+   "source": [
+    "def print_dmax_dc_info(df):\n",
+    "    num_all_notna = len(df.dropna(subset=['Assay (DC50/Dmax)', 'DC50 (nM)', 'Dmax (%)']).dropna(how='all').drop_duplicates())\n",
+    "    num_assay_notna = len(df.dropna(subset=['Assay (DC50/Dmax)']).dropna(how='all').drop_duplicates())\n",
+    "    num_both_notna = len(df.dropna(subset=['DC50 (nM)', 'Dmax (%)']).dropna(how='all').drop_duplicates())\n",
+    "    num_dmax_notna = len(df.dropna(subset=['Dmax (%)']).dropna(how='all').drop_duplicates())\n",
+    "    num_dc50_notna = len(df.dropna(subset=['DC50 (nM)']).dropna(how='all').drop_duplicates())\n",
+    "    num_degr_notna = len(df.dropna(subset=['Percent degradation (%)']).dropna(how='all').drop_duplicates())\n",
+    "    print(f'Number of rows with all 3: {num_all_notna}')\n",
+    "    print(f'Number of rows with Assay: {num_assay_notna}')\n",
+    "    print(f'Number of rows with both DC50 and Dmax: {num_both_notna}')\n",
+    "    print(f'Number of rows with DC50: {num_dc50_notna}')\n",
+    "    print(f'Number of rows with Dmax: {num_dmax_notna}')\n",
+    "    print(f'Number of rows with Percent degradation: {num_degr_notna}')\n",
+    "\n",
+    "print('PROTAC-DB')\n",
+    "print_dmax_dc_info(protac_df)\n",
+    "print('')\n",
+    "print('PROTAC-DB-v2')\n",
+    "print_dmax_dc_info(protac_v2_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[-100.0, -5.0, nan, 90.317, 1000.0, nan]\n",
+      "[0.0]\n",
+      "[96.0, 73.0]\n",
+      "[1.0, 3.14]\n"
+     ]
+    }
+   ],
+   "source": [
+    "def clean_string(s: str) -> str:\n",
+    "    \"\"\" Clean a string by removing <, >, =, NaN, and ranges like 100-200.\n",
+    "    Args:\n",
+    "        s(str): string to clean\n",
+    "    Returns:\n",
+    "        str: cleaned string\n",
+    "    \"\"\"\n",
+    "    if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:\n",
+    "        return np.nan\n",
+    "    if 'N.D.' in s:\n",
+    "        return '0'\n",
+    "    s = s.strip('(WB)').strip()\n",
+    "    # # Combine regex operations for efficiency\n",
+    "    # s = re.sub(r'[<=>]|NaN|[\\d]+[-~]', '', s)  # Remove <, >, =, NaN, and ranges like 100-200\n",
+    "    # Remove <, >, =, NaN\n",
+    "    s = re.sub(r'[<=>]|NaN', '', s)\n",
+    "    # Replace ranges like 100-200 or 1~3 with the left-most value in the range\n",
+    "    s = re.sub(r'\\b(\\d+)[-~]\\d+\\b', r'\\1', s)\n",
+    "    # Replace (n/a) with nan\n",
+    "    s = s.replace('(n/a)', 'nan')\n",
+    "    s = re.sub(r'[~<=>% ]', '', s)  # Remove ~, <, >, =, % and spaces\n",
+    "    return s\n",
+    "\n",
+    "\n",
+    "def split_clean_str(s: str, return_floats: bool = False) -> Union[List[str], List[float]]:\n",
+    "    \"\"\" Break a '/'-separated string into its parts and clean every part.\n",
+    "    Args:\n",
+    "        s(str): string to split\n",
+    "        return_floats(bool): when True each cleaned part is converted with float()\n",
+    "    Returns:\n",
+    "        list: cleaned parts (strings or floats); np.nan instead of a list when the input is missing (null, 'nan', 'n/a', 'NaN' or '')\n",
+    "    \"\"\"\n",
+    "    is_missing = pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}\n",
+    "    if is_missing:\n",
+    "        return np.nan\n",
+    "    # '(n/a)' contains the separator itself, so it is neutralised before splitting\n",
+    "    pieces = s.replace('(n/a)', 'nan').split('/')\n",
+    "    result = []\n",
+    "    for piece in pieces:\n",
+    "        cleaned = clean_string(piece.strip())\n",
+    "        result.append(float(cleaned) if return_floats else cleaned)\n",
+    "    return result\n",
+    "\n",
+    "\n",
+    "print(split_clean_str('-100-200/-5/(n/a)/<=90.317/>1000/NaN', return_floats=True))\n",
+    "print(split_clean_str('N.D.', return_floats=True))\n",
+    "print(split_clean_str('96/73 (WB)', return_floats=True))\n",
+    "print(split_clean_str('1.0~3/3.14', return_floats=True))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "813\n",
+      "848\n"
+     ]
+    }
+   ],
+   "source": [
+    "def get_assay_texts(df: pd.DataFrame, assay_column: str) -> List[str]:\n",
+    "    tmp = df[assay_column].dropna()\n",
+    "    if tmp.empty:\n",
+    "        return []\n",
+    "    return tmp.unique().tolist()\n",
+    "\n",
+    "\n",
+    "def clean_assay_text(assay):\n",
+    "    tmp = assay.replace('/', ' and ')\n",
+    "    tmp = tmp.replace('BRD4 BD1 and 2', 'BRD4 BD1 and BRD4 BD2')\n",
+    "    tmp = tmp.replace('(Ba and F3 WT)', '(Ba/F3 WT)')\n",
+    "    tmp = tmp.replace('(EGFR L858R and T790M)', '(EGFR L858R/T790M)')\n",
+    "    return tmp\n",
+    "\n",
+    "\n",
+    "assays = {}\n",
+    "for c in protac_df.columns:\n",
+    "    if 'Assay' in c:\n",
+    "        assays[c] = get_assay_texts(protac_df, c)\n",
+    "texts = list(set([x for y in assays.values() for x in y]))\n",
+    "print(len(texts))\n",
+    "print(sum([len(x) for x in assays.values()]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_dc50_info(sentence):\n",
+    "    # Regex patterns for proteins/genes, cell types, and treatment hours\n",
+    "    protein_regex = r\"Degradation of total\\s(.+?)\\s(in|after|using|proteins)\"\n",
+    "    cell_regex = r\"in\\s([A-Za-z0-9-/.;\\(\\)\\s\\+]+)\\scells\"\n",
+    "    treatment_regex = r\"after\\s(\\d+/?\\d*?/?\\d*?\\s?h)\"\n",
+    "\n",
+    "    # Extracting protein information\n",
+    "    if 'total' in sentence.lower():\n",
+    "        protein_match = re.search(protein_regex, sentence)\n",
+    "        proteins = protein_match.group(1).split(' and ') if protein_match else [\n",
+    "            re.search(r\"Degradation of\\s([A-Za-z0-9-]+)\", sentence).group(1)]\n",
+    "    else:\n",
+    "        if ' in ' in sentence.lower():\n",
+    "            proteins = sentence.split(' in ')[0].split('Degradation of ')[-1]\n",
+    "            proteins = proteins.split('/') if '/' in proteins else [proteins]\n",
+    "        else:\n",
+    "            protein_match = re.search(protein_regex, sentence)\n",
+    "            proteins = protein_match.group(1).split(\n",
+    "                '/') if protein_match else [re.search(r\"Degradation of\\s([A-Za-z0-9-\\/]+)\", sentence).group(1)]\n",
+    "    # Handle special cases...\n",
+    "    if 'BRD4 short/long' in sentence:\n",
+    "        proteins = ['BRD4 short', 'BRD4 long']\n",
+    "    if 'BRD4 BD1/2' in sentence:\n",
+    "        proteins = ['BRD4 BD1', 'BRD4 BD2']\n",
+    "    elif 'BRD4 BD1' in sentence:\n",
+    "        proteins = ['BRD4 BD1']\n",
+    "    if 'EGFR L858R/T790M' in sentence:\n",
+    "        proteins = ['EGFR L858R/T790M']\n",
+    "    if 'EGFR del19/T790M/C797S' in sentence:\n",
+    "        proteins = ['EGFR del19/T790M/C797S']\n",
+    "\n",
+    "    # Extracting cell types\n",
+    "    cell_match = re.search(cell_regex, sentence)\n",
+    "    cells = cell_match.group(1).split('/') if cell_match else np.nan\n",
+    "    # Handle special cases...\n",
+    "    if 'Ba/F3' in sentence:\n",
+    "        # Replace any occurences that contain 'Ba' or 'F3' with 'Ba/F3' and remove duplicates while preserving the order in the other cells\n",
+    "        cells = ['Ba/F3' if 'Ba' in c or 'F3' in c else c for c in cells]\n",
+    "        cells.pop(cells.index('Ba/F3'))\n",
+    "    if 'ER-positive breast cancer cell lines' in sentence:\n",
+    "        cells = ['ER-positive breast cancer cell lines']\n",
+    "    if 'LNCaP (AR T878A)' in sentence:\n",
+    "        cells = ['LNCaP']\n",
+    "    if 'in A152T neurons' in sentence:\n",
+    "        cells = ['A152T neurons']\n",
+    "    if 'of Rpn13 in MM.1S after' in sentence:\n",
+    "        cells = ['MM.1S']\n",
+    "    if 'Primary Cardiomyocytes' in sentence:\n",
+    "        cells = ['Primary Cardiomyocytes']\n",
+    "    if ' HDAC6 in MM1S after' in sentence:\n",
+    "        cells = ['MM.1S']\n",
+    "\n",
+    "    # Extracting treatment hours\n",
+    "    treatment_hours_match = re.search(treatment_regex, sentence)\n",
+    "    if treatment_hours_match:\n",
+    "        treatment_hours = treatment_hours_match.group(1).strip('h')\n",
+    "        treatment_hours = split_clean_str(treatment_hours, return_floats=True)\n",
+    "    else:\n",
+    "        treatment_hours = np.nan\n",
+    "\n",
+    "    return {\n",
+    "        'Target (Parsed)': proteins,\n",
+    "        'Cell Type': cells,\n",
+    "        'Treatment Time (h)': treatment_hours,\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "corner_cases = [\n",
+    "    # 'Degradation of BRD4',\n",
+    "    # 'Degradation of BRD4 short/long in HeLa cells after 24 h treatment',\n",
+    "    # 'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',\n",
+    "    # 'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',\n",
+    "    # 'Degradation of WT/Exon 20 Ins EGFR in OVCAR8/HeLa cells after 24 h treatment',\n",
+    "    # 'Degradation of TPM3-TRKA/TRKA in KM12/HEL cells after 6 h treatment',\n",
+    "    # 'Degradation of Exon 19 del/L858R EGFR in HCC827/H3255 cells after 24 h treatment',\n",
+    "    # 'Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NCI-H2228 cells after 16 h treatment',\n",
+    "    # 'Degradation of BCR-ABL T315I in Ba/F3 cells after 24 h treatment',\n",
+    "    # 'Degradation of BCR-ABL T315I in MOL/(Ba/F3)/R4;11 cells after 24 h treatment',\n",
+    "    # 'Degradation of ALK in H3122/Karpas 299/Kelly cells 16 h treatment',\n",
+    "    'Degradation of AR in LNCaP/VCaP AR+ cells after 6 h treatment',\n",
+    "    'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',\n",
+    "    'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',\n",
+    "    'Degradation of PARP1 in Primary Cardiomyocytes after 24 h treatment',\n",
+    "    'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',\n",
+    "    'Degradation of total tau/P-tau in A152T neurons after 24 h treatment',\n",
+    "    'Degradation of Rpn13 in MM.1S after 16 h treatment',\n",
+    "    'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',\n",
+    "]\n",
+    "\n",
+    "# for assay in assays[\"Assay (DC50/Dmax)\"][-5:] + corner_cases:\n",
+    "#     if len(assay) < 5:\n",
+    "#         continue\n",
+    "#     print(assay)\n",
+    "#     extracted_info = extract_dc50_info(assay)\n",
+    "#     proteins, cells, treatment_hours = extracted_info[\n",
+    "#         'Target (Parsed)'], extracted_info['Cell Type'], extracted_info['Treatment Time (h)']\n",
+    "#     print(proteins, \"|\", cells, \"|\", treatment_hours)\n",
+    "#     print('-' * 80)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_dc50_dmax_df(df):\n",
+    "    param_cols = ['DC50 (nM)', 'Dmax (%)']\n",
+    "    dc50_dmax_df = df.dropna(subset=param_cols + [\"Assay (DC50/Dmax)\"], how='all')\n",
+    "    dc50_dmax_df = dc50_dmax_df[dc50_dmax_df[\"Assay (DC50/Dmax)\"].notnull()]\n",
+    "    return dc50_dmax_df.drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The 'Dmax (%)' column in PROTAC-DB-v2 has two entries which are _dates_ (you never stop surprising me, PROTAC-DB). Convert them to NaNs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# If any entry in the 'Dmax (%)' column contains the character ':', then it is a\n",
+    "# date and it needs to be set to NaN\n",
+    "def clean_dmax(df):\n",
+    "    df['Dmax (%)'] = df['Dmax (%)'].apply(lambda x: np.nan if ':' in str(x) else x)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c889fc12d4a040a78fbfdc506696ea9f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting DC50/Dmax info:   0%|          | 0/1008 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Compound ID</th>\n",
+       "      <th>Uniprot</th>\n",
+       "      <th>Target</th>\n",
+       "      <th>E3 Ligase</th>\n",
+       "      <th>PDB</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Smiles</th>\n",
+       "      <th>DC50 (nM)</th>\n",
+       "      <th>Dmax (%)</th>\n",
+       "      <th>Assay (DC50/Dmax)</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Hydrogen Bond Acceptor Count</th>\n",
+       "      <th>Hydrogen Bond Donor Count</th>\n",
+       "      <th>Rotatable Bond Count</th>\n",
+       "      <th>Topological Polar Surface Area</th>\n",
+       "      <th>Molecular Formula</th>\n",
+       "      <th>InChI</th>\n",
+       "      <th>InChI Key</th>\n",
+       "      <th>Target (Parsed)</th>\n",
+       "      <th>Cell Type</th>\n",
+       "      <th>Treatment Time (h)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>11</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>560.00</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>Degradation of BRD9 in HeLa cells after 4 h tr...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>16</td>\n",
+       "      <td>3</td>\n",
+       "      <td>22</td>\n",
+       "      <td>199.15</td>\n",
+       "      <td>C54H69FN8O10S</td>\n",
+       "      <td>InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...</td>\n",
+       "      <td>MXAKQOVZPDLCDK-UDVNCTHFSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>HeLa</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>1.76</td>\n",
+       "      <td>95.0</td>\n",
+       "      <td>Degradation of BRD9 in RI-1 cells after 8 h tr...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>RI-1</td>\n",
+       "      <td>8.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>4.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Degradation of HiBiT-BRD9 in HEK293 cells afte...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>HiBiT-BRD9</td>\n",
+       "      <td>HEK293</td>\n",
+       "      <td>24.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>2.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>EOL-1</td>\n",
+       "      <td>18.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>8.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>A-204</td>\n",
+       "      <td>18.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 92 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Compound ID Uniprot Target E3 Ligase  PDB   Name  \\\n",
+       "0           11  Q9H8M2   BRD9       VHL  NaN    NaN   \n",
+       "1           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "2           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "3           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "4           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "\n",
+       "                                              Smiles  DC50 (nM)  Dmax (%)  \\\n",
+       "0  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...     560.00      80.0   \n",
+       "1  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       1.76      95.0   \n",
+       "2  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       4.00       NaN   \n",
+       "3  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       2.00       NaN   \n",
+       "4  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       8.00       NaN   \n",
+       "\n",
+       "                                   Assay (DC50/Dmax)  ...  \\\n",
+       "0  Degradation of BRD9 in HeLa cells after 4 h tr...  ...   \n",
+       "1  Degradation of BRD9 in RI-1 cells after 8 h tr...  ...   \n",
+       "2  Degradation of HiBiT-BRD9 in HEK293 cells afte...  ...   \n",
+       "3  Degradation of BRD9 in EOL-1/A-204 cells after...  ...   \n",
+       "4  Degradation of BRD9 in EOL-1/A-204 cells after...  ...   \n",
+       "\n",
+       "  Hydrogen Bond Acceptor Count Hydrogen Bond Donor Count Rotatable Bond Count  \\\n",
+       "0                           16                         3                   22   \n",
+       "1                           14                         3                   19   \n",
+       "2                           14                         3                   19   \n",
+       "3                           14                         3                   19   \n",
+       "4                           14                         3                   19   \n",
+       "\n",
+       "  Topological Polar Surface Area Molecular Formula  \\\n",
+       "0                         199.15     C54H69FN8O10S   \n",
+       "1                         180.69      C53H67FN8O8S   \n",
+       "2                         180.69      C53H67FN8O8S   \n",
+       "3                         180.69      C53H67FN8O8S   \n",
+       "4                         180.69      C53H67FN8O8S   \n",
+       "\n",
+       "                                               InChI  \\\n",
+       "0  InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...   \n",
+       "1  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "2  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "3  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "4  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "\n",
+       "                     InChI Key Target (Parsed) Cell Type Treatment Time (h)  \n",
+       "0  MXAKQOVZPDLCDK-UDVNCTHFSA-N            BRD9      HeLa                4.0  \n",
+       "1  ZAGCLFXBHOXXEN-JPTLTNPLSA-N            BRD9      RI-1                8.0  \n",
+       "2  ZAGCLFXBHOXXEN-JPTLTNPLSA-N      HiBiT-BRD9    HEK293               24.0  \n",
+       "3  ZAGCLFXBHOXXEN-JPTLTNPLSA-N            BRD9     EOL-1               18.0  \n",
+       "4  ZAGCLFXBHOXXEN-JPTLTNPLSA-N            BRD9     A-204               18.0  \n",
+       "\n",
+       "[5 rows x 92 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parsed table len: 1205\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a788394f66594587b03025bd8f3d9c51",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting DC50/Dmax info:   0%|          | 0/1892 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Compound ID</th>\n",
+       "      <th>Uniprot</th>\n",
+       "      <th>Target</th>\n",
+       "      <th>E3 Ligase</th>\n",
+       "      <th>PDB</th>\n",
+       "      <th>Name</th>\n",
+       "      <th>Smiles</th>\n",
+       "      <th>DC50 (nM)</th>\n",
+       "      <th>Dmax (%)</th>\n",
+       "      <th>Assay (DC50/Dmax)</th>\n",
+       "      <th>...</th>\n",
+       "      <th>Hydrogen Bond Acceptor Count</th>\n",
+       "      <th>Hydrogen Bond Donor Count</th>\n",
+       "      <th>Rotatable Bond Count</th>\n",
+       "      <th>Topological Polar Surface Area</th>\n",
+       "      <th>Molecular Formula</th>\n",
+       "      <th>InChI</th>\n",
+       "      <th>InChI Key</th>\n",
+       "      <th>Target (Parsed)</th>\n",
+       "      <th>Cell Type</th>\n",
+       "      <th>Treatment Time (h)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>11</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>560.00</td>\n",
+       "      <td>80.0</td>\n",
+       "      <td>Degradation of BRD9 in HeLa cells after 4 h tr...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>16</td>\n",
+       "      <td>3</td>\n",
+       "      <td>22</td>\n",
+       "      <td>199.15</td>\n",
+       "      <td>C54H69FN8O10S</td>\n",
+       "      <td>InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...</td>\n",
+       "      <td>MXAKQOVZPDLCDK-UDVNCTHFSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>HeLa</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>1.76</td>\n",
+       "      <td>95.0</td>\n",
+       "      <td>Degradation of BRD9 in RI-1 cells after 8 h tr...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>RI-1</td>\n",
+       "      <td>8.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>4.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Degradation of HiBiT-BRD9 in HEK293 cells afte...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>HiBiT-BRD9</td>\n",
+       "      <td>HEK293</td>\n",
+       "      <td>24.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>2.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>EOL-1</td>\n",
+       "      <td>18.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Q9H8M2</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>VHL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>VZ185</td>\n",
+       "      <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
+       "      <td>8.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>14</td>\n",
+       "      <td>3</td>\n",
+       "      <td>19</td>\n",
+       "      <td>180.69</td>\n",
+       "      <td>C53H67FN8O8S</td>\n",
+       "      <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
+       "      <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
+       "      <td>BRD9</td>\n",
+       "      <td>A-204</td>\n",
+       "      <td>18.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 92 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Compound ID Uniprot Target E3 Ligase  PDB   Name  \\\n",
+       "0           11  Q9H8M2   BRD9       VHL  NaN    NaN   \n",
+       "1           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "2           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "3           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "4           22  Q9H8M2   BRD9       VHL  NaN  VZ185   \n",
+       "\n",
+       "                                              Smiles  DC50 (nM)  Dmax (%)  \\\n",
+       "0  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...     560.00      80.0   \n",
+       "1  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       1.76      95.0   \n",
+       "2  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       4.00       NaN   \n",
+       "3  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       2.00       NaN   \n",
+       "4  COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...       8.00       NaN   \n",
+       "\n",
+       "                                   Assay (DC50/Dmax)  ...  \\\n",
+       "0  Degradation of BRD9 in HeLa cells after 4 h tr...  ...   \n",
+       "1  Degradation of BRD9 in RI-1 cells after 8 h tr...  ...   \n",
+       "2  Degradation of HiBiT-BRD9 in HEK293 cells afte...  ...   \n",
+       "3  Degradation of BRD9 in EOL-1/A-204 cells after...  ...   \n",
+       "4  Degradation of BRD9 in EOL-1/A-204 cells after...  ...   \n",
+       "\n",
+       "  Hydrogen Bond Acceptor Count Hydrogen Bond Donor Count Rotatable Bond Count  \\\n",
+       "0                           16                         3                   22   \n",
+       "1                           14                         3                   19   \n",
+       "2                           14                         3                   19   \n",
+       "3                           14                         3                   19   \n",
+       "4                           14                         3                   19   \n",
+       "\n",
+       "  Topological Polar Surface Area Molecular Formula  \\\n",
+       "0                         199.15     C54H69FN8O10S   \n",
+       "1                         180.69      C53H67FN8O8S   \n",
+       "2                         180.69      C53H67FN8O8S   \n",
+       "3                         180.69      C53H67FN8O8S   \n",
+       "4                         180.69      C53H67FN8O8S   \n",
+       "\n",
+       "                                               InChI  \\\n",
+       "0  InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...   \n",
+       "1  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "2  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "3  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "4  InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...   \n",
+       "\n",
+       "                     InChI Key Target (Parsed) Cell Type Treatment Time (h)  \n",
+       "0  MXAKQOVZPDLCDK-UDVNCTHFSA-N            BRD9      HeLa                4.0  \n",
+       "1  ZAGCLFXBHOXXEN-JPTLTNPLSA-N            BRD9      RI-1                8.0  \n",
+       "2  ZAGCLFXBHOXXEN-JPTLTNPLSA-N      HiBiT-BRD9    HEK293               24.0  \n",
+       "3  ZAGCLFXBHOXXEN-JPTLTNPLSA-N            BRD9     EOL-1               18.0  \n",
+       "4  ZAGCLFXBHOXXEN-JPTLTNPLSA-N            BRD9     A-204               18.0  \n",
+       "\n",
+       "[5 rows x 92 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parsed table len: 2264\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build one parsed table per database. Each curated row is expanded into one\n",
+    "# record per element of the list-valued fields extracted from its assay text.\n",
+    "dfs = {}\n",
+    "\n",
+    "for name, df in [('protac-db', protac_df), ('protac-db-v2', protac_v2_df)]:\n",
+    "    dc50_dmax_df = get_dc50_dmax_df(clean_dmax(df))\n",
+    "\n",
+    "    records = []\n",
+    "    for _, row in tqdm(dc50_dmax_df.iterrows(), total=len(dc50_dmax_df), desc='Extracting DC50/Dmax info'):\n",
+    "        assay = row[\"Assay (DC50/Dmax)\"]\n",
+    "        # Skip missing (non-string, e.g. NaN) or too-short assay descriptions:\n",
+    "        # calling len() on a float NaN would raise a TypeError.\n",
+    "        if not isinstance(assay, str) or len(assay) < 5:\n",
+    "            continue\n",
+    "        extracted_info = extract_dc50_info(assay)\n",
+    "        extracted_info['DC50 (nM)'] = split_clean_str(\n",
+    "            row['DC50 (nM)'], return_floats=True)\n",
+    "        extracted_info['Dmax (%)'] = split_clean_str(\n",
+    "            row['Dmax (%)'], return_floats=True)\n",
+    "\n",
+    "        # Emit as many records as the longest list-valued field; shorter lists\n",
+    "        # wrap around (index modulo length) and scalar values are repeated as-is.\n",
+    "        max_len = max(len(v) for v in extracted_info.values() if isinstance(v, list))\n",
+    "        # The original row is the same for every expanded record: convert it once.\n",
+    "        base_record = row.to_dict()\n",
+    "        for idx in range(max_len):\n",
+    "            record = dict(base_record)\n",
+    "            record.update({k: v[idx % len(v)] if isinstance(v, list) else v\n",
+    "                           for k, v in extracted_info.items()})\n",
+    "            records.append(record)\n",
+    "\n",
+    "    parsed_table = pd.DataFrame(records)\n",
+    "    display(parsed_table.head())\n",
+    "    print(f'Parsed table len: {len(parsed_table)}')\n",
+    "    dfs[name] = parsed_table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def canonize_smiles(smi):\n",
+    "    \"\"\"Return the RDKit canonical SMILES string for `smi`.\n",
+    "\n",
+    "    Args:\n",
+    "        smi: SMILES string to canonicalize.\n",
+    "\n",
+    "    Returns:\n",
+    "        The canonical SMILES, or None when `smi` is not a string (e.g. a missing\n",
+    "        value) or cannot be parsed by RDKit, so that a single bad entry does not\n",
+    "        abort a whole `Series.apply` call.\n",
+    "    \"\"\"\n",
+    "    if not isinstance(smi, str):\n",
+    "        return None\n",
+    "    mol = Chem.MolFromSmiles(smi)\n",
+    "    # MolFromSmiles returns None for unparsable input; MolToSmiles(None) raises.\n",
+    "    return Chem.MolToSmiles(mol) if mol is not None else None\n",
+    "\n",
+    "\n",
+    "# Canonicalize both tables so identical molecules compare equal as strings.\n",
+    "for db_name in ('protac-db', 'protac-db-v2'):\n",
+    "    dfs[db_name]['Smiles'] = dfs[db_name]['Smiles'].apply(canonize_smiles)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of entries in protac-db: 1205\n",
+      "Number of entries in protac-db-v2: 2264\n",
+      "Number of shared entries: 1249\n",
+      "Number of total entries: 2232\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get the number of entries in both dfs\n",
+    "print(f'Number of entries in protac-db: {len(dfs[\"protac-db\"])}')\n",
+    "print(f'Number of entries in protac-db-v2: {len(dfs[\"protac-db-v2\"])}')\n",
+    "# Get the number of entries shared between the two dfs\n",
+    "predict_cols = [\"Smiles\", \"DC50 (nM)\", \"Dmax (%)\", \"E3 Ligase\", \"Uniprot\", \"Cell Type\"]\n",
+    "# NOTE(review): the inner merge pairs every matching row on both sides, so rows\n",
+    "# with repeated `predict_cols` values are counted more than once - confirm this\n",
+    "# is the intended meaning of \"shared\".\n",
+    "shared_df = dfs[\"protac-db\"].merge(dfs[\"protac-db-v2\"], on=predict_cols, how=\"inner\")\n",
+    "print(f'Number of shared entries: {len(shared_df)}')\n",
+    "# Get the number of total entries without duplicates\n",
+    "# (pd.concat is used because DataFrame.append was removed in pandas 2.0)\n",
+    "combined_df = pd.concat([dfs[\"protac-db\"], dfs[\"protac-db-v2\"]])\n",
+    "print(f'Number of total entries: {len(combined_df.drop_duplicates(subset=predict_cols))}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}