# --- Cell 1: imports, configuration, and data loading -----------------------
import os
import sys
from collections import defaultdict
import warnings
import logging
from typing import Literal

# FIX: sys.path does not expand '~' (unlike pandas.read_csv), so the original
# literal '~/...' entry could never be imported from. Expand it explicitly.
sys.path.append(os.path.expanduser('~/PROTAC-Degradation-Predictor/protac_degradation_predictor'))
import protac_degradation_predictor as pdp

import pytorch_lightning as pl
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from jsonargparse import CLI
import pandas as pd
# Import tqdm for notebook
from tqdm.notebook import tqdm
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import (
    StratifiedKFold,
    StratifiedGroupKFold,
)

# Activity thresholds: a PROTAC is labelled active when pDC50 >= 6.0 and
# Dmax >= 60% (the actual rule lives in pdp.is_active).
active_col = 'Active (Dmax 0.6, pDC50 6.0)'
pDC50_threshold = 6.0
Dmax_threshold = 0.6

protac_df = pd.read_csv('~/PROTAC-Degradation-Predictor/data/PROTAC-Degradation-DB.csv')
# Normalize E3 ligase naming ('Iap' -> 'IAP') so the column has one spelling.
protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
# Row-wise activity label; NaN where DC50/Dmax are insufficient to decide.
protac_df[active_col] = protac_df.apply(
    lambda x: pdp.is_active(
        x['DC50 (nM)'],
        x['Dmax (%)'],
        pDC50_threshold=pDC50_threshold,
        Dmax_threshold=Dmax_threshold,
    ),
    axis=1,
)


# --- Cell 2: train/validation vs. test split --------------------------------
def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:
    """ Get the indices of the test set using a random split.

    Args:
        active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
        test_split (float): The percentage of the active PROTACs to use as the test set.

    Returns:
        pd.Index: The indices of the test set.
    """
    # Fixed seed so the held-out test set is reproducible across re-runs.
    test_df = active_df.sample(frac=test_split, random_state=42)
    return test_df.index


# Keep only rows with a defined activity label; hold out 10% as the test set.
active_df = protac_df[protac_df[active_col].notna()].copy()
test_split = 0.1
test_indices = get_random_split_indices(active_df, test_split)
train_val_df = active_df[~active_df.index.isin(test_indices)].copy()
train_val_df.shape[0] if False else len(train_val_df)


# --- Cell 3: Optuna objective — minimize fingerprint collisions -------------
import optuna


def objective(trial: optuna.Trial, verbose: int = 0) -> float:
    """Score a Morgan-fingerprint configuration by how many training SMILES
    collide onto the same fingerprint.

    Args:
        trial: Optuna trial used to sample `radius` (1-15) and `fpsize`
            (128-2048, step 128).
        verbose: If non-zero, print collision statistics for this trial.

    Returns:
        num_overlaps + radius + fpsize / 100 — the collision count in
        train_val_df, with small tie-breaking penalties that prefer a smaller
        radius and a shorter fingerprint.
    """
    radius = trial.suggest_int('radius', 1, 15)
    fpsize = trial.suggest_int('fpsize', 128, 2048, step=128)

    morgan_fpgen = AllChem.GetMorganGenerator(
        radius=radius,
        fpSize=fpsize,
        includeChirality=True,
    )

    # One fingerprint per unique SMILES in the train/val split.
    smiles2fp = {
        smiles: pdp.get_fingerprint(smiles, morgan_fpgen)
        for smiles in train_val_df['Smiles'].unique().tolist()
    }

    # FIX: the original built `unique_fps` with a set comprehension and then
    # immediately overwrote it with an empty set — a dead O(n) pass over every
    # fingerprint. The dead computation has been removed; the loop below is
    # the single source of truth for both `unique_fps` and the overlaps.
    overlapping_smiles = []
    unique_fps = set()
    for smiles, fp in smiles2fp.items():
        fp_key = tuple(fp)
        if fp_key in unique_fps:
            # A previously-seen fingerprint: this SMILES is indistinguishable
            # from another molecule at this (radius, fpsize).
            overlapping_smiles.append(smiles)
        else:
            unique_fps.add(fp_key)
    # Count affected rows (not just unique SMILES) in each frame.
    num_overlaps = len(train_val_df[train_val_df["Smiles"].isin(overlapping_smiles)])
    num_overlaps_tot = len(protac_df[protac_df["Smiles"].isin(overlapping_smiles)])

    if verbose:
        print(f'Radius: {radius}')
        print(f'FP length: {fpsize}')
        print(f'Number of unique SMILES: {len(smiles2fp)}')
        print(f'Number of unique fingerprints: {len(unique_fps)}')
        print(f'Number of SMILES with overlapping fingerprints: {len(overlapping_smiles)}')
        print(f'Number of overlapping SMILES in train_val_df: {num_overlaps}')
        print(f'Number of overlapping SMILES in protac_df: {num_overlaps_tot}')
    return num_overlaps + radius + fpsize / 100


# NOTE(review): the notebook's final cell — which creates the Optuna study and
# appears to run `objective` for 50 trials — is truncated in this chunk and is
# not reproduced here; confirm its contents against the full notebook.