ribesstefano committed on
Commit
2ca0a98
·
1 Parent(s): 9f58169

Added updated version of PROTAC-DB and started applying data curation to it

Browse files
data/PROTAC-DB-v2.csv ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/data_curation_v2.ipynb ADDED
@@ -0,0 +1,1020 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# from IPython.display import display_html\n",
10
+ "\n",
11
+ "import logging\n",
12
+ "import warnings\n",
13
+ "import re\n",
14
+ "import os\n",
15
+ "import numpy as np\n",
16
+ "import pandas as pd\n",
17
+ "import pickle\n",
18
+ "import pickle\n",
19
+ "import requests\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "from rdkit import Chem\n",
23
+ "from rdkit.Chem import AllChem\n",
24
+ "from typing import Literal, Union, List, Dict, Any, Callable\n",
25
+ "from collections import defaultdict\n",
26
+ "from tqdm.auto import tqdm\n",
27
+ "from rdkit import RDLogger\n",
28
+ "\n",
29
+ "RDLogger.DisableLog('rdApp.*')"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "def set_global_logging_level(level=logging.ERROR, prefices=[\"\"]):\n",
39
+ " \"\"\"\n",
40
+ " Override logging levels of different modules based on their name as a prefix.\n",
41
+ " It needs to be invoked after the modules have been loaded so that their loggers have been initialized.\n",
42
+ "\n",
43
+ " Args:\n",
44
+ " - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR\n",
45
+ " - prefices: list of one or more str prefices to match (e.g. [\"transformers\", \"torch\"]). Optional.\n",
46
+ " Default is `[\"\"]` to match all active loggers.\n",
47
+ " The match is a case-sensitive `module_name.startswith(prefix)`\n",
48
+ " \"\"\"\n",
49
+ " prefix_re = re.compile(fr'^(?:{ \"|\".join(prefices) })')\n",
50
+ " for name in logging.root.manager.loggerDict:\n",
51
+ " if re.match(prefix_re, name):\n",
52
+ " logging.getLogger(name).setLevel(level)\n",
53
+ "\n",
54
+ "\n",
55
+ "# Filter out annoying Pytorch Lightning printouts\n",
56
+ "warnings.filterwarnings('ignore')\n",
57
+ "warnings.filterwarnings(\n",
58
+ " 'ignore', '.*Covariance of the parameters could not be estimated.*')\n",
59
+ "warnings.filterwarnings(\n",
60
+ " 'ignore', '.*You seem to be using the pipelines sequentially on GPU.*')"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 4,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "# data_dir = os.path.join(os.getcwd(), '..', 'data')\n",
70
+ "data_dir = os.path.join(os.getcwd(), 'data')\n",
71
+ "dirs_to_make = [\n",
72
+ " data_dir,\n",
73
+ " # os.path.join(data_dir, 'raw'),\n",
74
+ " # os.path.join(data_dir, 'processed'),\n",
75
+ "]\n",
76
+ "for d in dirs_to_make:\n",
77
+ " if not os.path.exists(d):\n",
78
+ " os.makedirs(d)"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 5,
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "name": "stdout",
88
+ "output_type": "stream",
89
+ "text": [
90
+ "Loaded protac.csv\n"
91
+ ]
92
+ }
93
+ ],
94
+ "source": [
95
+ "protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
96
+ "protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
97
+ "\n",
98
+ "protacdb_file = os.path.join(data_dir, 'PROTAC-DB-v2.csv')\n",
99
+ "protac_v2_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
100
+ "\n",
101
+ "print(f'Loaded protac.csv')\n",
102
+ "\n",
103
+ "old2new = {\n",
104
+ " 'E3 ligase': 'E3 Ligase',\n",
105
+ "}\n",
106
+ "protac_df = protac_df.rename(columns=old2new)\n",
107
+ "protac_v2_df = protac_v2_df.rename(columns=old2new)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 6,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "data": {
117
+ "text/plain": [
118
+ "(9380, 5388)"
119
+ ]
120
+ },
121
+ "execution_count": 6,
122
+ "metadata": {},
123
+ "output_type": "execute_result"
124
+ }
125
+ ],
126
+ "source": [
127
+ "len(protac_v2_df), len(protac_df)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 7,
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ "PROTAC-DB\n",
140
+ "Number of rows with all 3: 344\n",
141
+ "Number of rows with Assay: 1008\n",
142
+ "Number of rows with both DC50 and Dmax: 344\n",
143
+ "Number of rows with DC50: 905\n",
144
+ "Number of rows with Dmax: 726\n",
145
+ "Number of rows with Percent degradation: 362\n",
146
+ "\n",
147
+ "PROTAC-DB-v2\n",
148
+ "Number of rows with all 3: 909\n",
149
+ "Number of rows with Assay: 1892\n",
150
+ "Number of rows with both DC50 and Dmax: 909\n",
151
+ "Number of rows with DC50: 1762\n",
152
+ "Number of rows with Dmax: 1317\n",
153
+ "Number of rows with Percent degradation: 1422\n"
154
+ ]
155
+ }
156
+ ],
157
+ "source": [
158
+ "def print_dmax_dc_info(df):\n",
159
+ " num_all_notna = len(df.dropna(subset=['Assay (DC50/Dmax)', 'DC50 (nM)', 'Dmax (%)']).dropna(how='all').drop_duplicates())\n",
160
+ " num_assay_notna = len(df.dropna(subset=['Assay (DC50/Dmax)']).dropna(how='all').drop_duplicates())\n",
161
+ " num_both_notna = len(df.dropna(subset=['DC50 (nM)', 'Dmax (%)']).dropna(how='all').drop_duplicates())\n",
162
+ " num_dmax_notna = len(df.dropna(subset=['Dmax (%)']).dropna(how='all').drop_duplicates())\n",
163
+ " num_dc50_notna = len(df.dropna(subset=['DC50 (nM)']).dropna(how='all').drop_duplicates())\n",
164
+ " num_degr_notna = len(df.dropna(subset=['Percent degradation (%)']).dropna(how='all').drop_duplicates())\n",
165
+ " print(f'Number of rows with all 3: {num_all_notna}')\n",
166
+ " print(f'Number of rows with Assay: {num_assay_notna}')\n",
167
+ " print(f'Number of rows with both DC50 and Dmax: {num_both_notna}')\n",
168
+ " print(f'Number of rows with DC50: {num_dc50_notna}')\n",
169
+ " print(f'Number of rows with Dmax: {num_dmax_notna}')\n",
170
+ " print(f'Number of rows with Percent degradation: {num_degr_notna}')\n",
171
+ "\n",
172
+ "print('PROTAC-DB')\n",
173
+ "print_dmax_dc_info(protac_df)\n",
174
+ "print('')\n",
175
+ "print('PROTAC-DB-v2')\n",
176
+ "print_dmax_dc_info(protac_v2_df)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 8,
182
+ "metadata": {},
183
+ "outputs": [
184
+ {
185
+ "name": "stdout",
186
+ "output_type": "stream",
187
+ "text": [
188
+ "[-100.0, -5.0, nan, 90.317, 1000.0, nan]\n",
189
+ "[0.0]\n",
190
+ "[96.0, 73.0]\n",
191
+ "[1.0, 3.14]\n"
192
+ ]
193
+ }
194
+ ],
195
+ "source": [
196
+ "def clean_string(s: str) -> str:\n",
197
+ " \"\"\" Clean a string by removing <, >, =, NaN, and ranges like 100-200.\n",
198
+ " Args:\n",
199
+ " s(str): string to clean\n",
200
+ " Returns:\n",
201
+ " str: cleaned string\n",
202
+ " \"\"\"\n",
203
+ " if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:\n",
204
+ " return np.nan\n",
205
+ " if 'N.D.' in s:\n",
206
+ " return '0'\n",
207
+ " s = s.strip('(WB)').strip()\n",
208
+ " # # Combine regex operations for efficiency\n",
209
+ " # s = re.sub(r'[<=>]|NaN|[\\d]+[-~]', '', s) # Remove <, >, =, NaN, and ranges like 100-200\n",
210
+ " # Remove <, >, =, NaN\n",
211
+ " s = re.sub(r'[<=>]|NaN', '', s)\n",
212
+ " # Replace ranges like 100-200 or 1~3 with the left-most value in the range\n",
213
+ " s = re.sub(r'\\b(\\d+)[-~]\\d+\\b', r'\\1', s)\n",
214
+ " # Replace (n/a) with nan\n",
215
+ " s = s.replace('(n/a)', 'nan')\n",
216
+ " s = re.sub(r'[~<=>% ]', '', s) # Remove ~, <, >, =, % and spaces\n",
217
+ " return s\n",
218
+ "\n",
219
+ "\n",
220
+ "def split_clean_str(s: str, return_floats: bool = False) -> Union[List[str], List[float]]:\n",
221
+ " \"\"\" Split a string by '/' and clean each part.\n",
222
+ " Args:\n",
223
+ " s(str): string to split\n",
224
+ " return_floats(bool): whether to return floats or strings\n",
225
+ " Returns:\n",
226
+ " list: list of cleaned strings or floats\n",
227
+ " \"\"\"\n",
228
+ " if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:\n",
229
+ " return np.nan\n",
230
+ " cleaned_values = [clean_string(part.strip())\n",
231
+ " for part in s.replace('(n/a)', 'nan').split('/')]\n",
232
+ " return [float(value) if return_floats else value for value in cleaned_values]\n",
233
+ "\n",
234
+ "\n",
235
+ "print(split_clean_str('-100-200/-5/(n/a)/<=90.317/>1000/NaN', return_floats=True))\n",
236
+ "print(split_clean_str('N.D.', return_floats=True))\n",
237
+ "print(split_clean_str('96/73 (WB)', return_floats=True))\n",
238
+ "print(split_clean_str('1.0~3/3.14', return_floats=True))"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 9,
244
+ "metadata": {},
245
+ "outputs": [
246
+ {
247
+ "name": "stdout",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "813\n",
251
+ "848\n"
252
+ ]
253
+ }
254
+ ],
255
+ "source": [
256
+ "def get_assay_texts(df: pd.DataFrame, assay_column: str) -> List[str]:\n",
257
+ " tmp = df[assay_column].dropna()\n",
258
+ " if tmp.empty:\n",
259
+ " return []\n",
260
+ " return tmp.unique().tolist()\n",
261
+ "\n",
262
+ "\n",
263
+ "def clean_assay_text(assay):\n",
264
+ " tmp = assay.replace('/', ' and ')\n",
265
+ " tmp = tmp.replace('BRD4 BD1 and 2', 'BRD4 BD1 and BRD4 BD2')\n",
266
+ " tmp = tmp.replace('(Ba and F3 WT)', '(Ba/F3 WT)')\n",
267
+ " tmp = tmp.replace('(EGFR L858R and T790M)', '(EGFR L858R/T790M)')\n",
268
+ " return tmp\n",
269
+ "\n",
270
+ "\n",
271
+ "assays = {}\n",
272
+ "for c in protac_df.columns:\n",
273
+ " if 'Assay' in c:\n",
274
+ " assays[c] = get_assay_texts(protac_df, c)\n",
275
+ "texts = list(set([x for y in assays.values() for x in y]))\n",
276
+ "print(len(texts))\n",
277
+ "print(sum([len(x) for x in assays.values()]))"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 10,
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": [
286
+ "def extract_dc50_info(sentence):\n",
287
+ " # Regex patterns for proteins/genes, cell types, and treatment hours\n",
288
+ " protein_regex = r\"Degradation of total\\s(.+?)\\s(in|after|using|proteins)\"\n",
289
+ " cell_regex = r\"in\\s([A-Za-z0-9-/.;\\(\\)\\s\\+]+)\\scells\"\n",
290
+ " treatment_regex = r\"after\\s(\\d+/?\\d*?/?\\d*?\\s?h)\"\n",
291
+ "\n",
292
+ " # Extracting protein information\n",
293
+ " if 'total' in sentence.lower():\n",
294
+ " protein_match = re.search(protein_regex, sentence)\n",
295
+ " proteins = protein_match.group(1).split(' and ') if protein_match else [\n",
296
+ " re.search(r\"Degradation of\\s([A-Za-z0-9-]+)\", sentence).group(1)]\n",
297
+ " else:\n",
298
+ " if ' in ' in sentence.lower():\n",
299
+ " proteins = sentence.split(' in ')[0].split('Degradation of ')[-1]\n",
300
+ " proteins = proteins.split('/') if '/' in proteins else [proteins]\n",
301
+ " else:\n",
302
+ " protein_match = re.search(protein_regex, sentence)\n",
303
+ " proteins = protein_match.group(1).split(\n",
304
+ " '/') if protein_match else [re.search(r\"Degradation of\\s([A-Za-z0-9-\\/]+)\", sentence).group(1)]\n",
305
+ " # Handle special cases...\n",
306
+ " if 'BRD4 short/long' in sentence:\n",
307
+ " proteins = ['BRD4 short', 'BRD4 long']\n",
308
+ " if 'BRD4 BD1/2' in sentence:\n",
309
+ " proteins = ['BRD4 BD1', 'BRD4 BD2']\n",
310
+ " elif 'BRD4 BD1' in sentence:\n",
311
+ " proteins = ['BRD4 BD1']\n",
312
+ " if 'EGFR L858R/T790M' in sentence:\n",
313
+ " proteins = ['EGFR L858R/T790M']\n",
314
+ " if 'EGFR del19/T790M/C797S' in sentence:\n",
315
+ " proteins = ['EGFR del19/T790M/C797S']\n",
316
+ "\n",
317
+ " # Extracting cell types\n",
318
+ " cell_match = re.search(cell_regex, sentence)\n",
319
+ " cells = cell_match.group(1).split('/') if cell_match else np.nan\n",
320
+ " # Handle special cases...\n",
321
+ " if 'Ba/F3' in sentence:\n",
322
+ " # Replace any occurrences that contain 'Ba' or 'F3' with 'Ba/F3' and remove duplicates while preserving the order in the other cells\n",
323
+ " cells = ['Ba/F3' if 'Ba' in c or 'F3' in c else c for c in cells]\n",
324
+ " cells.pop(cells.index('Ba/F3'))\n",
325
+ " if 'ER-positive breast cancer cell lines' in sentence:\n",
326
+ " cells = ['ER-positive breast cancer cell lines']\n",
327
+ " if 'LNCaP (AR T878A)' in sentence:\n",
328
+ " cells = ['LNCaP']\n",
329
+ " if 'in A152T neurons' in sentence:\n",
330
+ " cells = ['A152T neurons']\n",
331
+ " if 'of Rpn13 in MM.1S after' in sentence:\n",
332
+ " cells = ['MM.1S']\n",
333
+ " if 'Primary Cardiomyocytes' in sentence:\n",
334
+ " cells = ['Primary Cardiomyocytes']\n",
335
+ " if ' HDAC6 in MM1S after' in sentence:\n",
336
+ " cells = ['MM.1S']\n",
337
+ "\n",
338
+ " # Extracting treatment hours\n",
339
+ " treatment_hours_match = re.search(treatment_regex, sentence)\n",
340
+ " if treatment_hours_match:\n",
341
+ " treatment_hours = treatment_hours_match.group(1).strip('h')\n",
342
+ " treatment_hours = split_clean_str(treatment_hours, return_floats=True)\n",
343
+ " else:\n",
344
+ " treatment_hours = np.nan\n",
345
+ "\n",
346
+ " return {\n",
347
+ " 'Target (Parsed)': proteins,\n",
348
+ " 'Cell Type': cells,\n",
349
+ " 'Treatment Time (h)': treatment_hours,\n",
350
+ " }\n",
351
+ "\n",
352
+ "\n",
353
+ "corner_cases = [\n",
354
+ " # 'Degradation of BRD4',\n",
355
+ " # 'Degradation of BRD4 short/long in HeLa cells after 24 h treatment',\n",
356
+ " # 'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',\n",
357
+ " # 'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',\n",
358
+ " # 'Degradation of WT/Exon 20 Ins EGFR in OVCAR8/HeLa cells after 24 h treatment',\n",
359
+ " # 'Degradation of TPM3-TRKA/TRKA in KM12/HEL cells after 6 h treatment',\n",
360
+ " # 'Degradation of Exon 19 del/L858R EGFR in HCC827/H3255 cells after 24 h treatment',\n",
361
+ " # 'Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NCI-H2228 cells after 16 h treatment',\n",
362
+ " # 'Degradation of BCR-ABL T315I in Ba/F3 cells after 24 h treatment',\n",
363
+ " # 'Degradation of BCR-ABL T315I in MOL/(Ba/F3)/R4;11 cells after 24 h treatment',\n",
364
+ " # 'Degradation of ALK in H3122/Karpas 299/Kelly cells 16 h treatment',\n",
365
+ " 'Degradation of AR in LNCaP/VCaP AR+ cells after 6 h treatment',\n",
366
+ " 'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',\n",
367
+ " 'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',\n",
368
+ " 'Degradation of PARP1 in Primary Cardiomyocytes after 24 h treatment',\n",
369
+ " 'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',\n",
370
+ " 'Degradation of total tau/P-tau in A152T neurons after 24 h treatment',\n",
371
+ " 'Degradation of Rpn13 in MM.1S after 16 h treatment',\n",
372
+ " 'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',\n",
373
+ "]\n",
374
+ "\n",
375
+ "# for assay in assays[\"Assay (DC50/Dmax)\"][-5:] + corner_cases:\n",
376
+ "# if len(assay) < 5:\n",
377
+ "# continue\n",
378
+ "# print(assay)\n",
379
+ "# extracted_info = extract_dc50_info(assay)\n",
380
+ "# proteins, cells, treatment_hours = extracted_info[\n",
381
+ "# 'Target (Parsed)'], extracted_info['Cell Type'], extracted_info['Treatment Time (h)']\n",
382
+ "# print(proteins, \"|\", cells, \"|\", treatment_hours)\n",
383
+ "# print('-' * 80)"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 11,
389
+ "metadata": {},
390
+ "outputs": [],
391
+ "source": [
392
+ "def get_dc50_dmax_df(df):\n",
393
+ " param_cols = ['DC50 (nM)', 'Dmax (%)']\n",
394
+ " dc50_dmax_df = df.dropna(subset=param_cols + [\"Assay (DC50/Dmax)\"], how='all')\n",
395
+ " dc50_dmax_df = dc50_dmax_df[dc50_dmax_df[\"Assay (DC50/Dmax)\"].notnull()]\n",
396
+ " return dc50_dmax_df.drop_duplicates()"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "markdown",
401
+ "metadata": {},
402
+ "source": [
403
+ "The 'Dmax (%)' column in PROTAC-DB-v2 has two entries which are _dates_ (you never stop surprising me, PROTAC-DB). Convert them to NaNs."
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": 12,
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": [
412
+ "# If any entry in the 'Dmax (%)' column contains the character ':', then it is a\n",
413
+ "# date and it needs to be set to NaN\n",
414
+ "def clean_dmax(df):\n",
415
+ " df['Dmax (%)'] = df['Dmax (%)'].apply(lambda x: np.nan if ':' in str(x) else x)\n",
416
+ " return df"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "code",
421
+ "execution_count": 13,
422
+ "metadata": {},
423
+ "outputs": [
424
+ {
425
+ "data": {
426
+ "application/vnd.jupyter.widget-view+json": {
427
+ "model_id": "c889fc12d4a040a78fbfdc506696ea9f",
428
+ "version_major": 2,
429
+ "version_minor": 0
430
+ },
431
+ "text/plain": [
432
+ "Extracting DC50/Dmax info: 0%| | 0/1008 [00:00<?, ?it/s]"
433
+ ]
434
+ },
435
+ "metadata": {},
436
+ "output_type": "display_data"
437
+ },
438
+ {
439
+ "data": {
440
+ "text/html": [
441
+ "<div>\n",
442
+ "<style scoped>\n",
443
+ " .dataframe tbody tr th:only-of-type {\n",
444
+ " vertical-align: middle;\n",
445
+ " }\n",
446
+ "\n",
447
+ " .dataframe tbody tr th {\n",
448
+ " vertical-align: top;\n",
449
+ " }\n",
450
+ "\n",
451
+ " .dataframe thead th {\n",
452
+ " text-align: right;\n",
453
+ " }\n",
454
+ "</style>\n",
455
+ "<table border=\"1\" class=\"dataframe\">\n",
456
+ " <thead>\n",
457
+ " <tr style=\"text-align: right;\">\n",
458
+ " <th></th>\n",
459
+ " <th>Compound ID</th>\n",
460
+ " <th>Uniprot</th>\n",
461
+ " <th>Target</th>\n",
462
+ " <th>E3 Ligase</th>\n",
463
+ " <th>PDB</th>\n",
464
+ " <th>Name</th>\n",
465
+ " <th>Smiles</th>\n",
466
+ " <th>DC50 (nM)</th>\n",
467
+ " <th>Dmax (%)</th>\n",
468
+ " <th>Assay (DC50/Dmax)</th>\n",
469
+ " <th>...</th>\n",
470
+ " <th>Hydrogen Bond Acceptor Count</th>\n",
471
+ " <th>Hydrogen Bond Donor Count</th>\n",
472
+ " <th>Rotatable Bond Count</th>\n",
473
+ " <th>Topological Polar Surface Area</th>\n",
474
+ " <th>Molecular Formula</th>\n",
475
+ " <th>InChI</th>\n",
476
+ " <th>InChI Key</th>\n",
477
+ " <th>Target (Parsed)</th>\n",
478
+ " <th>Cell Type</th>\n",
479
+ " <th>Treatment Time (h)</th>\n",
480
+ " </tr>\n",
481
+ " </thead>\n",
482
+ " <tbody>\n",
483
+ " <tr>\n",
484
+ " <th>0</th>\n",
485
+ " <td>11</td>\n",
486
+ " <td>Q9H8M2</td>\n",
487
+ " <td>BRD9</td>\n",
488
+ " <td>VHL</td>\n",
489
+ " <td>NaN</td>\n",
490
+ " <td>NaN</td>\n",
491
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
492
+ " <td>560.00</td>\n",
493
+ " <td>80.0</td>\n",
494
+ " <td>Degradation of BRD9 in HeLa cells after 4 h tr...</td>\n",
495
+ " <td>...</td>\n",
496
+ " <td>16</td>\n",
497
+ " <td>3</td>\n",
498
+ " <td>22</td>\n",
499
+ " <td>199.15</td>\n",
500
+ " <td>C54H69FN8O10S</td>\n",
501
+ " <td>InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...</td>\n",
502
+ " <td>MXAKQOVZPDLCDK-UDVNCTHFSA-N</td>\n",
503
+ " <td>BRD9</td>\n",
504
+ " <td>HeLa</td>\n",
505
+ " <td>4.0</td>\n",
506
+ " </tr>\n",
507
+ " <tr>\n",
508
+ " <th>1</th>\n",
509
+ " <td>22</td>\n",
510
+ " <td>Q9H8M2</td>\n",
511
+ " <td>BRD9</td>\n",
512
+ " <td>VHL</td>\n",
513
+ " <td>NaN</td>\n",
514
+ " <td>VZ185</td>\n",
515
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
516
+ " <td>1.76</td>\n",
517
+ " <td>95.0</td>\n",
518
+ " <td>Degradation of BRD9 in RI-1 cells after 8 h tr...</td>\n",
519
+ " <td>...</td>\n",
520
+ " <td>14</td>\n",
521
+ " <td>3</td>\n",
522
+ " <td>19</td>\n",
523
+ " <td>180.69</td>\n",
524
+ " <td>C53H67FN8O8S</td>\n",
525
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
526
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
527
+ " <td>BRD9</td>\n",
528
+ " <td>RI-1</td>\n",
529
+ " <td>8.0</td>\n",
530
+ " </tr>\n",
531
+ " <tr>\n",
532
+ " <th>2</th>\n",
533
+ " <td>22</td>\n",
534
+ " <td>Q9H8M2</td>\n",
535
+ " <td>BRD9</td>\n",
536
+ " <td>VHL</td>\n",
537
+ " <td>NaN</td>\n",
538
+ " <td>VZ185</td>\n",
539
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
540
+ " <td>4.00</td>\n",
541
+ " <td>NaN</td>\n",
542
+ " <td>Degradation of HiBiT-BRD9 in HEK293 cells afte...</td>\n",
543
+ " <td>...</td>\n",
544
+ " <td>14</td>\n",
545
+ " <td>3</td>\n",
546
+ " <td>19</td>\n",
547
+ " <td>180.69</td>\n",
548
+ " <td>C53H67FN8O8S</td>\n",
549
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
550
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
551
+ " <td>HiBiT-BRD9</td>\n",
552
+ " <td>HEK293</td>\n",
553
+ " <td>24.0</td>\n",
554
+ " </tr>\n",
555
+ " <tr>\n",
556
+ " <th>3</th>\n",
557
+ " <td>22</td>\n",
558
+ " <td>Q9H8M2</td>\n",
559
+ " <td>BRD9</td>\n",
560
+ " <td>VHL</td>\n",
561
+ " <td>NaN</td>\n",
562
+ " <td>VZ185</td>\n",
563
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
564
+ " <td>2.00</td>\n",
565
+ " <td>NaN</td>\n",
566
+ " <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
567
+ " <td>...</td>\n",
568
+ " <td>14</td>\n",
569
+ " <td>3</td>\n",
570
+ " <td>19</td>\n",
571
+ " <td>180.69</td>\n",
572
+ " <td>C53H67FN8O8S</td>\n",
573
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
574
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
575
+ " <td>BRD9</td>\n",
576
+ " <td>EOL-1</td>\n",
577
+ " <td>18.0</td>\n",
578
+ " </tr>\n",
579
+ " <tr>\n",
580
+ " <th>4</th>\n",
581
+ " <td>22</td>\n",
582
+ " <td>Q9H8M2</td>\n",
583
+ " <td>BRD9</td>\n",
584
+ " <td>VHL</td>\n",
585
+ " <td>NaN</td>\n",
586
+ " <td>VZ185</td>\n",
587
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
588
+ " <td>8.00</td>\n",
589
+ " <td>NaN</td>\n",
590
+ " <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
591
+ " <td>...</td>\n",
592
+ " <td>14</td>\n",
593
+ " <td>3</td>\n",
594
+ " <td>19</td>\n",
595
+ " <td>180.69</td>\n",
596
+ " <td>C53H67FN8O8S</td>\n",
597
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
598
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
599
+ " <td>BRD9</td>\n",
600
+ " <td>A-204</td>\n",
601
+ " <td>18.0</td>\n",
602
+ " </tr>\n",
603
+ " </tbody>\n",
604
+ "</table>\n",
605
+ "<p>5 rows × 92 columns</p>\n",
606
+ "</div>"
607
+ ],
608
+ "text/plain": [
609
+ " Compound ID Uniprot Target E3 Ligase PDB Name \\\n",
610
+ "0 11 Q9H8M2 BRD9 VHL NaN NaN \n",
611
+ "1 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
612
+ "2 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
613
+ "3 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
614
+ "4 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
615
+ "\n",
616
+ " Smiles DC50 (nM) Dmax (%) \\\n",
617
+ "0 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 560.00 80.0 \n",
618
+ "1 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 1.76 95.0 \n",
619
+ "2 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 4.00 NaN \n",
620
+ "3 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 2.00 NaN \n",
621
+ "4 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 8.00 NaN \n",
622
+ "\n",
623
+ " Assay (DC50/Dmax) ... \\\n",
624
+ "0 Degradation of BRD9 in HeLa cells after 4 h tr... ... \n",
625
+ "1 Degradation of BRD9 in RI-1 cells after 8 h tr... ... \n",
626
+ "2 Degradation of HiBiT-BRD9 in HEK293 cells afte... ... \n",
627
+ "3 Degradation of BRD9 in EOL-1/A-204 cells after... ... \n",
628
+ "4 Degradation of BRD9 in EOL-1/A-204 cells after... ... \n",
629
+ "\n",
630
+ " Hydrogen Bond Acceptor Count Hydrogen Bond Donor Count Rotatable Bond Count \\\n",
631
+ "0 16 3 22 \n",
632
+ "1 14 3 19 \n",
633
+ "2 14 3 19 \n",
634
+ "3 14 3 19 \n",
635
+ "4 14 3 19 \n",
636
+ "\n",
637
+ " Topological Polar Surface Area Molecular Formula \\\n",
638
+ "0 199.15 C54H69FN8O10S \n",
639
+ "1 180.69 C53H67FN8O8S \n",
640
+ "2 180.69 C53H67FN8O8S \n",
641
+ "3 180.69 C53H67FN8O8S \n",
642
+ "4 180.69 C53H67FN8O8S \n",
643
+ "\n",
644
+ " InChI \\\n",
645
+ "0 InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35... \n",
646
+ "1 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
647
+ "2 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
648
+ "3 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
649
+ "4 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
650
+ "\n",
651
+ " InChI Key Target (Parsed) Cell Type Treatment Time (h) \n",
652
+ "0 MXAKQOVZPDLCDK-UDVNCTHFSA-N BRD9 HeLa 4.0 \n",
653
+ "1 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 RI-1 8.0 \n",
654
+ "2 ZAGCLFXBHOXXEN-JPTLTNPLSA-N HiBiT-BRD9 HEK293 24.0 \n",
655
+ "3 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 EOL-1 18.0 \n",
656
+ "4 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 A-204 18.0 \n",
657
+ "\n",
658
+ "[5 rows x 92 columns]"
659
+ ]
660
+ },
661
+ "metadata": {},
662
+ "output_type": "display_data"
663
+ },
664
+ {
665
+ "name": "stdout",
666
+ "output_type": "stream",
667
+ "text": [
668
+ "Parsed table len: 1205\n"
669
+ ]
670
+ },
671
+ {
672
+ "data": {
673
+ "application/vnd.jupyter.widget-view+json": {
674
+ "model_id": "a788394f66594587b03025bd8f3d9c51",
675
+ "version_major": 2,
676
+ "version_minor": 0
677
+ },
678
+ "text/plain": [
679
+ "Extracting DC50/Dmax info: 0%| | 0/1892 [00:00<?, ?it/s]"
680
+ ]
681
+ },
682
+ "metadata": {},
683
+ "output_type": "display_data"
684
+ },
685
+ {
686
+ "data": {
687
+ "text/html": [
688
+ "<div>\n",
689
+ "<style scoped>\n",
690
+ " .dataframe tbody tr th:only-of-type {\n",
691
+ " vertical-align: middle;\n",
692
+ " }\n",
693
+ "\n",
694
+ " .dataframe tbody tr th {\n",
695
+ " vertical-align: top;\n",
696
+ " }\n",
697
+ "\n",
698
+ " .dataframe thead th {\n",
699
+ " text-align: right;\n",
700
+ " }\n",
701
+ "</style>\n",
702
+ "<table border=\"1\" class=\"dataframe\">\n",
703
+ " <thead>\n",
704
+ " <tr style=\"text-align: right;\">\n",
705
+ " <th></th>\n",
706
+ " <th>Compound ID</th>\n",
707
+ " <th>Uniprot</th>\n",
708
+ " <th>Target</th>\n",
709
+ " <th>E3 Ligase</th>\n",
710
+ " <th>PDB</th>\n",
711
+ " <th>Name</th>\n",
712
+ " <th>Smiles</th>\n",
713
+ " <th>DC50 (nM)</th>\n",
714
+ " <th>Dmax (%)</th>\n",
715
+ " <th>Assay (DC50/Dmax)</th>\n",
716
+ " <th>...</th>\n",
717
+ " <th>Hydrogen Bond Acceptor Count</th>\n",
718
+ " <th>Hydrogen Bond Donor Count</th>\n",
719
+ " <th>Rotatable Bond Count</th>\n",
720
+ " <th>Topological Polar Surface Area</th>\n",
721
+ " <th>Molecular Formula</th>\n",
722
+ " <th>InChI</th>\n",
723
+ " <th>InChI Key</th>\n",
724
+ " <th>Target (Parsed)</th>\n",
725
+ " <th>Cell Type</th>\n",
726
+ " <th>Treatment Time (h)</th>\n",
727
+ " </tr>\n",
728
+ " </thead>\n",
729
+ " <tbody>\n",
730
+ " <tr>\n",
731
+ " <th>0</th>\n",
732
+ " <td>11</td>\n",
733
+ " <td>Q9H8M2</td>\n",
734
+ " <td>BRD9</td>\n",
735
+ " <td>VHL</td>\n",
736
+ " <td>NaN</td>\n",
737
+ " <td>NaN</td>\n",
738
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
739
+ " <td>560.00</td>\n",
740
+ " <td>80.0</td>\n",
741
+ " <td>Degradation of BRD9 in HeLa cells after 4 h tr...</td>\n",
742
+ " <td>...</td>\n",
743
+ " <td>16</td>\n",
744
+ " <td>3</td>\n",
745
+ " <td>22</td>\n",
746
+ " <td>199.15</td>\n",
747
+ " <td>C54H69FN8O10S</td>\n",
748
+ " <td>InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...</td>\n",
749
+ " <td>MXAKQOVZPDLCDK-UDVNCTHFSA-N</td>\n",
750
+ " <td>BRD9</td>\n",
751
+ " <td>HeLa</td>\n",
752
+ " <td>4.0</td>\n",
753
+ " </tr>\n",
754
+ " <tr>\n",
755
+ " <th>1</th>\n",
756
+ " <td>22</td>\n",
757
+ " <td>Q9H8M2</td>\n",
758
+ " <td>BRD9</td>\n",
759
+ " <td>VHL</td>\n",
760
+ " <td>NaN</td>\n",
761
+ " <td>VZ185</td>\n",
762
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
763
+ " <td>1.76</td>\n",
764
+ " <td>95.0</td>\n",
765
+ " <td>Degradation of BRD9 in RI-1 cells after 8 h tr...</td>\n",
766
+ " <td>...</td>\n",
767
+ " <td>14</td>\n",
768
+ " <td>3</td>\n",
769
+ " <td>19</td>\n",
770
+ " <td>180.69</td>\n",
771
+ " <td>C53H67FN8O8S</td>\n",
772
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
773
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
774
+ " <td>BRD9</td>\n",
775
+ " <td>RI-1</td>\n",
776
+ " <td>8.0</td>\n",
777
+ " </tr>\n",
778
+ " <tr>\n",
779
+ " <th>2</th>\n",
780
+ " <td>22</td>\n",
781
+ " <td>Q9H8M2</td>\n",
782
+ " <td>BRD9</td>\n",
783
+ " <td>VHL</td>\n",
784
+ " <td>NaN</td>\n",
785
+ " <td>VZ185</td>\n",
786
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
787
+ " <td>4.00</td>\n",
788
+ " <td>NaN</td>\n",
789
+ " <td>Degradation of HiBiT-BRD9 in HEK293 cells afte...</td>\n",
790
+ " <td>...</td>\n",
791
+ " <td>14</td>\n",
792
+ " <td>3</td>\n",
793
+ " <td>19</td>\n",
794
+ " <td>180.69</td>\n",
795
+ " <td>C53H67FN8O8S</td>\n",
796
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
797
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
798
+ " <td>HiBiT-BRD9</td>\n",
799
+ " <td>HEK293</td>\n",
800
+ " <td>24.0</td>\n",
801
+ " </tr>\n",
802
+ " <tr>\n",
803
+ " <th>3</th>\n",
804
+ " <td>22</td>\n",
805
+ " <td>Q9H8M2</td>\n",
806
+ " <td>BRD9</td>\n",
807
+ " <td>VHL</td>\n",
808
+ " <td>NaN</td>\n",
809
+ " <td>VZ185</td>\n",
810
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
811
+ " <td>2.00</td>\n",
812
+ " <td>NaN</td>\n",
813
+ " <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
814
+ " <td>...</td>\n",
815
+ " <td>14</td>\n",
816
+ " <td>3</td>\n",
817
+ " <td>19</td>\n",
818
+ " <td>180.69</td>\n",
819
+ " <td>C53H67FN8O8S</td>\n",
820
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
821
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
822
+ " <td>BRD9</td>\n",
823
+ " <td>EOL-1</td>\n",
824
+ " <td>18.0</td>\n",
825
+ " </tr>\n",
826
+ " <tr>\n",
827
+ " <th>4</th>\n",
828
+ " <td>22</td>\n",
829
+ " <td>Q9H8M2</td>\n",
830
+ " <td>BRD9</td>\n",
831
+ " <td>VHL</td>\n",
832
+ " <td>NaN</td>\n",
833
+ " <td>VZ185</td>\n",
834
+ " <td>COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...</td>\n",
835
+ " <td>8.00</td>\n",
836
+ " <td>NaN</td>\n",
837
+ " <td>Degradation of BRD9 in EOL-1/A-204 cells after...</td>\n",
838
+ " <td>...</td>\n",
839
+ " <td>14</td>\n",
840
+ " <td>3</td>\n",
841
+ " <td>19</td>\n",
842
+ " <td>180.69</td>\n",
843
+ " <td>C53H67FN8O8S</td>\n",
844
+ " <td>InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...</td>\n",
845
+ " <td>ZAGCLFXBHOXXEN-JPTLTNPLSA-N</td>\n",
846
+ " <td>BRD9</td>\n",
847
+ " <td>A-204</td>\n",
848
+ " <td>18.0</td>\n",
849
+ " </tr>\n",
850
+ " </tbody>\n",
851
+ "</table>\n",
852
+ "<p>5 rows × 92 columns</p>\n",
853
+ "</div>"
854
+ ],
855
+ "text/plain": [
856
+ " Compound ID Uniprot Target E3 Ligase PDB Name \\\n",
857
+ "0 11 Q9H8M2 BRD9 VHL NaN NaN \n",
858
+ "1 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
859
+ "2 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
860
+ "3 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
861
+ "4 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
862
+ "\n",
863
+ " Smiles DC50 (nM) Dmax (%) \\\n",
864
+ "0 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 560.00 80.0 \n",
865
+ "1 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 1.76 95.0 \n",
866
+ "2 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 4.00 NaN \n",
867
+ "3 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 2.00 NaN \n",
868
+ "4 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 8.00 NaN \n",
869
+ "\n",
870
+ " Assay (DC50/Dmax) ... \\\n",
871
+ "0 Degradation of BRD9 in HeLa cells after 4 h tr... ... \n",
872
+ "1 Degradation of BRD9 in RI-1 cells after 8 h tr... ... \n",
873
+ "2 Degradation of HiBiT-BRD9 in HEK293 cells afte... ... \n",
874
+ "3 Degradation of BRD9 in EOL-1/A-204 cells after... ... \n",
875
+ "4 Degradation of BRD9 in EOL-1/A-204 cells after... ... \n",
876
+ "\n",
877
+ " Hydrogen Bond Acceptor Count Hydrogen Bond Donor Count Rotatable Bond Count \\\n",
878
+ "0 16 3 22 \n",
879
+ "1 14 3 19 \n",
880
+ "2 14 3 19 \n",
881
+ "3 14 3 19 \n",
882
+ "4 14 3 19 \n",
883
+ "\n",
884
+ " Topological Polar Surface Area Molecular Formula \\\n",
885
+ "0 199.15 C54H69FN8O10S \n",
886
+ "1 180.69 C53H67FN8O8S \n",
887
+ "2 180.69 C53H67FN8O8S \n",
888
+ "3 180.69 C53H67FN8O8S \n",
889
+ "4 180.69 C53H67FN8O8S \n",
890
+ "\n",
891
+ " InChI \\\n",
892
+ "0 InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35... \n",
893
+ "1 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
894
+ "2 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
895
+ "3 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
896
+ "4 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
897
+ "\n",
898
+ " InChI Key Target (Parsed) Cell Type Treatment Time (h) \n",
899
+ "0 MXAKQOVZPDLCDK-UDVNCTHFSA-N BRD9 HeLa 4.0 \n",
900
+ "1 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 RI-1 8.0 \n",
901
+ "2 ZAGCLFXBHOXXEN-JPTLTNPLSA-N HiBiT-BRD9 HEK293 24.0 \n",
902
+ "3 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 EOL-1 18.0 \n",
903
+ "4 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 A-204 18.0 \n",
904
+ "\n",
905
+ "[5 rows x 92 columns]"
906
+ ]
907
+ },
908
+ "metadata": {},
909
+ "output_type": "display_data"
910
+ },
911
+ {
912
+ "name": "stdout",
913
+ "output_type": "stream",
914
+ "text": [
915
+ "Parsed table len: 2264\n"
916
+ ]
917
+ }
918
+ ],
919
+ "source": [
920
+ "# Expand each DC50/Dmax record into one row per extracted value: list-valued\n",
+ "# fields returned by extract_dc50_info / split_clean_str are spread over rows.\n",
+ "dfs = {}  # dataset name -> expanded DataFrame\n",
+ "\n",
+ "for name, df in [('protac-db', protac_df), ('protac-db-v2', protac_v2_df)]:\n",
+ "    dc50_dmax_df = get_dc50_dmax_df(clean_dmax(df))\n",
+ "\n",
+ "    parsed_table = []\n",
+ "    for _, row in tqdm(dc50_dmax_df.iterrows(), total=len(dc50_dmax_df), desc='Extracting DC50/Dmax info'):\n",
+ "        assay = row[\"Assay (DC50/Dmax)\"]\n",
+ "        # Skip missing (non-string, e.g. NaN) or too-short assay descriptions\n",
+ "        if not isinstance(assay, str) or len(assay) < 5:\n",
+ "            continue\n",
+ "        extracted_info = extract_dc50_info(assay)\n",
+ "        extracted_info['DC50 (nM)'] = split_clean_str(\n",
+ "            row['DC50 (nM)'], return_floats=True)\n",
+ "        extracted_info['Dmax (%)'] = split_clean_str(\n",
+ "            row['Dmax (%)'], return_floats=True)\n",
+ "\n",
+ "        # The longest list among the extracted fields sets how many rows to emit\n",
+ "        max_len = max(len(v) for v in extracted_info.values()\n",
+ "                      if isinstance(v, list))\n",
+ "        # Convert the source row once; every emitted row starts from a copy of it\n",
+ "        row_dict = row.to_dict()\n",
+ "        # Shorter lists are cycled (j % len(v)); non-list values are repeated as-is.\n",
+ "        # `j` is used so the index does not reuse the outer loop's variable.\n",
+ "        for j in range(max_len):\n",
+ "            row_tmp = dict(row_dict)\n",
+ "            row_tmp.update({k: v[j % len(v)] if isinstance(v, list)\n",
+ "                            else v for k, v in extracted_info.items()})\n",
+ "            parsed_table.append(row_tmp)\n",
+ "\n",
+ "    parsed_table = pd.DataFrame(parsed_table)\n",
+ "    display(parsed_table.head())\n",
+ "    print(f'Parsed table len: {len(parsed_table)}')\n",
+ "    dfs[name] = parsed_table"
949
+ ]
950
+ },
951
+ {
952
+ "cell_type": "code",
953
+ "execution_count": 14,
954
+ "metadata": {},
955
+ "outputs": [],
956
+ "source": [
957
+ "def canonize_smiles(smi):\n",
+ "    \"\"\"Return the RDKit canonical SMILES string for `smi`.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if RDKit cannot parse `smi`.\n",
+ "    \"\"\"\n",
+ "    mol = Chem.MolFromSmiles(smi)\n",
+ "    # MolFromSmiles returns None for unparsable input; fail with the\n",
+ "    # offending string instead of an opaque error from MolToSmiles(None)\n",
+ "    if mol is None:\n",
+ "        raise ValueError(f'Invalid SMILES: {smi!r}')\n",
+ "    return Chem.MolToSmiles(mol)\n",
+ "\n",
+ "\n",
+ "# Canonicalize the SMILES column of both parsed datasets in place\n",
+ "for name in ('protac-db', 'protac-db-v2'):\n",
+ "    dfs[name]['Smiles'] = dfs[name]['Smiles'].apply(canonize_smiles)"
962
+ ]
963
+ },
964
+ {
965
+ "cell_type": "code",
966
+ "execution_count": 27,
967
+ "metadata": {},
968
+ "outputs": [
969
+ {
970
+ "name": "stdout",
971
+ "output_type": "stream",
972
+ "text": [
973
+ "Number of entries in protac-db: 1205\n",
974
+ "Number of entries in protac-db-v2: 2264\n",
975
+ "Number of shared entries: 1249\n",
976
+ "Number of total entries: 2232\n"
977
+ ]
978
+ }
979
+ ],
980
+ "source": [
981
+ "# Compare the two parsed datasets on the columns listed in predict_cols\n",
+ "predict_cols = [\"Smiles\", \"DC50 (nM)\", \"Dmax (%)\", \"E3 Ligase\", \"Uniprot\", \"Cell Type\"]\n",
+ "old_df, new_df = dfs[\"protac-db\"], dfs[\"protac-db-v2\"]\n",
+ "\n",
+ "# Get the number of entries in both dfs\n",
+ "print(f'Number of entries in protac-db: {len(old_df)}')\n",
+ "print(f'Number of entries in protac-db-v2: {len(new_df)}')\n",
+ "\n",
+ "# Get the number of entries shared between the two dfs\n",
+ "# NOTE: an inner merge yields one row per matching pair of keys, so keys that\n",
+ "# are duplicated in either frame are counted more than once; drop duplicates on\n",
+ "# predict_cols first if a count of unique shared entries is wanted.\n",
+ "shared_df = old_df.merge(new_df, on=predict_cols, how=\"inner\")\n",
+ "print(f'Number of shared entries: {len(shared_df)}')\n",
+ "\n",
+ "# Get the number of total entries without duplicates\n",
+ "# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)\n",
+ "combined_df = pd.concat([old_df, new_df])\n",
+ "print(f'Number of total entries: {len(combined_df.drop_duplicates(subset=predict_cols))}')"
989
+ ]
990
+ },
991
+ {
992
+ "cell_type": "code",
993
+ "execution_count": null,
994
+ "metadata": {},
995
+ "outputs": [],
996
+ "source": []
997
+ }
998
+ ],
999
+ "metadata": {
1000
+ "kernelspec": {
1001
+ "display_name": "Python 3 (ipykernel)",
1002
+ "language": "python",
1003
+ "name": "python3"
1004
+ },
1005
+ "language_info": {
1006
+ "codemirror_mode": {
1007
+ "name": "ipython",
1008
+ "version": 3
1009
+ },
1010
+ "file_extension": ".py",
1011
+ "mimetype": "text/x-python",
1012
+ "name": "python",
1013
+ "nbconvert_exporter": "python",
1014
+ "pygments_lexer": "ipython3",
1015
+ "version": "3.10.8"
1016
+ }
1017
+ },
1018
+ "nbformat": 4,
1019
+ "nbformat_minor": 2
1020
+ }