Commit
·
42d3d55
1
Parent(s):
367cf2c
Added support for listing available dataset data
Browse files
README.md
CHANGED
@@ -24,12 +24,16 @@ The code for data curation can be found in the Jupyter notebook [`data_curation.
|
|
24 |
|
25 |
## Installation
|
26 |
|
27 |
-
To install the package, open your terminal and run the following
|
28 |
|
29 |
```bash
|
|
|
|
|
30 |
pip install .
|
31 |
```
|
32 |
|
|
|
|
|
33 |
## 🎯 Usage
|
34 |
|
35 |
After installing the package, you can use it as follows:
|
@@ -56,10 +60,30 @@ print(f'The given PROTAC is: {"active" if active_protac else "inactive"}')
|
|
56 |
|
57 |
This example demonstrates how to predict the activity of a PROTAC molecule. The `is_protac_active` function takes the SMILES string of the PROTAC, the E3 ligase, the UniProt ID of the target protein, and the cell line as inputs. It returns whether the PROTAC is active or not.
|
58 |
|
|
|
|
|
|
|
|
|
|
|
59 |
## Training
|
60 |
|
61 |
The code for training the model can be found in the file [`run_experiments.py`](src/run_experiments.py).
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
## License
|
64 |
|
65 |
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
|
24 |
|
25 |
## Installation
|
26 |
|
27 |
+
To install the package, open your terminal and run the following commands:
|
28 |
|
29 |
```bash
|
30 |
+
git clone https://github.com/ribesstefano/PROTAC-Degradation-Predictor.git
|
31 |
+
cd PROTAC-Degradation-Predictor
|
32 |
pip install .
|
33 |
```
|
34 |
|
35 |
+
The package has been developed on a Linux machine with Python 3.10.8. It is recommended to use a virtual environment to avoid conflicts with other packages.
|
36 |
+
|
37 |
## 🎯 Usage
|
38 |
|
39 |
After installing the package, you can use it as follows:
|
|
|
60 |
|
61 |
This example demonstrates how to predict the activity of a PROTAC molecule. The `is_protac_active` function takes the SMILES string of the PROTAC, the E3 ligase, the UniProt ID of the target protein, and the cell line as inputs. It returns whether the PROTAC is active or not.
|
62 |
|
63 |
+
The function supports batch computation by passing lists of SMILES strings, E3 ligases, UniProt IDs, and cell lines. In this case, it returns a list of booleans indicating the activity of each PROTAC.
|
64 |
+
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
## Training
|
69 |
|
70 |
The code for training the model can be found in the file [`run_experiments.py`](src/run_experiments.py).
|
71 |
|
72 |
+
## Citation
|
73 |
+
|
74 |
+
If you use this tool in your research, please cite the following paper:
|
75 |
+
|
76 |
+
```
|
77 |
+
@misc{ribes2024modeling,
|
78 |
+
title={Modeling PROTAC Degradation Activity with Machine Learning},
|
79 |
+
author={Stefano Ribes and Eva Nittinger and Christian Tyrchan and Rocío Mercado},
|
80 |
+
year={2024},
|
81 |
+
eprint={2406.02637},
|
82 |
+
archivePrefix={arXiv},
|
83 |
+
primaryClass={q-bio.QM}
|
84 |
+
}
|
85 |
+
```
|
86 |
+
|
87 |
## License
|
88 |
|
89 |
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
protac_degradation_predictor/__init__.py
CHANGED
@@ -3,6 +3,10 @@ from .data_utils import (
|
|
3 |
load_cell2embedding,
|
4 |
get_fingerprint,
|
5 |
is_active,
|
|
|
|
|
|
|
|
|
6 |
)
|
7 |
from .protac_dataset import (
|
8 |
PROTAC_Dataset,
|
|
|
3 |
load_cell2embedding,
|
4 |
get_fingerprint,
|
5 |
is_active,
|
6 |
+
load_curated_dataset,
|
7 |
+
avail_cell_lines,
|
8 |
+
avail_e3_ligases,
|
9 |
+
avail_uniprots,
|
10 |
)
|
11 |
from .protac_dataset import (
|
12 |
PROTAC_Dataset,
|
protac_degradation_predictor/data_utils.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import os
|
2 |
import pkg_resources
|
3 |
import pickle
|
4 |
-
from typing import Dict, Optional
|
5 |
|
6 |
from .config import config
|
7 |
|
@@ -61,6 +61,33 @@ def load_cell2embedding(
|
|
61 |
return cell2embedding
|
62 |
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
|
65 |
""" Get the Morgan fingerprint of a molecule.
|
66 |
|
@@ -113,4 +140,15 @@ def is_active(
|
|
113 |
if pd.notnull(pDC50) and pd.notnull(Dmax):
|
114 |
return True if pDC50 >= pDC50_threshold and Dmax >= Dmax_threshold else False
|
115 |
else:
|
116 |
-
return np.nan
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import pkg_resources
|
3 |
import pickle
|
4 |
+
from typing import Dict, Optional, List
|
5 |
|
6 |
from .config import config
|
7 |
|
|
|
61 |
return cell2embedding
|
62 |
|
63 |
|
64 |
+
def avail_e3_ligases() -> List[str]:
|
65 |
+
""" Get the available E3 ligases.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
List[str]: The available E3 ligases.
|
69 |
+
"""
|
70 |
+
return list(config.e3_ligase2uniprot.keys())
|
71 |
+
|
72 |
+
|
73 |
+
def avail_cell_lines() -> List[str]:
|
74 |
+
""" Get the available cell lines.
|
75 |
+
|
76 |
+
Returns:
|
77 |
+
List[str]: The available cell lines.
|
78 |
+
"""
|
79 |
+
return list(load_cell2embedding().keys())
|
80 |
+
|
81 |
+
|
82 |
+
def avail_uniprots() -> List[str]:
|
83 |
+
""" Get the available Uniprot IDs.
|
84 |
+
|
85 |
+
Returns:
|
86 |
+
List[str]: The available Uniprot IDs.
|
87 |
+
"""
|
88 |
+
return list(load_protein2embedding().keys())
|
89 |
+
|
90 |
+
|
91 |
def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
|
92 |
""" Get the Morgan fingerprint of a molecule.
|
93 |
|
|
|
140 |
if pd.notnull(pDC50) and pd.notnull(Dmax):
|
141 |
return True if pDC50 >= pDC50_threshold and Dmax >= Dmax_threshold else False
|
142 |
else:
|
143 |
+
return np.nan
|
144 |
+
|
145 |
+
|
146 |
+
def load_curated_dataset() -> pd.DataFrame:
|
147 |
+
""" Load the curated PROTAC dataset as described in the paper: https://arxiv.org/abs/2406.02637
|
148 |
+
|
149 |
+
Returns:
|
150 |
+
pd.DataFrame: The curated PROTAC dataset.
|
151 |
+
"""
|
152 |
+
with pkg_resources.resource_stream(__name__, 'data/PROTAC-Degradation-DB.csv') as f:
|
153 |
+
protac_df = pd.read_csv(f)
|
154 |
+
return protac_df
|
protac_degradation_predictor/protac_degradation_predictor.py
CHANGED
@@ -93,12 +93,18 @@ def get_protac_active_proba(
|
|
93 |
prescaled_embeddings=False, # Normalization performed by the model
|
94 |
)
|
95 |
preds[ckpt_path] = sigmoid(pred).detach().cpu().numpy().flatten()
|
|
|
96 |
# NOTE: The predictions array has shape: (n_models, batch_size)
|
97 |
preds = np.array(list(preds.values()))
|
|
|
|
|
|
|
|
|
|
|
98 |
return {
|
99 |
'preds': preds,
|
100 |
-
'mean':
|
101 |
-
'majority_vote':
|
102 |
}
|
103 |
|
104 |
|
|
|
93 |
prescaled_embeddings=False, # Normalization performed by the model
|
94 |
)
|
95 |
preds[ckpt_path] = sigmoid(pred).detach().cpu().numpy().flatten()
|
96 |
+
|
97 |
# NOTE: The predictions array has shape: (n_models, batch_size)
|
98 |
preds = np.array(list(preds.values()))
|
99 |
+
mean_preds = np.mean(preds, axis=0)
|
100 |
+
# Return a single value if not list as input
|
101 |
+
preds = preds if isinstance(protac_smiles, list) else preds[0]
|
102 |
+
mean_preds = mean_preds if isinstance(protac_smiles, list) else mean_preds[0]  # rebind mean_preds (was misspelled `means_preds`, leaving the returned 'mean'/'majority_vote' as arrays for scalar input)
|
103 |
+
|
104 |
return {
|
105 |
'preds': preds,
|
106 |
+
'mean': mean_preds,
|
107 |
+
'majority_vote': mean_preds > 0.5,
|
108 |
}
|
109 |
|
110 |
|