ribesstefano committed on
Commit
42d3d55
·
1 Parent(s): 367cf2c

Added support for listing available dataset data

Browse files
README.md CHANGED
@@ -24,12 +24,16 @@ The code for data curation can be found in the Jupyter notebook [`data_curation.
24
 
25
  ## 🚀 Installation
26
 
27
- To install the package, open your terminal and run the following command:
28
 
29
  ```bash
 
 
30
  pip install .
31
  ```
32
 
 
 
33
  ## 🎯 Usage
34
 
35
  After installing the package, you can use it as follows:
@@ -56,10 +60,30 @@ print(f'The given PROTAC is: {"active" if active_protac else "inactive"}')
56
 
57
  This example demonstrates how to predict the activity of a PROTAC molecule. The `is_protac_active` function takes the SMILES string of the PROTAC, the E3 ligase, the UniProt ID of the target protein, and the cell line as inputs. It returns whether the PROTAC is active or not.
58
 
 
 
 
 
 
59
  ## 📈 Training
60
 
61
  The code for training the model can be found in the file [`run_experiments.py`](src/run_experiments.py).
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  ## 📜 License
64
 
65
  This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
 
24
 
25
  ## 🚀 Installation
26
 
27
+ To install the package, open your terminal and run the following commands:
28
 
29
  ```bash
30
+ git clone https://github.com/ribesstefano/PROTAC-Degradation-Predictor.git
31
+ cd PROTAC-Degradation-Predictor
32
  pip install .
33
  ```
34
 
35
+ The package has been developed on a Linux machine with Python 3.10.8. It is recommended to use a virtual environment to avoid conflicts with other packages.
36
+
37
  ## 🎯 Usage
38
 
39
  After installing the package, you can use it as follows:
 
60
 
61
  This example demonstrates how to predict the activity of a PROTAC molecule. The `is_protac_active` function takes the SMILES string of the PROTAC, the E3 ligase, the UniProt ID of the target protein, and the cell line as inputs. It returns whether the PROTAC is active or not.
62
 
63
+ The function supports batch computation by passing lists of SMILES strings, E3 ligases, UniProt IDs, and cell lines. In this case, it returns a list of booleans indicating the activity of each PROTAC.
64
+
65
+
66
+
67
+
68
  ## 📈 Training
69
 
70
  The code for training the model can be found in the file [`run_experiments.py`](src/run_experiments.py).
71
 
72
+ ## 📄 Citation
73
+
74
+ If you use this tool in your research, please cite the following paper:
75
+
76
+ ```
77
+ @misc{ribes2024modeling,
78
+ title={Modeling PROTAC Degradation Activity with Machine Learning},
79
+ author={Stefano Ribes and Eva Nittinger and Christian Tyrchan and Rocío Mercado},
80
+ year={2024},
81
+ eprint={2406.02637},
82
+ archivePrefix={arXiv},
83
+ primaryClass={q-bio.QM}
84
+ }
85
+ ```
86
+
87
  ## 📜 License
88
 
89
  This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
protac_degradation_predictor/__init__.py CHANGED
@@ -3,6 +3,10 @@ from .data_utils import (
3
  load_cell2embedding,
4
  get_fingerprint,
5
  is_active,
 
 
 
 
6
  )
7
  from .protac_dataset import (
8
  PROTAC_Dataset,
 
3
  load_cell2embedding,
4
  get_fingerprint,
5
  is_active,
6
+ load_curated_dataset,
7
+ avail_cell_lines,
8
+ avail_e3_ligases,
9
+ avail_uniprots,
10
  )
11
  from .protac_dataset import (
12
  PROTAC_Dataset,
protac_degradation_predictor/data_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import os
2
  import pkg_resources
3
  import pickle
4
- from typing import Dict, Optional
5
 
6
  from .config import config
7
 
@@ -61,6 +61,33 @@ def load_cell2embedding(
61
  return cell2embedding
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
65
  """ Get the Morgan fingerprint of a molecule.
66
 
@@ -113,4 +140,15 @@ def is_active(
113
  if pd.notnull(pDC50) and pd.notnull(Dmax):
114
  return True if pDC50 >= pDC50_threshold and Dmax >= Dmax_threshold else False
115
  else:
116
- return np.nan
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import pkg_resources
3
  import pickle
4
+ from typing import Dict, Optional, List
5
 
6
  from .config import config
7
 
 
61
  return cell2embedding
62
 
63
 
64
+ def avail_e3_ligases() -> List[str]:
65
+ """ Get the available E3 ligases.
66
+
67
+ Returns:
68
+ List[str]: The available E3 ligases.
69
+ """
70
+ return list(config.e3_ligase2uniprot.keys())
71
+
72
+
73
+ def avail_cell_lines() -> List[str]:
74
+ """ Get the available cell lines.
75
+
76
+ Returns:
77
+ List[str]: The available cell lines.
78
+ """
79
+ return list(load_cell2embedding().keys())
80
+
81
+
82
+ def avail_uniprots() -> List[str]:
83
+ """ Get the available Uniprot IDs.
84
+
85
+ Returns:
86
+ List[str]: The available Uniprot IDs.
87
+ """
88
+ return list(load_protein2embedding().keys())
89
+
90
+
91
  def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
92
  """ Get the Morgan fingerprint of a molecule.
93
 
 
140
  if pd.notnull(pDC50) and pd.notnull(Dmax):
141
  return True if pDC50 >= pDC50_threshold and Dmax >= Dmax_threshold else False
142
  else:
143
+ return np.nan
144
+
145
+
146
+ def load_curated_dataset() -> pd.DataFrame:
147
+ """ Load the curated PROTAC dataset as described in the paper: https://arxiv.org/abs/2406.02637
148
+
149
+ Returns:
150
+ pd.DataFrame: The curated PROTAC dataset.
151
+ """
152
+ with pkg_resources.resource_stream(__name__, 'data/PROTAC-Degradation-DB.csv') as f:
153
+ protac_df = pd.read_csv(f)
154
+ return protac_df
protac_degradation_predictor/protac_degradation_predictor.py CHANGED
@@ -93,12 +93,18 @@ def get_protac_active_proba(
93
  prescaled_embeddings=False, # Normalization performed by the model
94
  )
95
  preds[ckpt_path] = sigmoid(pred).detach().cpu().numpy().flatten()
 
96
  # NOTE: The predictions array has shape: (n_models, batch_size)
97
  preds = np.array(list(preds.values()))
 
 
 
 
 
98
  return {
99
  'preds': preds,
100
- 'mean': np.mean(preds, axis=0),
101
- 'majority_vote': np.mean(preds, axis=0) > 0.5,
102
  }
103
 
104
 
 
93
  prescaled_embeddings=False, # Normalization performed by the model
94
  )
95
  preds[ckpt_path] = sigmoid(pred).detach().cpu().numpy().flatten()
96
+
97
  # NOTE: The predictions array has shape: (n_models, batch_size)
98
  preds = np.array(list(preds.values()))
99
+ mean_preds = np.mean(preds, axis=0)
100
+ # Return a single value if not list as input
101
+ preds = preds if isinstance(protac_smiles, list) else preds[0]
102
+ means_preds = mean_preds if isinstance(protac_smiles, list) else mean_preds[0]
103
+
104
  return {
105
  'preds': preds,
106
+ 'mean': mean_preds,
107
+ 'majority_vote': mean_preds > 0.5,
108
  }
109
 
110