File size: 4,400 Bytes
5e01175 42d3d55 5e01175 ea572f9 5e01175 c06df22 ea572f9 c06df22 ea572f9 c06df22 ea572f9 c06df22 ea572f9 c06df22 5e01175 ea572f9 c06df22 5e01175 42d3d55 ea572f9 42d3d55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import os
import pkg_resources
import pickle
from typing import Dict, Optional, List
from .config import config
import h5py
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from joblib import Memory
# Per-user cache directory: ~/.cache/protac_degradation_predictor
home_dir = os.path.expanduser('~')
cachedir = os.path.join(home_dir, '.cache', 'protac_degradation_predictor')
# joblib Memory object rooted at `cachedir`; it is applied below as the
# `@memory.cache` decorator on the embedding loaders.
memory = Memory(cachedir, verbose=0)
@memory.cache
def load_protein2embedding(
    embeddings_path: Optional[str] = None,
) -> Dict[str, np.ndarray]:
    """ Load the protein embeddings from an HDF5 file.

    Every top-level key of the file is read fully into memory and stored as
    a NumPy array under the same key.

    Args:
        embeddings_path (Optional[str]): The path to the embeddings file. If
            None, the packaged 'data/uniprot2embedding.h5' resource is used.

    Returns:
        Dict[str, np.ndarray]: A dictionary mapping each key of the file to
            its embedding.
    """
    if embeddings_path is None:
        # Resolve the packaged resource to a filesystem path instead of
        # opening a stream: h5py then owns the only file handle and the
        # `with` block below closes it, so nothing is left open.
        embeddings_path = pkg_resources.resource_filename(
            __name__, 'data/uniprot2embedding.h5'
        )
    with h5py.File(embeddings_path, "r") as file:
        # `[:]` reads the whole dataset; np.array gives an array that stays
        # valid after the file is closed.
        return {
            sequence_id: np.array(file[sequence_id][:])
            for sequence_id in file.keys()
        }
@memory.cache
def load_cell2embedding(
    embeddings_path: Optional[str] = None,
) -> Dict[str, np.ndarray]:
    """ Load the cell line embeddings from a pickle file.

    Args:
        embeddings_path (Optional[str]): The path to the pickled embeddings.
            If None, the packaged 'data/cell2embedding.pkl' resource is used.

    Returns:
        Dict[str, np.ndarray]: The unpickled object, expected to be a
            dictionary of cell line embeddings.
    """
    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    if embeddings_path is not None:
        stream = open(embeddings_path, 'rb')
    else:
        stream = pkg_resources.resource_stream(__name__, 'data/cell2embedding.pkl')
    with stream as handle:
        return pickle.load(handle)
def avail_e3_ligases() -> List[str]:
    """ List the E3 ligases known to the configuration.

    Returns:
        List[str]: The keys of `config.e3_ligase2uniprot`.
    """
    ligase_names = config.e3_ligase2uniprot.keys()
    return list(ligase_names)
def avail_cell_lines() -> List[str]:
    """ List the cell lines that have an embedding.

    Returns:
        List[str]: The keys of the default cell line embedding dictionary.
    """
    cell2embedding = load_cell2embedding()
    return list(cell2embedding.keys())
def avail_uniprots() -> List[str]:
    """ List the Uniprot IDs that have an embedding.

    Returns:
        List[str]: The keys of the default protein embedding dictionary.
    """
    protein2embedding = load_protein2embedding()
    return list(protein2embedding.keys())
def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
    """ Get the Morgan fingerprint of a molecule.

    Args:
        smiles (str): The SMILES string of the molecule.
        morgan_fpgen: The Morgan fingerprint generator. If None, one is
            built from `config.morgan_radius` and `config.fingerprint_size`
            with chirality included.

    Returns:
        np.ndarray: The Morgan fingerprint, exactly as returned by the
            generator's `GetFingerprint`.
            NOTE(review): the annotation says np.ndarray, but no conversion
            happens here; presumably this is an RDKit bit-vector object --
            confirm against callers.

    Raises:
        ValueError: If `smiles` cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of deep inside the generator.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    if morgan_fpgen is None:
        morgan_fpgen = AllChem.GetMorganGenerator(
            radius=config.morgan_radius,
            fpSize=config.fingerprint_size,
            includeChirality=True,
        )
    return morgan_fpgen.GetFingerprint(mol)
def is_active(
    DC50: float,
    Dmax: float,
    pDC50_threshold: float = 7.0,
    Dmax_threshold: float = 0.8,
    oring: bool = False, # Deprecated
) -> bool:
    """ Check if a PROTAC is active based on DC50 and Dmax.

    DC50 is converted to pDC50 = -log10(DC50 * 1e-9) and Dmax to a fraction
    (Dmax / 100) before being compared against the thresholds.

    Args:
        DC50 (float): DC50 in nM. May be missing (None/NaN).
        Dmax (float): Dmax in %. May be missing (None/NaN).
        pDC50_threshold (float): Minimum pDC50 for activity.
        Dmax_threshold (float): Minimum Dmax, as a fraction, for activity.
        oring (bool): Deprecated. If True, a single known value that meets
            its threshold is enough to be active; if False, both values must
            be known.

    Returns:
        bool: False if any known value is below its threshold. Otherwise
            True if enough values are known (both, or at least one when
            `oring` is True), and np.nan if too many values are missing.
    """
    pDC50 = -np.log10(DC50 * 1e-9) if pd.notnull(DC50) else np.nan
    # Guard Dmax like DC50 so that a None value counts as missing instead of
    # raising a TypeError on the division.
    Dmax = Dmax / 100 if pd.notnull(Dmax) else np.nan

    # Test pDC50 itself, not DC50: the log of a negative DC50 is NaN.
    has_pDC50 = pd.notnull(pDC50)
    has_Dmax = pd.notnull(Dmax)

    # Any known value below its threshold makes the PROTAC inactive,
    # regardless of whether the other value is missing.
    if has_pDC50 and pDC50 < pDC50_threshold:
        return False
    if has_Dmax and Dmax < Dmax_threshold:
        return False

    # From here on, every known value has met its threshold, so the outcome
    # depends only on how many values are known.
    if oring:
        enough_known = has_pDC50 or has_Dmax
    else:
        enough_known = has_pDC50 and has_Dmax
    return True if enough_known else np.nan
def load_curated_dataset() -> pd.DataFrame:
    """ Load the curated PROTAC dataset as described in the paper: https://arxiv.org/abs/2406.02637

    The data is read from the packaged 'data/PROTAC-Degradation-DB.csv'
    resource.

    Returns:
        pd.DataFrame: The curated PROTAC dataset.
    """
    with pkg_resources.resource_stream(__name__, 'data/PROTAC-Degradation-DB.csv') as f:
        return pd.read_csv(f)