File size: 4,400 Bytes
5e01175
 
 
42d3d55
5e01175
ea572f9
5e01175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c06df22
 
ea572f9
c06df22
 
ea572f9
 
c06df22
ea572f9
c06df22
ea572f9
 
c06df22
 
 
 
 
 
 
5e01175
 
 
ea572f9
 
 
 
 
 
 
 
 
 
 
 
c06df22
 
 
 
 
5e01175
 
 
42d3d55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea572f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42d3d55
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
import pkg_resources
import pickle
from typing import Dict, Optional, List

from .config import config

import h5py
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from joblib import Memory


# On-disk cache location: <user home>/.cache/protac_degradation_predictor
home_dir = os.path.expanduser('~')
cachedir = os.path.join(home_dir, '.cache', 'protac_degradation_predictor')
# joblib Memory rooted at `cachedir`; used below via the `@memory.cache`
# decorator on the embedding loaders. verbose=0 silences joblib's logging.
memory = Memory(cachedir, verbose=0)


@memory.cache
def load_protein2embedding(
    embeddings_path: Optional[str] = None,
) -> Dict[str, np.ndarray]:
    """ Load the protein embeddings from an HDF5 file.

    The result is cached through the module-level joblib `memory` object.

    Args:
        embeddings_path (str): The path to the embeddings file. If None, the
            packaged resource 'data/uniprot2embedding.h5' is used.

    Returns:
        Dict[str, np.ndarray]: A dictionary mapping each top-level key of the
            HDF5 file to its dataset content as a NumPy array.
    """
    def _read_embeddings(source) -> Dict[str, np.ndarray]:
        # `source` is either a filesystem path or an open binary stream.
        with h5py.File(source, "r") as file:
            return {
                sequence_id: np.array(file[sequence_id][:])
                for sequence_id in file.keys()
            }

    if embeddings_path is None:
        # Close the packaged resource stream once the data has been read,
        # instead of leaving the handle open (same pattern as
        # load_cell2embedding).
        with pkg_resources.resource_stream(__name__, 'data/uniprot2embedding.h5') as stream:
            return _read_embeddings(stream)
    return _read_embeddings(embeddings_path)


@memory.cache
def load_cell2embedding(
        embeddings_path: Optional[str] = None,
) -> Dict[str, np.ndarray]:
    """ Read the pickled cell line embeddings.

    The result is cached through the module-level joblib `memory` object.

    Args:
        embeddings_path (str): The path to the pickle file. If None, the
            packaged resource 'data/cell2embedding.pkl' is read instead.

    Returns:
        Dict[str, np.ndarray]: The unpickled cell line embeddings.
    """
    # NOTE: pickle executes arbitrary code on load; only pass trusted files.
    if embeddings_path is not None:
        stream = open(embeddings_path, 'rb')
    else:
        stream = pkg_resources.resource_stream(__name__, 'data/cell2embedding.pkl')
    with stream as handle:
        return pickle.load(handle)


def avail_e3_ligases() -> List[str]:
    """ List the E3 ligases known to the configuration.

    Returns:
        List[str]: The keys of `config.e3_ligase2uniprot`.
    """
    ligase_names = config.e3_ligase2uniprot.keys()
    return [*ligase_names]


def avail_cell_lines() -> List[str]:
    """ List the cell lines that have an embedding.

    Returns:
        List[str]: The keys of the default cell line embedding dictionary.
    """
    cell2embedding = load_cell2embedding()
    return [*cell2embedding.keys()]


def avail_uniprots() -> List[str]:
    """ List the Uniprot IDs that have an embedding.

    Returns:
        List[str]: The keys of the default protein embedding dictionary.
    """
    protein2embedding = load_protein2embedding()
    return [*protein2embedding.keys()]


def get_fingerprint(smiles: str, morgan_fpgen = None) -> np.ndarray:
    """ Compute the Morgan fingerprint of a molecule given as SMILES.

    Args:
        smiles (str): The SMILES string of the molecule.
        morgan_fpgen: The Morgan fingerprint generator. If None, one is built
            from `config.morgan_radius` and `config.fingerprint_size`, with
            chirality included.

    Returns:
        The result of `morgan_fpgen.GetFingerprint` on the parsed molecule.
        NOTE(review): annotated as np.ndarray, but RDKit's `GetFingerprint`
        presumably returns a bit vector rather than an array - confirm.
    """
    generator = morgan_fpgen
    if generator is None:
        generator = AllChem.GetMorganGenerator(
            radius=config.morgan_radius,
            fpSize=config.fingerprint_size,
            includeChirality=True,
        )
    # NOTE(review): the parsed molecule is passed on unchecked; presumably
    # MolFromSmiles yields None for an unparsable SMILES - confirm.
    molecule = Chem.MolFromSmiles(smiles)
    return generator.GetFingerprint(molecule)


def is_active(
        DC50: float,
        Dmax: float,
        pDC50_threshold: float = 7.0,
        Dmax_threshold: float = 0.8,
        oring: bool = False, # Deprecated
) -> bool:
    """ Check if a PROTAC is active based on DC50 and Dmax.

    DC50 is converted to pDC50 = -log10(DC50 * 1e-9) and Dmax to a fraction
    (Dmax / 100) before being compared against the thresholds. Missing values
    (None or NaN) are tolerated for both inputs.

    Args:
        DC50 (float): DC50 in nM
        Dmax (float): Dmax in %
        pDC50_threshold (float): Minimum pDC50 for a PROTAC to be active.
        Dmax_threshold (float): Minimum Dmax (as a fraction) to be active.
        oring (bool): Deprecated. If True, a single available value that
            passes its threshold is enough to return True.

    Returns:
        bool: False if any available value is below its threshold. Otherwise
            True if both values are available (or, with `oring`, at least
            one is), and np.nan if too few values are available to decide.
    """
    pDC50 = -np.log10(DC50 * 1e-9) if pd.notnull(DC50) else np.nan
    # Guard Dmax like DC50 so that a None Dmax counts as missing instead of
    # raising a TypeError on the division.
    Dmax = Dmax / 100 if pd.notnull(Dmax) else np.nan

    has_pDC50 = pd.notnull(pDC50)
    has_Dmax = pd.notnull(Dmax)

    # Any available value below its threshold makes the PROTAC inactive.
    if has_pDC50 and pDC50 < pDC50_threshold:
        return False
    if has_Dmax and Dmax < Dmax_threshold:
        return False

    # From here on, every available value has passed its threshold, so only
    # the availability of the values decides between True and NaN.
    if oring:
        return True if (has_pDC50 or has_Dmax) else np.nan
    return True if (has_pDC50 and has_Dmax) else np.nan


def load_curated_dataset() -> pd.DataFrame:
    """ Read the curated PROTAC dataset shipped with the package.

    The dataset is described in the paper: https://arxiv.org/abs/2406.02637

    Returns:
        pd.DataFrame: The content of 'data/PROTAC-Degradation-DB.csv'.
    """
    csv_stream = pkg_resources.resource_stream(__name__, 'data/PROTAC-Degradation-DB.csv')
    with csv_stream as csv_file:
        return pd.read_csv(csv_file)