File size: 1,339 Bytes
5e01175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import pkg_resources
import pickle
from typing import Dict

from config import config

import h5py
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from joblib import Memory


# On-disk memoization setup: joblib stores cached call results in a per-user
# directory, ~/.cache/protac_degradation_predictor.
home_dir = os.path.expanduser('~')
cachedir = os.path.join(home_dir, '.cache', 'protac_degradation_predictor')
# `memory.cache` is applied as a decorator to the data loaders in this module;
# verbose=0 is joblib's lowest verbosity setting.
memory = Memory(cachedir, verbose=0)


@memory.cache
def load_protein2embedding() -> Dict[str, np.ndarray]:
    """Load the packaged protein embeddings from ``data/uniprot2embedding.h5``.

    The result is memoized on disk through the module-level joblib ``memory``,
    so the HDF5 file is only parsed when no cached result is available.

    Returns:
        A dict mapping each top-level key of the HDF5 file (presumably a
        UniProt ID, judging by the file name -- confirm) to the full content
        of the corresponding dataset as a NumPy array.
    """
    # `resource_stream` returns an open binary file-like object, not a path.
    # h5py accepts file-like objects, but the stream must be closed explicitly,
    # hence the outer `with`.
    with pkg_resources.resource_stream(__name__, 'data/uniprot2embedding.h5') as stream:
        with h5py.File(stream, "r") as file:
            # `[:]` reads the whole dataset into memory, so the arrays remain
            # valid after the file is closed.
            return {
                sequence_id: np.asarray(file[sequence_id][:])
                for sequence_id in file.keys()
            }


@memory.cache
def load_cell2embedding() -> Dict[str, np.ndarray]:
    """Load the packaged cell embeddings from ``data/cell2embedding.pkl``.

    The result is memoized on disk through the module-level joblib ``memory``.

    Returns:
        The object stored in the pickle file; per the annotation, a dict
        mapping a cell identifier to its embedding array (the exact key
        format is not visible here -- confirm against the data file).
    """
    # `resource_stream` already returns an open binary stream, so unpickle from
    # it directly. Passing it to `open()` raises TypeError, because `open()`
    # expects a path, not a file object.
    # NOTE: pickle can execute arbitrary code while loading. This is acceptable
    # only because the file ships inside the package; never point this at
    # untrusted data.
    with pkg_resources.resource_stream(__name__, 'data/cell2embedding.pkl') as stream:
        return pickle.load(stream)


def get_fingerprint(smiles: str) -> np.ndarray:
    """Compute the chirality-aware Morgan fingerprint of a molecule.

    The radius and bit size are taken from ``config.morgan_radius`` and
    ``config.fingerprint_size``.

    NOTE(review): the return annotation says ``np.ndarray``, but RDKit's
    ``GetFingerprint`` appears to return a bit-vector object rather than a
    NumPy array -- confirm what callers expect before changing either side.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The result of ``GetFingerprint`` on the parsed molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None. Fail here
        # with a readable message instead of an opaque argument-type error
        # from inside the fingerprint generator.
        raise ValueError(f'Invalid SMILES string: {smiles!r}')
    morgan_fpgen = AllChem.GetMorganGenerator(
        radius=config.morgan_radius,
        fpSize=config.fingerprint_size,
        includeChirality=True,
    )
    return morgan_fpgen.GetFingerprint(mol)