|
import os |
|
import sys |
|
|
|
import numpy as np |
|
import pandas as pd |
|
import streamlit as st |
|
|
|
|
|
from rdkit import Chem |
|
from rdkit.Chem import Draw |
|
|
|
# Make the project's `src/` directory importable. The path is resolved against
# the current working directory, not against this file's location.
sys.path.insert(0, os.path.abspath("src/"))

# Streamlit requires set_page_config to run before any other Streamlit command,
# so it stays ahead of the title/markdown calls below.
st.set_page_config(layout="wide")

# Directory containing this script, and its `data` sub-directory.
basepath = os.path.dirname(__file__)
datapath = os.path.join(basepath, "data")

# Page header: title, a spacer, and the project links.
st.title('HyperDTI: Task-conditioned modeling of drug-target interactions.\n')
st.markdown('')
# The GitHub URL previously read "https://https://github.com/..." (scheme
# doubled), which produced a dead link; it now carries a single scheme.
st.markdown(
    """
    🧬 Github: [ml-jku/hyper-dti](https://github.com/ml-jku/hyper-dti) 📝 NeurIPS 2022 AI4Science workshop paper: [OpenReview](https://openreview.net/forum?id=dIX34JWnIAL)\n
    """
)
# Persistent banner shown on every page while the app is unfinished.
st.error('WARNING! This app is currently under development and should not be used!')
|
|
|
|
|
def about_page():
    """Render the "About" page: the project abstract and the architecture figure."""
    # Abstract text shown as Markdown under an "About" heading.
    abstract_md = """
        ### About

        HyperNetworks have been established as an effective technique to achieve fast adaptation of parameters for
        neural networks. Recently, HyperNetwork predictions conditioned on descriptors of tasks have improved
        multi-task generalization in various domains, such as personalized federated learning and neural architecture
        search. Especially powerful results were achieved in few- and zero-shot settings, attributed to the increased
        information sharing by the HyperNetwork. With the rise of new diseases fast discovery of drugs is needed which
        requires models that are able to generalize drug-target interaction predictions in low-data scenarios.

        In this work, we propose the HyperPCM model, a task-conditioned HyperNetwork approach for the problem of
        predicting drug-target interactions in drug discovery. Our model learns to generate a QSAR model specialized on
        a given protein target. We demonstrate state-of-the-art performance over previous methods on multiple
        well-known benchmarks, particularly in zero-shot settings for unseen protein targets.
        """
    st.markdown(abstract_md)

    # Architecture overview figure, loaded from a path relative to the
    # working directory.
    figure_path = 'figures/hyper-dti.png'
    figure_caption = 'Overview of HyperPCM architecture.'
    st.image(figure_path, caption=figure_caption)
|
|
|
|
|
''' |
|
def predict_dti(): |
|
st.markdown('## Predict drug-target interaction') |
|
|
|
st.write('In the future this page can be used to predict interactions between a query drug compound and a query protein target by the HyperPCM model.') |
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
st.markdown('### Drug') |
|
|
|
mol_col1, mol_col2 = st.columns(2) |
|
|
|
with mol_col1: |
|
smiles = st.text_input('Enter query SMILES', value='CC(=O)OC1=CC=CC=C1C(=O)O', placeholder='CC(=O)OC1=CC=CC=C1C(=O)O') |
|
if smiles: |
|
mol = Chem.MolFromSmiles(smiles) |
|
mol_img = Chem.Draw.MolToImage(mol) |
|
st.image(mol_img) #, width = 140) |
|
|
|
with mol_col2: |
|
selected_encoder = st.selectbox( |
|
'Select encoder',('None', 'CDDD', 'MolBERT', 'Dummy') |
|
) |
|
if smiles: |
|
if selected_encoder == 'CDDD': |
|
from cddd.inference import InferenceModel |
|
CDDD_MODEL_DIR = 'src/encoders/cddd' |
|
cddd_model = InferenceModel(CDDD_MODEL_DIR) |
|
drug_embedding = cddd_model.seq_to_emb([smiles]) |
|
#from huggingface_hub import hf_hub_download |
|
#precomputed_embs = f'{selected_encoder}_encoding.csv' |
|
#REPO_ID = "emmas96/Lenselink" |
|
#embs_path = hf_hub_download(REPO_ID, precomputed_embs) |
|
#embs = pd.read_csv(embs_path) |
|
#embedding = embs[smiles] |
|
elif selected_encoder == 'MolBERT': |
|
from molbert.utils.featurizer.molbert_featurizer import MolBertFeaturizer |
|
from huggingface_hub import hf_hub_download |
|
CDDD_MODEL_DIR = 'encoders/molbert/last.ckpt' |
|
REPO_ID = "emmas96/hyperpcm" |
|
checkpoint_path = hf_hub_download(REPO_ID, MOLBERT_MODEL_DIR) |
|
molbert_model = MolBertFeaturizer(checkpoint_path, max_seq_len=500, embedding_type='average-1-cat-pooled') |
|
drug_embedding = molbert_model.transform([smiles]) |
|
elif selected_encoder == 'Dummy': |
|
drug_embedding = [0,1,2,3,4,5] |
|
else: |
|
drug_embedding = None |
|
st.image('figures/molecule_encoder.png') |
|
st.warning('Choose encoder above...') |
|
|
|
if drug_embedding is not None: |
|
st.image('figures/molecule_encoder_done.png') |
|
st.success('Encoding complete.') |
|
|
|
with col2: |
|
st.markdown('### Target') |
|
|
|
prot_col1, prot_col2 = st.columns(2) |
|
|
|
with prot_col1: |
|
sequence = st.text_input('Enter query amino-acid sequence', value='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA', placeholder='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA') |
|
|
|
if sequence == 'HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA': |
|
st.image('figures/ex_protein.jpeg') |
|
elif sequence: |
|
st.error('Visualization coming soon...') |
|
|
|
with prot_col2: |
|
selected_encoder = st.selectbox( |
|
'Select encoder for protein target',('None', 'SeqVec', 'UniRep', 'ESM-1b', 'ProtT5') |
|
) |
|
|
|
if sequence: |
|
if selected_encoder == 'SeqVec': |
|
with st.spinner('Encoding in progress...'): |
|
from bio_embeddings.embed import SeqVecEmbedder |
|
encoder = SeqVecEmbedder() |
|
embeddings = encoder.embed_batch([sequence]) |
|
for emb in embeddings: |
|
prot_embedding = encoder.reduce_per_protein(emb) |
|
break |
|
elif selected_encoder == 'UniRep': |
|
with st.spinner('Encoding in progress...'): |
|
from jax_unirep.utils import load_params |
|
params = load_params() |
|
from jax_unirep.featurize import get_reps |
|
embedding, h_final, c_final = get_reps([sequence]) |
|
prot_embedding = embedding.mean(axis=0) |
|
elif selected_encoder == 'ESM-1b': |
|
with st.spinner('Encoding in progress...'): |
|
from bio_embeddings.embed import ESM1bEmbedder |
|
encoder = ESM1bEmbedder() |
|
embeddings = encoder.embed_batch([sequence]) |
|
for emb in embeddings: |
|
prot_embedding = encoder.reduce_per_protein(emb) |
|
break |
|
elif selected_encoder == 'ProtT5': |
|
with st.spinner('Encoding in progress...'): |
|
from bio_embeddings.embed import ProtTransT5XLU50Embedder |
|
encoder = ProtTransT5XLU50Embedder() |
|
embeddings = encoder.embed_batch([sequence]) |
|
for emb in embeddings: |
|
prot_embedding = encoder.reduce_per_protein(emb) |
|
break |
|
else: |
|
prot_embedding = None |
|
st.image('figures/protein_encoder.png') |
|
st.warning('Choose encoder above...') |
|
|
|
if prot_embedding is not None: |
|
st.image('figures/protein_encoder_done.png') |
|
st.success('Encoding complete.') |
|
|
|
if drug_embedding is None or prot_embedding is None: |
|
st.warning('Waiting for both drug and target embeddings to be computed...') |
|
else: |
|
st.markdown('### Inference') |
|
|
|
import time |
|
progress_text = "HyperPCM predicts the interaction between the query drug compound toward the query protein target. Please wait." |
|
my_bar = st.progress(0, text=progress_text) |
|
for i in range(100): |
|
time.sleep(0.1) |
|
my_bar.progress(i + 1, text=progress_text) |
|
my_bar.progress(100, text="HyperPCM predicts the interaction between the query drug compound toward the query protein target. Done.") |
|
|
|
st.markdown('### Interaction') |
|
st.write('HyperPCM predicts an activity of xxx pChEMBL.') |
|
''' |
|
|
|
|
|
def retrieval():
    """Render the "Retrieve Top-k" page.

    This is a placeholder page: no model is queried. The user picks a dataset
    and a value of k, and the page then shows a fixed list of five example
    molecules in a five-column grid, with ``selected_k // 5`` images stacked
    in each column.
    """
    st.markdown('## Retrieve top-k most active drug compounds')

    st.write('In the future this page will retrieve the top-k drug compounds that are predicted to have the highest activity toward the given protein target from either the Lenselink or Davis datasets.')

    st.markdown('### Target')

    # `selected_dataset` and `selected_k` used to be read below without ever
    # being assigned, so opening this page raised NameError. They are now
    # provided by the same two selectors (and heading) that the disabled full
    # implementation of this page uses.
    st.markdown('### Retrieval')
    dataset_col, k_col = st.columns(2)
    with dataset_col:
        selected_dataset = st.selectbox(
            'Select dataset from which the drug compounds should be retrieved',
            ('Lenselink', 'Davis'),
        )
    with k_col:
        selected_k = st.selectbox(
            'Select the top-k number of drug compounds to retrieve',
            (5, 10, 15, 20),
        )

    st.write(f'The top-{selected_k} most active drug compounds from {selected_dataset} predicted by HyperPCM are: ')

    # Fixed example molecules standing in for real predictions, one per column.
    dummy_smiles = [
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'COc1cc(C=O)ccc1O',
        'CC(=O)Nc1ccc(O)cc1',
        'CC(=O)Nc1ccc(OS(=O)(=O)O)cc1',
        'CC(=O)Nc1ccc(O[C@@H]2O[C@H](C(=O)O)[C@@H](O)[C@H](O)[C@H]2O)cc1',
    ]

    # Every selectable k is a multiple of the column count, so integer division
    # gives the exact number of images per column.
    rows_per_column = selected_k // len(dummy_smiles)

    for col, smiles in zip(st.columns(len(dummy_smiles)), dummy_smiles):
        # The same molecule fills the whole column, so parse and draw it once
        # instead of once per row.
        mol_img = Draw.MolToImage(Chem.MolFromSmiles(smiles))
        with col:
            for _ in range(rows_per_column):
                st.image(mol_img)
|
|
|
''' |
|
col1, col2, col3, col4 = st.columns(4) |
|
with col2: |
|
sequence = st.text_input('Enter query amino-acid sequence', value='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA', placeholder='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA') |
|
if sequence == 'HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA': |
|
st.image('figures/ex_protein.jpeg') |
|
elif sequence: |
|
st.error('Visualization coming soon...') |
|
|
|
with col3: |
|
selected_encoder = st.selectbox( |
|
'Select encoder for protein target',('SeqVec', 'None') |
|
) |
|
if sequence: |
|
if selected_encoder == 'SeqVec': |
|
st.image('figures/protein_encoder_done.png') |
|
with st.spinner('Encoding in progress...'): |
|
from bio_embeddings.embed import SeqVecEmbedder |
|
encoder = SeqVecEmbedder() |
|
embeddings = encoder.embed_batch([sequence]) |
|
for emb in embeddings: |
|
prot_embedding = encoder.reduce_per_protein(emb) |
|
break |
|
st.success('Encoding complete.') |
|
else: |
|
prot_embedding = None |
|
st.image('figures/protein_encoder.png') |
|
st.warning('Choose encoder above...') |
|
|
|
if prot_embedding is not None: |
|
st.markdown('### Inference') |
|
|
|
import time |
|
progress_text = "HyperPCM predicts the QSAR model for the query protein target. Please wait." |
|
my_bar = st.progress(0, text=progress_text) |
|
for i in range(100): |
|
time.sleep(0.1) |
|
my_bar.progress(i + 1, text=progress_text) |
|
my_bar.progress(100, text="HyperPCM predicts the QSAR model for the query protein target. Done.") |
|
|
|
st.markdown('### Retrieval') |
|
|
|
col1, col2 = st.columns(2) |
|
with col1: |
|
selected_dataset = st.selectbox( |
|
'Select dataset from which the drug compounds should be retrieved',('Lenselink', 'Davis') |
|
) |
|
with col2: |
|
selected_k = st.selectbox( |
|
'Select the top-k number of drug compounds to retrieve',(5, 10, 15, 20) |
|
) |
|
|
|
st.write(f'The top-{selected_k} most active drug compounds from {selected_dataset} predicted by HyperPCM are: ') |
|
dummy_smiles = ['CC(=O)OC1=CC=CC=C1C(=O)O', 'COc1cc(C=O)ccc1O', 'CC(=O)Nc1ccc(O)cc1', 'CC(=O)Nc1ccc(OS(=O)(=O)O)cc1', 'CC(=O)Nc1ccc(O[C@@H]2O[C@H](C(=O)O)[C@@H](O)[C@H](O)[C@H]2O)cc1'] |
|
cols = st.columns(5) |
|
for j, col in enumerate(cols): |
|
with col: |
|
for i in range(int(selected_k/5)): |
|
mol = Chem.MolFromSmiles(dummy_smiles[j]) |
|
mol_img = Chem.Draw.MolToImage(mol) |
|
st.image(mol_img) |
|
''' |
|
|
|
''' |
|
def display_protein(): |
|
st.markdown('## Display protein structure') |
|
st.write('In the future this page will display the ESM predicted sequence of a protein target.') |
|
|
|
st.markdown('### Target') |
|
sequence = st.text_input('Enter the amino-acid sequence of the query protein target', value='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA', placeholder='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA') |
|
|
|
if sequence: |
|
|
|
st.image('figures/ex_protein.jpeg') |
|
|
|
model = esm.pretrained.esmfold_v1() |
|
model = model.eval().cuda() |
|
|
|
with torch.no_grad(): |
|
output = model.infer_pdb(sequence) |
|
st.write(output) |
|
|
|
with open("result.pdb", "w") as f: |
|
f.write(output) |
|
|
|
|
|
struct = bsio.load_structure("result.pdb", extra_fields=["b_factor"]) |
|
print(struct.b_factor.mean()) |
|
|
|
|
|
""" |
|
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D() |
|
batch_converter = alphabet.get_batch_converter() |
|
batch_labels, batch_strs, batch_tokens = batch_converter([("protein1", sequence),]) |
|
|
|
# Extract per-residue representations (on CPU) |
|
with torch.no_grad(): |
|
results = model(batch_tokens, repr_layers=[12], return_contacts=True) |
|
token_representations = results["representations"][12] |
|
|
|
token_list = token_representations.tolist()[0][0][0] |
|
|
|
client = Client(url=st.secrets["DB_URL"], user=st.secrets["USER"], password=st.secrets["PASSWD"]) |
|
|
|
result = client.fetch("SELECT seq, distance('topK=500')(representations, " + str(token_list) + ')'+ "as dist FROM default.esm_protein_indexer_768") |
|
|
|
result_temp_seq = [] |
|
|
|
for i in result: |
|
# result_temp_coords = i['seq'] |
|
result_temp_seq.append(i['seq']) |
|
|
|
result_temp_seq = list(set(result_temp_seq)) |
|
|
|
if st.button(result_temp_seq[0]): |
|
print(result_temp_seq[0]) |
|
elif st.button(result_temp_seq[1]): |
|
print(result_temp_seq[1]) |
|
elif st.button(result_temp_seq[2]): |
|
print(result_temp_seq[2]) |
|
elif st.button(result_temp_seq[3]): |
|
print(result_temp_seq[3]) |
|
elif st.button(result_temp_seq[4]): |
|
print(result_temp_seq[4]) |
|
|
|
start[2] = st.pyplot(visualize_3D_Coordinates(result_temp_coords).figure) |
|
|
|
headers = { |
|
'Content-Type': 'application/x-www-form-urlencoded', |
|
} |
|
response = requests.post('https://api.esmatlas.com/foldSequence/v1/pdb/', headers=headers, data=sequence) |
|
name = sequence[:3] + sequence[-3:] |
|
pdb_string = response.content.decode('utf-8') |
|
with open('predicted.pdb', 'w') as f: |
|
f.write(pdb_string) |
|
struct = bsio.load_structure('predicted.pdb', extra_fields=["b_factor"]) |
|
b_value = round(struct.b_factor.mean(), 4) |
|
render_mol(pdb_string) |
|
if residues_marker: |
|
start[3] = showmol(render_pdb_resn(viewer = render_pdb(id = id_PDB),resn_lst = [residues_marker])) |
|
else: |
|
start[3] = showmol(render_pdb(id = id_PDB)) |
|
st.session_state['xq'] = st.session_state.model |
|
|
|
# example proteins ["HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA"], ["AHKLFIGGLPNYLNDDQVKELLTSFGPLKAFNLVKDSATGLSKGYAFCEYVDINVTDQAIAGLNGMQLGDKKLLVQRASVGAKNA"] |
|
""" |
|
|
|
def display_context(): |
|
st.markdown('## Display context') |
|
st.write('In the future this page will visualize the context module for a given protein, i.e., show important features and highly ranked / related proteins from the context.') |
|
''' |
|
|
|
def references():
    """Render the "References" page: a heading followed by the bibliography as Markdown."""
    # One entry per paragraph; entries are separated by blank lines so that
    # Markdown renders each citation as its own paragraph.
    bibliography_md = '''
        ## References

        Schmidhuber, J., “Learning to control fast-weight memories: An alternative to dynamic recurrent networks.” Neural Computation, 1992.

        Davis, M. I., et al. "Comprehensive analysis of kinase inhibitor selectivity." Nature Biotechnology 29.11 (2011): 1046-1051.

        Ha, D., et al. “HyperNetworks”. ICLR, 2017.

        Lenselink, E. B., et al. "Beyond the hype: deep neural networks outperform established methods using a ChEMBL bioactivity benchmark set." Journal of Cheminformatics 9.1 (2017): 1-14.

        Alley, E. C., et al. "Unified rational protein engineering with sequence-based deep representation learning." Nature Methods 16.12 (2019): 1315-1322.

        Chang, O., et al., “Principled weight initialization for hypernetworks.” ICLR, 2019.

        Heinzinger, M., et al. "Modeling aspects of the language of life through transfer-learning protein sequences." BMC Bioinformatics 20.1 (2019): 1-17.

        Winter, R., et al. "Learning continuous and data-driven molecular descriptors by translating equivalent chemical representations." Chemical Science 10.6 (2019): 1692-1701.

        Fabian, B., et al. "Molecular representation learning with language models and domain-relevant auxiliary tasks." Workshop for ML4Molecules (2020).

        Elnaggar, A., et al. "ProtTrans: Toward understanding the language of life through self-supervised learning." IEEE Transactions on Pattern Analysis and Machine Intelligence 44 (2021): 7112–7127.

        Rives, A., et al. "Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences." Proceedings of the National Academy of Sciences 118.15 (2021): e2016239118.

        Kim, P. T., et al. "Unsupervised Representation Learning for Proteochemometric Modeling." International Journal of Molecular Sciences 22.23 (2021): 12882.

        Schimunek, J., et al., “Context-enriched molecule representations improve few-shot drug discovery.” ICLR, 2023.

        '''
    st.markdown(bibliography_md)
|
|
|
# Sidebar navigation: each label offered in the selector maps to the function
# that renders the corresponding page.
page_names_to_func = {
    'About': about_page,
    'Retrieve Top-k': retrieval,
}

# Let the user pick a page in the sidebar; the empty markdown call adds a
# spacer beneath the selector.
selected_page = st.sidebar.selectbox('Choose function', page_names_to_func.keys())
st.sidebar.markdown('')

# Look up the renderer for the chosen label, then draw that page.
_render_page = page_names_to_func[selected_page]
_render_page()
|
|
|
|