update illustrations of respective encoders
Browse files
app.py
CHANGED
|
@@ -56,85 +56,98 @@ def predict_dti():
|
|
| 56 |
|
| 57 |
with col1:
|
| 58 |
st.markdown('### Drug')
|
| 59 |
-
smiles = st.text_input('Enter the SMILES of the query drug compound', value='CC(=O)OC1=CC=CC=C1C(=O)O', placeholder='CC(=O)OC1=CC=CC=C1C(=O)O')
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
st.
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
selected_encoder = st.selectbox(
|
| 67 |
'Select encoder for drug compound',('None', 'CDDD', 'MolBERT')
|
| 68 |
)
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
|
| 95 |
with col2:
|
| 96 |
st.markdown('### Target')
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
selected_encoder = st.selectbox(
|
| 103 |
'Select encoder for protein target',('None', 'SeqVec', 'UniRep', 'ESM-1b', 'ProtT5')
|
| 104 |
)
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
| 138 |
|
| 139 |
st.write('TODO run inference with HyperPCM on the given drug compound and protein target.')
|
| 140 |
|
|
@@ -148,7 +161,7 @@ def retrieval():
|
|
| 148 |
sequence = st.text_input('Enter the amino-acid sequence of the query protein target', value='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA', placeholder='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA')
|
| 149 |
|
| 150 |
if sequence:
|
| 151 |
-
col1, col2
|
| 152 |
with col1:
|
| 153 |
st.markdown('\n\n\n\n Plot of protein to be added soon. \n\n\n\n')
|
| 154 |
|
|
@@ -162,8 +175,7 @@ def retrieval():
|
|
| 162 |
for emb in embeddings:
|
| 163 |
embedding = encoder.reduce_per_protein(emb)
|
| 164 |
break
|
| 165 |
-
|
| 166 |
-
st.write(f'SeqVec embedding')
|
| 167 |
st.write(embedding)
|
| 168 |
st.write(np.transpose(embedding))
|
| 169 |
|
|
|
|
| 56 |
|
| 57 |
with col1:
|
| 58 |
st.markdown('### Drug')
|
|
|
|
| 59 |
|
| 60 |
+
mol_col1, mol_col2 = st.columns(2)
|
| 61 |
+
|
| 62 |
+
with mol_col1:
|
| 63 |
+
smiles = st.text_input('Enter the SMILES of the query drug compound', value='CC(=O)OC1=CC=CC=C1C(=O)O', placeholder='CC(=O)OC1=CC=CC=C1C(=O)O')
|
| 64 |
+
if smiles:
|
| 65 |
+
mol = Chem.MolFromSmiles(smiles)
|
| 66 |
+
mol_img = Chem.Draw.MolToImage(mol)
|
| 67 |
+
st.image(mol_img) #, width = 140)
|
| 68 |
+
|
| 69 |
+
with mol_col2:
|
| 70 |
selected_encoder = st.selectbox(
|
| 71 |
'Select encoder for drug compound',('None', 'CDDD', 'MolBERT')
|
| 72 |
)
|
| 73 |
+
st.image('molecule_encoder.png')
|
| 74 |
+
if smiles:
|
| 75 |
+
if selected_encoder == 'CDDD':
|
| 76 |
+
from cddd.inference import InferenceModel
|
| 77 |
+
CDDD_MODEL_DIR = 'src/encoders/cddd'
|
| 78 |
+
cddd_model = InferenceModel(CDDD_MODEL_DIR)
|
| 79 |
+
embedding = cddd_model.seq_to_emb([smiles])
|
| 80 |
+
#from huggingface_hub import hf_hub_download
|
| 81 |
+
#precomputed_embs = f'{selected_encoder}_encoding.csv'
|
| 82 |
+
#REPO_ID = "emmas96/Lenselink"
|
| 83 |
+
#embs_path = hf_hub_download(REPO_ID, precomputed_embs)
|
| 84 |
+
#embs = pd.read_csv(embs_path)
|
| 85 |
+
#embedding = embs[smiles]
|
| 86 |
+
elif selected_encoder == 'MolBERT':
|
| 87 |
+
from molbert.utils.featurizer.molbert_featurizer import MolBertFeaturizer
|
| 88 |
+
from huggingface_hub import hf_hub_download
|
| 89 |
+
MOLBERT_MODEL_DIR = 'encoders/molbert/last.ckpt'
|
| 90 |
+
REPO_ID = "emmas96/hyperpcm"
|
| 91 |
+
checkpoint_path = hf_hub_download(REPO_ID, MOLBERT_MODEL_DIR)
|
| 92 |
+
molbert_model = MolBertFeaturizer(checkpoint_path, max_seq_len=500, embedding_type='average-1-cat-pooled')
|
| 93 |
+
embedding = molbert_model.transform([smiles])
|
| 94 |
+
else:
|
| 95 |
+
st.write('No pre-trained version of HyperPCM is available for the chosen encoder.')
|
| 96 |
+
embedding = None
|
| 97 |
+
if embedding is not None:
|
| 98 |
+
st.write(f'{selected_encoder} embedding')
|
| 99 |
+
st.write(embedding)
|
| 100 |
|
| 101 |
with col2:
|
| 102 |
st.markdown('### Target')
|
| 103 |
+
|
| 104 |
+
prot_col1, prot_col2 = st.columns(2)
|
| 105 |
+
|
| 106 |
+
with prot_col1:
|
| 107 |
+
sequence = st.text_input('Enter the amino-acid sequence of the query protein target', value='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA', placeholder='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA')
|
| 108 |
|
| 109 |
+
if sequence:
|
| 110 |
+
st.markdown('\n\n\n\n Plot of protein to be added soon. \n\n\n\n')
|
| 111 |
+
|
| 112 |
+
with prot_col2:
|
| 113 |
selected_encoder = st.selectbox(
|
| 114 |
'Select encoder for protein target',('None', 'SeqVec', 'UniRep', 'ESM-1b', 'ProtT5')
|
| 115 |
)
|
| 116 |
+
st.image('protein_encoder.png')
|
| 117 |
+
if sequence:
|
| 118 |
+
if selected_encoder == 'SeqVec':
|
| 119 |
+
from bio_embeddings.embed import SeqVecEmbedder
|
| 120 |
+
encoder = SeqVecEmbedder()
|
| 121 |
+
embeddings = encoder.embed_batch([sequence])
|
| 122 |
+
for emb in embeddings:
|
| 123 |
+
embedding = encoder.reduce_per_protein(emb)
|
| 124 |
+
break
|
| 125 |
+
elif selected_encoder == 'UniRep':
|
| 126 |
+
from jax_unirep.utils import load_params
|
| 127 |
+
params = load_params()
|
| 128 |
+
from jax_unirep.featurize import get_reps
|
| 129 |
+
embedding, h_final, c_final = get_reps([sequence])
|
| 130 |
+
embedding = embedding.mean(axis=0)
|
| 131 |
+
elif selected_encoder == 'ESM-1b':
|
| 132 |
+
from bio_embeddings.embed import ESM1bEmbedder
|
| 133 |
+
encoder = ESM1bEmbedder()
|
| 134 |
+
embeddings = encoder.embed_batch([sequence])
|
| 135 |
+
for emb in embeddings:
|
| 136 |
+
embedding = encoder.reduce_per_protein(emb)
|
| 137 |
+
break
|
| 138 |
+
elif selected_encoder == 'ProtT5':
|
| 139 |
+
from bio_embeddings.embed import ProtTransT5XLU50Embedder
|
| 140 |
+
encoder = ProtTransT5XLU50Embedder()
|
| 141 |
+
embeddings = encoder.embed_batch([sequence])
|
| 142 |
+
for emb in embeddings:
|
| 143 |
+
embedding = encoder.reduce_per_protein(emb)
|
| 144 |
+
break
|
| 145 |
+
else:
|
| 146 |
+
st.write('No pre-trained version of HyperPCM is available for the chosen encoder.')
|
| 147 |
+
embedding = None
|
| 148 |
+
if embedding is not None:
|
| 149 |
+
st.write(f'{selected_encoder} embedding')
|
| 150 |
+
st.write(embedding)
|
| 151 |
|
| 152 |
st.write('TODO run inference with HyperPCM on the given drug compound and protein target.')
|
| 153 |
|
|
|
|
| 161 |
sequence = st.text_input('Enter the amino-acid sequence of the query protein target', value='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA', placeholder='HXHVWPVQDAKARFSEFLDACITEGPQIVSRRGAEEAVLVPIGEWRRLQAAA')
|
| 162 |
|
| 163 |
if sequence:
|
| 164 |
+
col1, col2 = st.columns(2)
|
| 165 |
with col1:
|
| 166 |
st.markdown('\n\n\n\n Plot of protein to be added soon. \n\n\n\n')
|
| 167 |
|
|
|
|
| 175 |
for emb in embeddings:
|
| 176 |
embedding = encoder.reduce_per_protein(emb)
|
| 177 |
break
|
| 178 |
+
st.write('SeqVec embedding')
|
|
|
|
| 179 |
st.write(embedding)
|
| 180 |
st.write(np.transpose(embedding))
|
| 181 |
|