# granite.materials.smi-TED - INFERENCE (Classification)

In [None]:
# Install extra packages for notebook
# (the %pip magic targets the environment of the running kernel).
# NOTE(review): versions are unpinned, and seaborn is not imported in any cell
# shown here — confirm it is still needed and consider pinning both packages.
%pip install seaborn xgboost

In [1]:
import sys

# Make the sibling ``inference`` directory importable so that the
# ``smi_ted_light`` package can be imported by the cells below.
# The path is relative, so the notebook must be run from its own directory.
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
inference_dir = '../inference'
if inference_dir not in sys.path:
    sys.path.append(inference_dir)

In [2]:
# materials.smi-ted
from smi_ted_light.load import load_smi_ted

# Data
import torch
import pandas as pd

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
# Switch on PandasTools image rendering for all DataFrames (a global display setting).
# NOTE(review): presumably this lets molecule columns render as images in the
# tables displayed below — confirm the exact effect against the RDKit docs.
PandasTools.RenderImagesInAllDataFrames(True)

In [3]:
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return the RDKit-normalized form of a SMILES string, or None if invalid.

    Args:
        smi: SMILES string to normalize.
        canonical: forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.
            With the default ``False`` the output carries no stereo markers.

    Returns:
        The normalized SMILES string, or ``None`` when ``smi`` cannot be
        parsed or converted (e.g. an invalid valence or a non-string value).
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals an unparseable SMILES by returning None
        # rather than raising, so handle that case explicitly.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Best-effort: any RDKit/type error means "no normalized form".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long .apply() can still be interrupted.
        return None

### Import smi-ted

In [4]:
# Load the smi-ted Light checkpoint from the local inference folder.
model_smi_ted = load_smi_ted(folder='../inference/smi_ted_light', ckpt_filename='smi-ted-Light_40.pt')

Random Seed: 12345
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Vocab size: 2393
[INFERENCE MODE - smi-ted-Light]


## BBBP Dataset

### Experiments - Data Load

In [5]:
# Read the BBBP train and test splits shipped with the finetune data.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../finetune/moleculenet/bbbp/train.csv",
        "../finetune/moleculenet/bbbp/test.csv",
    )
)

### SMILES canonicalization

In [6]:
# Canonicalize every training SMILES; unparseable ones come back as None.
df_train['norm_smiles'] = df_train['smiles'].apply(normalize_smiles)
# Drop only the rows that cannot be used downstream: a failed canonicalization
# or a missing label. A bare dropna() would also discard molecules that merely
# have a missing value in an unrelated column (e.g. the compound name).
df_train_normalized = df_train.dropna(subset=['norm_smiles', 'p_np'])
print(df_train_normalized.shape)
df_train_normalized.head()

(1634, 5)


[22:56:14] Explicit valence for atom # 1 N, 4, is greater than permitted
[22:56:14] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:56:14] Explicit valence for atom # 6 N, 4, is greater than permitted
[22:56:14] Explicit valence for atom # 11 N, 4, is greater than permitted
[22:56:14] Explicit valence for atom # 5 N, 4, is greater than permitted


Unnamed: 0,num,name,p_np,smiles,norm_smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12,CC(C)NCC(O)COc1cccc2ccccc12.[Cl]
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl,CC(C)(C)OC(=O)CCCc1ccc(N(CCCl)CCCl)cc1
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C,CC(=O)NCCCOc1cccc(CN2CCCCC2)c1
4,6,cefoperazone,1,CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(...,CCN1CCN(C(=O)NC(C(=O)NC2C(=O)N3C(C(=O)O)=C(CSc...


In [7]:
# Canonicalize every test SMILES; unparseable ones come back as None.
df_test['norm_smiles'] = df_test['smiles'].apply(normalize_smiles)
# Drop only the rows that cannot be used downstream: a failed canonicalization
# or a missing label. A bare dropna() would also discard molecules that merely
# have a missing value in an unrelated column (e.g. the compound name).
df_test_normalized = df_test.dropna(subset=['norm_smiles', 'p_np'])
print(df_test_normalized.shape)
df_test_normalized.head()

(192, 5)


[22:56:17] Explicit valence for atom # 12 N, 4, is greater than permitted
[22:56:17] Explicit valence for atom # 5 N, 4, is greater than permitted


Unnamed: 0,num,name,p_np,smiles,norm_smiles
0,13,18,1,C(Cl)Cl,ClCCl
1,23,SKF-93619,0,c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)...,CN(C)Cc1ccc(CSCCNc2nc(=O)c(Cc3ccc4ccccc4c3)c[n...
2,36,etomidate,1,CCOC(=O)c1cncn1C(C)c2ccccc2,CCOC(=O)c1cncn1C(C)c1ccccc1
3,37,11a,0,CN(C)c1cc(C2=NC(N)=NN2)ccn1,CN(C)c1cc(-c2nc(N)n[nH]2)ccn1
4,79,compound 45,1,N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1,c1cc(CN2CCCCC2)cc(OCCCNc2nc3ccccc3o2)c1


### Embeddings extraction 

#### smi-ted embeddings extraction

In [8]:
# Encode the canonical training SMILES with smi-ted; gradients are not
# needed for inference, so run under no_grad.
train_smiles = df_train_normalized['norm_smiles']
with torch.no_grad():
    df_embeddings_train = model_smi_ted.encode(train_smiles)
df_embeddings_train.head()

100%|██████████| 16/16 [00:21<00:00,  1.35s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.437218,-0.591727,0.064328,0.374019,0.530676,-0.644067,1.308136,0.089772,0.790524,0.208749,...,-1.325162,-0.083578,0.169544,0.359247,-0.652742,0.720496,-0.674184,0.693,0.586143,-0.159641
1,0.344508,-0.417009,0.095745,0.355959,0.573049,-0.590279,1.069699,0.067724,0.788815,0.159197,...,-1.312421,-0.108732,0.21702,0.303697,-0.598966,0.647903,-0.665967,0.791804,0.620691,-0.107859
2,0.429205,-0.463542,0.056441,0.449925,0.536788,-0.749906,1.193816,0.082596,0.860276,0.162548,...,-1.304979,-0.14862,0.242045,0.34473,-0.704636,0.644773,-0.781017,0.737207,0.58538,-0.101722
3,0.433097,-0.523078,0.089728,0.410127,0.5434,-0.643014,1.203858,0.034177,0.769413,0.202445,...,-1.358915,-0.077463,0.22871,0.317884,-0.68022,0.531601,-0.709799,0.731386,0.567806,-0.087713
4,0.388423,-0.505908,0.072539,0.366502,0.533689,-0.701559,1.035554,0.038419,0.822917,0.163062,...,-1.271012,-0.176412,0.119734,0.294143,-0.677721,0.647655,-0.844419,0.756321,0.570513,-0.240003


In [9]:
# Encode the canonical test SMILES with smi-ted; gradients are not
# needed for inference, so run under no_grad.
test_smiles = df_test_normalized['norm_smiles']
with torch.no_grad():
    df_embeddings_test = model_smi_ted.encode(test_smiles)
df_embeddings_test.head()

100%|██████████| 1/1 [00:04<00:00,  4.23s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.374249,-0.319257,-0.007041,0.444741,0.326734,-0.791476,1.121707,-0.082401,0.611457,0.289225,...,-1.462539,-0.302055,0.295551,-0.058293,-0.830319,0.545099,-0.460271,1.121117,0.685016,-0.452698
1,0.429158,-0.568104,0.112739,0.352429,0.512565,-0.604153,1.181846,0.067963,0.786978,0.128077,...,-1.226941,-0.078927,0.209468,0.266113,-0.762261,0.610685,-0.755705,0.73455,0.592976,-0.148252
2,0.411906,-0.510477,0.073015,0.346871,0.512772,-0.617252,1.191621,0.040103,0.722577,0.188638,...,-1.300554,-0.150735,0.148252,0.282791,-0.694712,0.556029,-0.660645,0.771226,0.558996,-0.00066
3,0.356793,-0.530959,0.05035,0.433593,0.592601,-0.573508,1.221865,0.025491,0.833164,0.214604,...,-1.406141,-0.107165,0.200131,0.289469,-0.770149,0.572746,-0.776739,0.855064,0.662797,-0.194417
4,0.422133,-0.49061,0.044333,0.367861,0.579025,-0.629409,1.139824,0.039823,0.728825,0.145327,...,-1.312777,-0.105049,0.175286,0.336176,-0.738813,0.530226,-0.763357,0.764998,0.583681,-0.109683


### Experiments - BBBP prediction using smi-ted latent spaces

#### XGBoost prediction using the whole Latent Space

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [11]:
# Hyperparameters of the downstream classifier trained on the embeddings.
xgb_params = dict(n_estimators=2000, learning_rate=0.04, max_depth=8)
xgb_predict = XGBClassifier(**xgb_params)
# Features are the smi-ted embeddings; labels are the p_np column of the
# cleaned training frame.
xgb_predict.fit(df_embeddings_train, df_train_normalized['p_np'])

In [13]:
# Score the test embeddings and keep column 1 of the predicted probabilities.
class_probabilities = xgb_predict.predict_proba(df_embeddings_test)
y_prob = class_probabilities[:, 1]

In [14]:
# Compare the predicted probabilities against the p_np labels of the test split.
y_true = df_test_normalized["p_np"]
roc_auc = roc_auc_score(y_true, y_prob)
print(f"ROC-AUC Score: {roc_auc:.4f}")

ROC-AUC Score: 0.9194
