# granite.materials.smi-TED - Encoder & Decoder

In [1]:
import sys
sys.path.append('../inference')

In [2]:
# materials.smi-ted (smi-ted)
from smi_ted_light.load import load_smi_ted

# Data
import pandas as pd
import numpy as np
import torch

# Chemistry
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit.DataStructs import FingerprintSimilarity
from rdkit.DataStructs import TanimotoSimilarity

In [3]:
# function to canonicalize SMILES
def normalize_smiles(smi, canonical=True, isomeric=False):
    """Return a normalized SMILES string for ``smi``, or ``None`` on failure.

    Args:
        smi: SMILES string to normalize.
        canonical: Forwarded to ``Chem.MolToSmiles`` as ``canonical``.
        isomeric: Forwarded to ``Chem.MolToSmiles`` as ``isomericSmiles``.

    Returns:
        The SMILES string produced by ``Chem.MolToSmiles``, or ``None`` when
        ``smi`` cannot be parsed or converted.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles reports an unparsable string by returning None
        # instead of raising, so handle that case explicitly.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    except Exception:
        # Stay best-effort for bad inputs (e.g. non-string values such as NaN),
        # but, unlike a bare ``except:``, let KeyboardInterrupt and SystemExit
        # propagate so a long-running ``apply`` can still be interrupted.
        return None

# function to calculate pairwise Tanimoto similarity
def calculate_tanimoto_similarities(fps1, fps2):
    """Compute the Tanimoto similarity of fingerprints at matching positions.

    ``fps1[i]`` is compared with ``fps2[i]`` for every index of ``fps1``, so
    ``fps2`` must be at least as long as ``fps1``; extra entries in ``fps2``
    are never read.

    Returns:
        A list with one similarity value per element of ``fps1``.
    """
    return [
        TanimotoSimilarity(fps1[idx], fps2[idx])
        for idx in range(len(fps1))
    ]

### Load smi-ted

In [4]:
# Load the smi-ted Light model from the checkpoint file in the local inference folder.
model_smi_ted = load_smi_ted(
    folder='../inference/smi_ted_light',
    ckpt_filename='smi-ted-Light_40.pt'
)

Random Seed: 12345
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Vocab size: 2393
[INFERENCE MODE - smi-ted-Light]


## Load Dataset

In [5]:
# Read only the first 1000 rows of ./data/moses_test.csv (presumably to keep the demo quick).
df_moses = pd.read_csv("./data/moses_test.csv", nrows=1000)

In [6]:
# Normalize every SMILES string in place; normalize_smiles returns None when it fails.
df_moses['SMILES'] = df_moses['SMILES'].apply(normalize_smiles)
# dropna() returns a new frame without the failed rows; df_moses itself still
# contains any None entries.
df_test_normalized = df_moses.dropna()
# Show how many rows remain, then preview the first few.
print(df_test_normalized.shape)
df_test_normalized.head()

(1000, 1)


Unnamed: 0,SMILES
0,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
1,COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O
2,CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2
3,Clc1ccccc1-c1nc(-c2ccncc2)no1
4,CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1


### Encode SMILES - smi-ted

In [7]:
# Encode only the rows that survived normalization: df_moses['SMILES'] may still
# hold None entries, whereas df_test_normalized had them removed by dropna().
# no_grad: inference only, so gradient tracking is not needed.
with torch.no_grad():
    encode_embeddings = model_smi_ted.encode(df_test_normalized['SMILES'], return_torch=True)

100%|██████████| 10/10 [00:07<00:00,  1.42it/s]


### Decode smi-ted embeddings into SMILES

In [8]:
# Decode the embeddings back into SMILES strings; no_grad because this is inference only.
with torch.no_grad():
    decoded_smiles = model_smi_ted.decode(encode_embeddings)

In [9]:
# Preview the first five decoded SMILES strings.
decoded_smiles[0:5]

['CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1',
 'COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O',
 'CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2',
 'Clc1ccccc1-c1nc(-c2ccncc2)no1',
 'CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1']

### Compare similarities

In [10]:
# Reference SMILES: only the rows that survived normalization (dropna removed failures)
reference_smiles = df_test_normalized['SMILES'].to_list()

# Pairs are matched by position, so both lists must describe the same molecules
if len(reference_smiles) != len(decoded_smiles):
    raise ValueError(
        f"Cannot pair {len(reference_smiles)} reference SMILES "
        f"with {len(decoded_smiles)} decoded SMILES"
    )

# Convert SMILES to RDKit molecule objects (MolFromSmiles returns None when parsing fails)
mol_pairs = [
    (Chem.MolFromSmiles(reference), Chem.MolFromSmiles(decoded))
    for reference, decoded in zip(reference_smiles, decoded_smiles)
]

# Keep only pairs where both molecules parsed; a None molecule cannot be fingerprinted.
# Skipped pairs are reported so the mean below is never silently computed on a subset.
valid_pairs = [
    (mol_ref, mol_dec)
    for mol_ref, mol_dec in mol_pairs
    if mol_ref is not None and mol_dec is not None
]
n_skipped = len(mol_pairs) - len(valid_pairs)
if n_skipped:
    print(f"Skipped {n_skipped} pair(s) with an unparsable SMILES")
mols1 = [mol_ref for mol_ref, mol_dec in valid_pairs]
mols2 = [mol_dec for mol_ref, mol_dec in valid_pairs]

# Compute fingerprints for each molecule (Morgan fingerprint, radius 2)
fps1 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols1]
fps2 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols2]

# Calculate Tanimoto similarities
tanimoto_similarities = calculate_tanimoto_similarities(fps1, fps2)

# Calculate the mean similarity (NaN when no valid pair remains)
mean_similarity = np.mean(tanimoto_similarities) if tanimoto_similarities else float('nan')

# Print the mean similarity
print(f"Mean Tanimoto Similarity: {mean_similarity:.2f}")

Mean Tanimoto Similarity: 1.00
