import os
import sys
import json
import pandas as pd
import gradio as gr

# 1) Adjust sys.path before importing the loader
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INFERENCE_PATH = os.path.join(BASE_DIR, "smi-ted", "inference")
sys.path.insert(0, INFERENCE_PATH)

from smi_ted_light.load import load_smi_ted

# 2) Load the SMI-TED Light model
MODEL_DIR = os.path.join(INFERENCE_PATH, "smi_ted_light")
model = load_smi_ted(
    folder=MODEL_DIR,
    ckpt_filename="smi-ted-Light_40.pt",
    vocab_filename="bert_vocab_curated.txt",
)

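# 3) Single- and batch-mode processing.
#    Note: model.encode(smiles, return_torch=True) is assumed (based on its use
#    below) to return a 2-D tensor whose first row is the embedding of the input.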
def process_inputs(smiles: str, file_obj):
    # Batch mode
    if file_obj is not None:
        try:
            # auto-detect the delimiter (";", ",", etc.)
            df_in = pd.read_csv(file_obj.name, sep=None, engine='python')
            
            # look for a "smiles" column (case-insensitive)
            smiles_cols = [c for c in df_in.columns if c.lower() == "smiles"]
            if not smiles_cols:
                return (
                    "Error: The CSV must have a column named 'Smiles' with the respective SMILES.",
                    gr.update(visible=False),
                )
            smiles_col = smiles_cols[0]
            smiles_list = df_in[smiles_col].astype(str).tolist()

            # enforce the 1000-SMILES batch limit
            if len(smiles_list) > 1000:
                return (
                    f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
                    gr.update(visible=False),
                )

            out_records = []
            invalid_smiles = []
            embed_dim = None

            # try to generate an embedding for each SMILES
            for sm in smiles_list:
                try:
                    vec = model.encode(sm, return_torch=True)[0].tolist()
                    if embed_dim is None:
                        embed_dim = len(vec)
                    record = {"smiles": sm}
                    record.update({f"dim_{i}": v for i, v in enumerate(vec)})
                except Exception:
                    invalid_smiles.append(sm)
                    if embed_dim is not None:
                        record = {"smiles": f"SMILES {sm} was invalid"}
                        record.update({f"dim_{i}": None for i in range(embed_dim)})
                    else:
                        record = {"smiles": f"SMILES {sm} was invalid"}
                out_records.append(record)

            out_df = pd.DataFrame(out_records)
            out_df.to_csv("embeddings.csv", index=False)

            total = len(smiles_list)
            valid = total - len(invalid_smiles)
            invalid_count = len(invalid_smiles)
            if invalid_smiles:
                msg = (
                    f"{valid} SMILES processed successfully. "
                    f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} could not be parsed by RDKit:\n"
                    + "\n".join(f"- {s}" for s in invalid_smiles)
                )
            else:
                msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."

            return msg, gr.update(value="embeddings.csv", visible=True)

        except Exception as e:
            return f"Error processing batch: {e}", gr.update(visible=False)

    # Single mode
    smiles = smiles.strip()
    if not smiles:
        return "Please enter a SMILES or upload a CSV file.", gr.update(visible=False)
    try:
        vec = model.encode(smiles, return_torch=True)[0].tolist()
        cols = ["smiles"] + [f"dim_{i}" for i in range(len(vec))]
        df_out = pd.DataFrame([[smiles] + vec], columns=cols)
        df_out.to_csv("embeddings.csv", index=False)
        return json.dumps(vec), gr.update(value="embeddings.csv", visible=True)
    except Exception:
        return f"The following input '{smiles}' is not a valid SMILES", gr.update(visible=False)


# 4) Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # SMI-TED-Embeddings-Extraction

        **Single mode:** paste a SMILES string in the left box.  
        **Batch mode:** upload a CSV file with a column named `smiles` (case-insensitive), one SMILES per row.  
        - **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.  
        _This is just a demo environment; for heavy-duty usage, please visit:_  
        https://github.com/IBM/materials/tree/main/models/smi_ted  
        to download the model and run your own experiments.

        - In both cases, an `embeddings.csv` file is generated for download, with the SMILES in the first column and the embedding values in the following columns.
        """
    )
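    # Example batch input (a sketch of what process_inputs expects): the uploaded
    # CSV needs a header column named "smiles" (any casing), one SMILES per row:
    #
    #   smiles
    #   CCO
    #   c1ccccc1
    #
    # The generated embeddings.csv then has columns smiles, dim_0, dim_1, ...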

    with gr.Row():
        smiles_in = gr.Textbox(label="SMILES (single mode)", placeholder="e.g. CCO")
        file_in   = gr.File(label="SMILES CSV (batch mode)", file_types=[".csv"])

    generate_btn = gr.Button("Extract Embeddings")

    with gr.Row():
        output_msg   = gr.Textbox(label="Message / Embedding (JSON)", interactive=False, lines=4)
        download_csv = gr.File(label="Download embeddings.csv", visible=False)

    generate_btn.click(
        fn=process_inputs,
        inputs=[smiles_in, file_in],
        outputs=[output_msg, download_csv]
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
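
# Local sanity check (a sketch, not run by the app; assumes the checkpoint above
# is available and model.encode behaves as used in process_inputs):
#
#   vec = model.encode("CCO", return_torch=True)[0].tolist()
#   print(len(vec))                        # embedding dimensionality
#   msg, _ = process_inputs("CCO", None)   # same path the UI takes in single mode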