import os
import sys
import json

import pandas as pd
import gradio as gr

# 1) Adjust the path before importing the loader
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INFERENCE_PATH = os.path.join(BASE_DIR, "smi-ted", "inference")
sys.path.insert(0, INFERENCE_PATH)

from smi_ted_light.load import load_smi_ted

# 2) Load the SMI-TED Light model
MODEL_DIR = os.path.join(INFERENCE_PATH, "smi_ted_light")
model = load_smi_ted(
    folder=MODEL_DIR,
    ckpt_filename="smi-ted-Light_40.pt",
    vocab_filename="bert_vocab_curated.txt",
)


# 3) Single/batch inference
def process_inputs(smiles: str, file_obj):
    # Batch mode: a CSV file was uploaded
    if file_obj is not None:
        try:
            # Auto-detect the delimiter (';', ',', etc.)
            df_in = pd.read_csv(file_obj.name, sep=None, engine="python")

            # Look for a "smiles" column (case-insensitive)
            smiles_cols = [c for c in df_in.columns if c.lower() == "smiles"]
            if not smiles_cols:
                return (
                    "Error: The CSV must have a column named 'smiles' (case-insensitive) containing the SMILES strings.",
                    gr.update(visible=False),
                )
            smiles_col = smiles_cols[0]
            smiles_list = df_in[smiles_col].astype(str).tolist()

            # Limit of 1000 SMILES per batch
            if len(smiles_list) > 1000:
                return (
                    f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
                    gr.update(visible=False),
                )

            out_records = []
            invalid_smiles = []
            embed_dim = None

            # Try to generate an embedding for each SMILES
            for sm in smiles_list:
                try:
                    vec = model.encode(sm, return_torch=True)[0].tolist()
                    if embed_dim is None:
                        embed_dim = len(vec)
                    record = {"smiles": sm}
                    record.update({f"dim_{i}": v for i, v in enumerate(vec)})
                except Exception:
                    invalid_smiles.append(sm)
                    record = {"smiles": f"SMILES {sm} was invalid"}
                    if embed_dim is not None:
                        record.update({f"dim_{i}": None for i in range(embed_dim)})
                out_records.append(record)

            out_df = pd.DataFrame(out_records)
            out_df.to_csv("embeddings.csv", index=False)

            total = len(smiles_list)
            valid = total - len(invalid_smiles)
            invalid_count = len(invalid_smiles)

            if invalid_smiles:
                msg = (
                    f"{valid} SMILES processed successfully. "
                    f"{invalid_count} entr{'y' if invalid_count == 1 else 'ies'} could not be parsed by RDKit:\n"
                    + "\n".join(f"- {s}" for s in invalid_smiles)
                )
            else:
                msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."

            return msg, gr.update(value="embeddings.csv", visible=True)

        except Exception as e:
            return f"Error processing batch: {e}", gr.update(visible=False)

    # Single mode: a SMILES string was typed in
    smiles = smiles.strip()
    if not smiles:
        return "Please enter a SMILES or upload a CSV file.", gr.update(visible=False)
    try:
        vec = model.encode(smiles, return_torch=True)[0].tolist()
        cols = ["smiles"] + [f"dim_{i}" for i in range(len(vec))]
        df_out = pd.DataFrame([[smiles] + vec], columns=cols)
        df_out.to_csv("embeddings.csv", index=False)
        return json.dumps(vec), gr.update(value="embeddings.csv", visible=True)
    except Exception:
        return f"The following input '{smiles}' is not a valid SMILES.", gr.update(visible=False)


# 4) Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # SMI-TED-Embeddings-Extraction

        **Single mode:** paste a SMILES string in the left box.

        **Batch mode:** upload a CSV file where each row has a SMILES in the first column.

        - **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.
          _This is just a demo environment; for heavy-duty usage, please visit_
          https://github.com/IBM/materials/tree/main/models/smi_ted
          _to download the model and run your own experiments._
        - In both cases, an `embeddings.csv` file is generated for download, with the SMILES in the first column and the embedding values in the following columns.
        """
    )

    with gr.Row():
        smiles_in = gr.Textbox(label="SMILES (single mode)", placeholder="e.g. CCO")
        file_in = gr.File(label="SMILES CSV (batch mode)", file_types=[".csv"])

    generate_btn = gr.Button("Extract Embeddings")

    with gr.Row():
        output_msg = gr.Textbox(label="Message / Embedding (JSON)", interactive=False, lines=4)
        download_csv = gr.File(label="Download embeddings.csv", visible=False)

    generate_btn.click(
        fn=process_inputs,
        inputs=[smiles_in, file_in],
        outputs=[output_msg, download_csv],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
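# Illustrative note (comments only, nothing here is executed): a minimal CSV
# accepted by batch mode could look like the sketch below; "molecules.csv" is a
# hypothetical file name used only for this example.
#
#   Smiles
#   CCO
#   c1ccccc1
#   CC(=O)Oc1ccccc1C(=O)O
#
# Only the column named "smiles" (matched case-insensitively) is read, any other
# columns are ignored, and at most 1000 rows are processed per upload. The app
# writes the results to "embeddings.csv", with the SMILES in the first column
# and one "dim_<i>" column per embedding dimension.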