import os
import sys
import json

import pandas as pd
import gradio as gr

# 1) Adjust the path before importing the loader
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INFERENCE_PATH = os.path.join(BASE_DIR, "smi-ted", "inference")
sys.path.insert(0, INFERENCE_PATH)

from smi_ted_light.load import load_smi_ted

# 2) Load the SMI-TED Light model
MODEL_DIR = os.path.join(INFERENCE_PATH, "smi_ted_light")
model = load_smi_ted(
    folder=MODEL_DIR,
    ckpt_filename="smi-ted-Light_40.pt",
    vocab_filename="bert_vocab_curated.txt",
)


# 3) Single/batch inference
def process_inputs(smiles: str, file_obj):
    # Batch mode: a CSV file was uploaded
    if file_obj is not None:
        try:
            # Auto-detect the delimiter (';', ',', etc.)
            df_in = pd.read_csv(file_obj.name, sep=None, engine="python")

            # Look for a "smiles" column (case-insensitive)
            smiles_cols = [c for c in df_in.columns if c.lower() == "smiles"]
            if not smiles_cols:
                return (
                    "Error: The CSV must have a column named 'smiles' (case-insensitive) containing the SMILES strings.",
                    gr.update(visible=False),
                )
            smiles_col = smiles_cols[0]
            smiles_list = df_in[smiles_col].astype(str).tolist()

            # Limit of 1000 SMILES per batch
            if len(smiles_list) > 1000:
                return (
                    f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
                    gr.update(visible=False),
                )

            out_records = []
            invalid_smiles = []
            embed_dim = None

            # Try to generate an embedding for each SMILES
            for sm in smiles_list:
                try:
                    vec = model.encode(sm, return_torch=True)[0].tolist()
                    if embed_dim is None:
                        embed_dim = len(vec)
                    record = {"smiles": sm}
                    record.update({f"dim_{i}": v for i, v in enumerate(vec)})
                except Exception:
                    invalid_smiles.append(sm)
                    record = {"smiles": f"SMILES {sm} was invalid"}
                    if embed_dim is not None:
                        record.update({f"dim_{i}": None for i in range(embed_dim)})
                out_records.append(record)

            out_df = pd.DataFrame(out_records)
            out_df.to_csv("embeddings.csv", index=False)

            total = len(smiles_list)
            valid = total - len(invalid_smiles)
            invalid_count = len(invalid_smiles)

            if invalid_smiles:
                msg = (
                    f"{valid} SMILES processed successfully. "
                    f"{invalid_count} entr{'y' if invalid_count == 1 else 'ies'} could not be parsed by RDKit:\n"
                    + "\n".join(f"- {s}" for s in invalid_smiles)
                )
            else:
                msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."

            return msg, gr.update(value="embeddings.csv", visible=True)

        except Exception as e:
            return f"Error processing batch: {e}", gr.update(visible=False)

    # Single mode: a SMILES string was typed in
    smiles = smiles.strip()
    if not smiles:
        return "Please enter a SMILES or upload a CSV file.", gr.update(visible=False)
    try:
        vec = model.encode(smiles, return_torch=True)[0].tolist()
        cols = ["smiles"] + [f"dim_{i}" for i in range(len(vec))]
        df_out = pd.DataFrame([[smiles] + vec], columns=cols)
        df_out.to_csv("embeddings.csv", index=False)
        return json.dumps(vec), gr.update(value="embeddings.csv", visible=True)
    except Exception:
        return f"The following input '{smiles}' is not a valid SMILES.", gr.update(visible=False)


# 4) Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # SMI-TED-Embeddings-Extraction

        **Single mode:** paste a SMILES string in the left box.

        **Batch mode:** upload a CSV file where each row has a SMILES in the first column.

        - **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.
          _This is just a demo environment; for heavy-duty usage, please visit_
          https://github.com/IBM/materials/tree/main/models/smi_ted
          _to download the model and run your own experiments._
        - In both cases, an `embeddings.csv` file is generated for download, with the SMILES in the first column and the embedding values in the following columns.
        """
    )

    with gr.Row():
        smiles_in = gr.Textbox(label="SMILES (single mode)", placeholder="e.g. CCO")
        file_in = gr.File(label="SMILES CSV (batch mode)", file_types=[".csv"])

    generate_btn = gr.Button("Extract Embeddings")

    with gr.Row():
        output_msg = gr.Textbox(label="Message / Embedding (JSON)", interactive=False, lines=4)
        download_csv = gr.File(label="Download embeddings.csv", visible=False)

    generate_btn.click(
        fn=process_inputs,
        inputs=[smiles_in, file_in],
        outputs=[output_msg, download_csv],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
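# Illustrative note (comments only, nothing here is executed): a minimal CSV
# accepted by batch mode could look like the sketch below; "molecules.csv" is a
# hypothetical file name used only for this example.
#
#   Smiles
#   CCO
#   c1ccccc1
#   CC(=O)Oc1ccccc1C(=O)O
#
# Only the column named "smiles" (matched case-insensitively) is read, any other
# columns are ignored, and at most 1000 rows are processed per upload. The app
# writes the results to "embeddings.csv", with the SMILES in the first column
# and one "dim_<i>" column per embedding dimension.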