Spaces:

ibm-research
/

SMI-TED-demo1

Running

App Files Files Community

Enzo Reis de Oliveira committed on 15 days ago

Commit

862c2e6

1 Parent(s): c9e9b6b

Adding limit and message

Browse files

Files changed (1) hide show

app.py +17 -12

app.py CHANGED Viewed

@@ -36,6 +36,13 @@ def process_inputs(smiles: str, file_obj):
             smiles_col = smiles_cols[0]
             smiles_list = df_in[smiles_col].astype(str).tolist()
             out_records = []
             invalid_smiles = []
             embed_dim = None
@@ -44,38 +51,30 @@ def process_inputs(smiles: str, file_obj):
             for sm in smiles_list:
                 try:
                     vec = model.encode(sm, return_torch=True)[0].tolist()
-                    # guarda dimensão do vetor na primeira vez
                     if embed_dim is None:
                         embed_dim = len(vec)
-                    # monta registro válido
                     record = {"smiles": sm}
                     record.update({f"dim_{i}": v for i, v in enumerate(vec)})
                 except Exception:
-                    # marca como inválido
                     invalid_smiles.append(sm)
-                    # se já souber quantos dims, preenche com None
                     if embed_dim is not None:
                         record = {"smiles": f"SMILES {sm} was invalid"}
                         record.update({f"dim_{i}": None for i in range(embed_dim)})
                     else:
-                        # ainda não sabemos quantos dims: só guarda smiles
                         record = {"smiles": f"SMILES {sm} was invalid"}
                 out_records.append(record)
-            # converte para DataFrame (vai unificar todas as colunas)
             out_df = pd.DataFrame(out_records)
             out_df.to_csv("embeddings.csv", index=False)
-            # monta mensagem de saída
             total = len(smiles_list)
             valid = total - len(invalid_smiles)
             if invalid_smiles:
-                invalid_count = len(invalid_smiles)
                 msg = (
                     f"{valid} SMILES processed successfully. "
-                    f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} "
-                    f"could not be parsed by RDKit:\n"
-                    + "\n".join(f"- {sm}" for sm in invalid_smiles)
                 )
             else:
                 msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
@@ -104,9 +103,15 @@ with gr.Blocks() as demo:
     gr.Markdown(
         """
         # SMI-TED-Embeddings-Extraction
         **Single mode:** paste a SMILES string in the left box.
         **Batch mode:** upload a CSV file where each row has a SMILES in the first column.
-        In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
         """
     )

             smiles_col = smiles_cols[0]
             smiles_list = df_in[smiles_col].astype(str).tolist()
+            # **novo**: limite de 1000 SMILES
+            if len(smiles_list) > 1000:
+                return (
+                    f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
+                    gr.update(visible=False),
+                )
             out_records = []
             invalid_smiles = []
             embed_dim = None
             for sm in smiles_list:
                 try:
                     vec = model.encode(sm, return_torch=True)[0].tolist()
                     if embed_dim is None:
                         embed_dim = len(vec)
                     record = {"smiles": sm}
                     record.update({f"dim_{i}": v for i, v in enumerate(vec)})
                 except Exception:
                     invalid_smiles.append(sm)
                     if embed_dim is not None:
                         record = {"smiles": f"SMILES {sm} was invalid"}
                         record.update({f"dim_{i}": None for i in range(embed_dim)})
                     else:
                         record = {"smiles": f"SMILES {sm} was invalid"}
                 out_records.append(record)
             out_df = pd.DataFrame(out_records)
             out_df.to_csv("embeddings.csv", index=False)
             total = len(smiles_list)
             valid = total - len(invalid_smiles)
+            invalid_count = len(invalid_smiles)
             if invalid_smiles:
                 msg = (
                     f"{valid} SMILES processed successfully. "
+                    f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} could not be parsed by RDKit:\n"
+                    + "\n".join(f"- {s}" for s in invalid_smiles)
                 )
             else:
                 msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
     gr.Markdown(
         """
         # SMI-TED-Embeddings-Extraction
         **Single mode:** paste a SMILES string in the left box.
         **Batch mode:** upload a CSV file where each row has a SMILES in the first column.
+        - **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.
+        _This is just a demo environment; for heavy-duty usage, please visit:_
+        https://github.com/IBM/materials/tree/main/models/smi_ted
+        to download the model and run your own experiments.
+        - In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
         """
     )