Spaces:
Running
Running
Enzo Reis de Oliveira
committed on
Commit
·
862c2e6
1
Parent(s):
c9e9b6b
Adding limit and message
Browse files
app.py
CHANGED
@@ -36,6 +36,13 @@ def process_inputs(smiles: str, file_obj):
|
|
36 |
smiles_col = smiles_cols[0]
|
37 |
smiles_list = df_in[smiles_col].astype(str).tolist()
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
out_records = []
|
40 |
invalid_smiles = []
|
41 |
embed_dim = None
|
@@ -44,38 +51,30 @@ def process_inputs(smiles: str, file_obj):
|
|
44 |
for sm in smiles_list:
|
45 |
try:
|
46 |
vec = model.encode(sm, return_torch=True)[0].tolist()
|
47 |
-
# guarda dimensão do vetor na primeira vez
|
48 |
if embed_dim is None:
|
49 |
embed_dim = len(vec)
|
50 |
-
# monta registro válido
|
51 |
record = {"smiles": sm}
|
52 |
record.update({f"dim_{i}": v for i, v in enumerate(vec)})
|
53 |
except Exception:
|
54 |
-
# marca como inválido
|
55 |
invalid_smiles.append(sm)
|
56 |
-
# se já souber quantos dims, preenche com None
|
57 |
if embed_dim is not None:
|
58 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
59 |
record.update({f"dim_{i}": None for i in range(embed_dim)})
|
60 |
else:
|
61 |
-
# ainda não sabemos quantos dims: só guarda smiles
|
62 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
63 |
out_records.append(record)
|
64 |
|
65 |
-
# converte para DataFrame (vai unificar todas as colunas)
|
66 |
out_df = pd.DataFrame(out_records)
|
67 |
out_df.to_csv("embeddings.csv", index=False)
|
68 |
|
69 |
-
# monta mensagem de saída
|
70 |
total = len(smiles_list)
|
71 |
valid = total - len(invalid_smiles)
|
|
|
72 |
if invalid_smiles:
|
73 |
-
invalid_count = len(invalid_smiles)
|
74 |
msg = (
|
75 |
f"{valid} SMILES processed successfully. "
|
76 |
-
f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} "
|
77 |
-
f"
|
78 |
-
+ "\n".join(f"- {sm}" for sm in invalid_smiles)
|
79 |
)
|
80 |
else:
|
81 |
msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
|
@@ -104,9 +103,15 @@ with gr.Blocks() as demo:
|
|
104 |
gr.Markdown(
|
105 |
"""
|
106 |
# SMI-TED-Embeddings-Extraction
|
|
|
107 |
**Single mode:** paste a SMILES string in the left box.
|
108 |
**Batch mode:** upload a CSV file where each row has a SMILES in the first column.
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
110 |
"""
|
111 |
)
|
112 |
|
|
|
36 |
smiles_col = smiles_cols[0]
|
37 |
smiles_list = df_in[smiles_col].astype(str).tolist()
|
38 |
|
39 |
+
# **novo**: limite de 1000 SMILES
|
40 |
+
if len(smiles_list) > 1000:
|
41 |
+
return (
|
42 |
+
f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
|
43 |
+
gr.update(visible=False),
|
44 |
+
)
|
45 |
+
|
46 |
out_records = []
|
47 |
invalid_smiles = []
|
48 |
embed_dim = None
|
|
|
51 |
for sm in smiles_list:
|
52 |
try:
|
53 |
vec = model.encode(sm, return_torch=True)[0].tolist()
|
|
|
54 |
if embed_dim is None:
|
55 |
embed_dim = len(vec)
|
|
|
56 |
record = {"smiles": sm}
|
57 |
record.update({f"dim_{i}": v for i, v in enumerate(vec)})
|
58 |
except Exception:
|
|
|
59 |
invalid_smiles.append(sm)
|
|
|
60 |
if embed_dim is not None:
|
61 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
62 |
record.update({f"dim_{i}": None for i in range(embed_dim)})
|
63 |
else:
|
|
|
64 |
record = {"smiles": f"SMILES {sm} was invalid"}
|
65 |
out_records.append(record)
|
66 |
|
|
|
67 |
out_df = pd.DataFrame(out_records)
|
68 |
out_df.to_csv("embeddings.csv", index=False)
|
69 |
|
|
|
70 |
total = len(smiles_list)
|
71 |
valid = total - len(invalid_smiles)
|
72 |
+
invalid_count = len(invalid_smiles)
|
73 |
if invalid_smiles:
|
|
|
74 |
msg = (
|
75 |
f"{valid} SMILES processed successfully. "
|
76 |
+
f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} could not be parsed by RDKit:\n"
|
77 |
+
+ "\n".join(f"- {s}" for s in invalid_smiles)
|
|
|
78 |
)
|
79 |
else:
|
80 |
msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
|
|
|
103 |
gr.Markdown(
|
104 |
"""
|
105 |
# SMI-TED-Embeddings-Extraction
|
106 |
+
|
107 |
**Single mode:** paste a SMILES string in the left box.
|
108 |
**Batch mode:** upload a CSV file where each row has a SMILES in the first column.
|
109 |
+
- **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.
|
110 |
+
_This is just a demo environment; for heavy-duty usage, please visit:_
|
111 |
+
https://github.com/IBM/materials/tree/main/models/smi_ted
|
112 |
+
to download the model and run your own experiments.
|
113 |
+
|
114 |
+
- In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
|
115 |
"""
|
116 |
)
|
117 |
|