Enzo Reis de Oliveira committed on
Commit
862c2e6
·
1 Parent(s): c9e9b6b

Adding limit and message

Browse files
Files changed (1) hide show
  1. app.py +17 -12
app.py CHANGED
@@ -36,6 +36,13 @@ def process_inputs(smiles: str, file_obj):
36
  smiles_col = smiles_cols[0]
37
  smiles_list = df_in[smiles_col].astype(str).tolist()
38
 
 
 
 
 
 
 
 
39
  out_records = []
40
  invalid_smiles = []
41
  embed_dim = None
@@ -44,38 +51,30 @@ def process_inputs(smiles: str, file_obj):
44
  for sm in smiles_list:
45
  try:
46
  vec = model.encode(sm, return_torch=True)[0].tolist()
47
- # guarda dimensão do vetor na primeira vez
48
  if embed_dim is None:
49
  embed_dim = len(vec)
50
- # monta registro válido
51
  record = {"smiles": sm}
52
  record.update({f"dim_{i}": v for i, v in enumerate(vec)})
53
  except Exception:
54
- # marca como inválido
55
  invalid_smiles.append(sm)
56
- # se já souber quantos dims, preenche com None
57
  if embed_dim is not None:
58
  record = {"smiles": f"SMILES {sm} was invalid"}
59
  record.update({f"dim_{i}": None for i in range(embed_dim)})
60
  else:
61
- # ainda não sabemos quantos dims: só guarda smiles
62
  record = {"smiles": f"SMILES {sm} was invalid"}
63
  out_records.append(record)
64
 
65
- # converte para DataFrame (vai unificar todas as colunas)
66
  out_df = pd.DataFrame(out_records)
67
  out_df.to_csv("embeddings.csv", index=False)
68
 
69
- # monta mensagem de saída
70
  total = len(smiles_list)
71
  valid = total - len(invalid_smiles)
 
72
  if invalid_smiles:
73
- invalid_count = len(invalid_smiles)
74
  msg = (
75
  f"{valid} SMILES processed successfully. "
76
- f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} "
77
- f"could not be parsed by RDKit:\n"
78
- + "\n".join(f"- {sm}" for sm in invalid_smiles)
79
  )
80
  else:
81
  msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
@@ -104,9 +103,15 @@ with gr.Blocks() as demo:
104
  gr.Markdown(
105
  """
106
  # SMI-TED-Embeddings-Extraction
 
107
  **Single mode:** paste a SMILES string in the left box.
108
  **Batch mode:** upload a CSV file where each row has a SMILES in the first column.
109
- In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
 
 
 
 
 
110
  """
111
  )
112
 
 
36
  smiles_col = smiles_cols[0]
37
  smiles_list = df_in[smiles_col].astype(str).tolist()
38
 
39
+ # **novo**: limite de 1000 SMILES
40
+ if len(smiles_list) > 1000:
41
+ return (
42
+ f"Error: Maximum 1000 SMILES allowed per batch (you provided {len(smiles_list)}).",
43
+ gr.update(visible=False),
44
+ )
45
+
46
  out_records = []
47
  invalid_smiles = []
48
  embed_dim = None
 
51
  for sm in smiles_list:
52
  try:
53
  vec = model.encode(sm, return_torch=True)[0].tolist()
 
54
  if embed_dim is None:
55
  embed_dim = len(vec)
 
56
  record = {"smiles": sm}
57
  record.update({f"dim_{i}": v for i, v in enumerate(vec)})
58
  except Exception:
 
59
  invalid_smiles.append(sm)
 
60
  if embed_dim is not None:
61
  record = {"smiles": f"SMILES {sm} was invalid"}
62
  record.update({f"dim_{i}": None for i in range(embed_dim)})
63
  else:
 
64
  record = {"smiles": f"SMILES {sm} was invalid"}
65
  out_records.append(record)
66
 
 
67
  out_df = pd.DataFrame(out_records)
68
  out_df.to_csv("embeddings.csv", index=False)
69
 
 
70
  total = len(smiles_list)
71
  valid = total - len(invalid_smiles)
72
+ invalid_count = len(invalid_smiles)
73
  if invalid_smiles:
 
74
  msg = (
75
  f"{valid} SMILES processed successfully. "
76
+ f"{invalid_count} entr{'y' if invalid_count==1 else 'ies'} could not be parsed by RDKit:\n"
77
+ + "\n".join(f"- {s}" for s in invalid_smiles)
 
78
  )
79
  else:
80
  msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
 
103
  gr.Markdown(
104
  """
105
  # SMI-TED-Embeddings-Extraction
106
+
107
  **Single mode:** paste a SMILES string in the left box.
108
  **Batch mode:** upload a CSV file where each row has a SMILES in the first column.
109
+ - **Maximum 1000 SMILES per batch.** Processing time increases with batch size due to Hugging Face environment limits.
110
+ _This is just a demo environment; for heavy-duty usage, please visit:_
111
+ https://github.com/IBM/materials/tree/main/models/smi_ted
112
+ to download the model and run your own experiments.
113
+
114
+ - In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
115
  """
116
  )
117