Spaces:
Running
Running
Enzo Reis de Oliveira
committed on
Commit
·
5465560
1
Parent(s):
214fccd
Putting everything in english
Browse files
app.py
CHANGED
@@ -4,14 +4,14 @@ import json
|
|
4 |
import pandas as pd
|
5 |
import gradio as gr
|
6 |
|
7 |
-
# 1)
|
8 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
9 |
INFERENCE_PATH = os.path.join(BASE_DIR, "smi-ted", "inference")
|
10 |
sys.path.insert(0, INFERENCE_PATH)
|
11 |
|
12 |
from smi_ted_light.load import load_smi_ted
|
13 |
|
14 |
-
# 2)
|
15 |
MODEL_DIR = os.path.join(INFERENCE_PATH, "smi_ted_light")
|
16 |
model = load_smi_ted(
|
17 |
folder=MODEL_DIR,
|
@@ -19,9 +19,9 @@ model = load_smi_ted(
|
|
19 |
vocab_filename="bert_vocab_curated.txt",
|
20 |
)
|
21 |
|
22 |
-
# 3)
|
23 |
def process_inputs(smiles: str, file_obj):
|
24 |
-
#
|
25 |
if file_obj is not None:
|
26 |
try:
|
27 |
df_in = pd.read_csv(file_obj.name)
|
@@ -30,51 +30,51 @@ def process_inputs(smiles: str, file_obj):
|
|
30 |
for sm in smiles_list:
|
31 |
vec = model.encode(sm, return_torch=True)[0].tolist()
|
32 |
embeddings.append(vec)
|
33 |
-
#
|
34 |
out_df = pd.DataFrame(embeddings)
|
35 |
out_df.insert(0, "smiles", smiles_list)
|
36 |
out_df.to_csv("embeddings.csv", index=False)
|
37 |
-
msg = f"
|
38 |
return msg, gr.update(value="embeddings.csv", visible=True)
|
39 |
except Exception as e:
|
40 |
-
return f"
|
41 |
|
42 |
-
#
|
43 |
smiles = smiles.strip()
|
44 |
if not smiles:
|
45 |
-
return "
|
46 |
try:
|
47 |
vec = model.encode(smiles, return_torch=True)[0].tolist()
|
48 |
-
#
|
49 |
cols = ["smiles"] + [f"dim_{i}" for i in range(len(vec))]
|
50 |
df_out = pd.DataFrame([[smiles] + vec], columns=cols)
|
51 |
df_out.to_csv("embeddings.csv", index=False)
|
52 |
return json.dumps(vec), gr.update(value="embeddings.csv", visible=True)
|
53 |
except Exception as e:
|
54 |
-
return f"
|
55 |
|
56 |
-
# 4)
|
57 |
with gr.Blocks() as demo:
|
58 |
gr.Markdown(
|
59 |
"""
|
60 |
# SMI-TED Embedding Generator
|
61 |
-
**
|
62 |
-
**
|
63 |
-
|
64 |
"""
|
65 |
)
|
66 |
|
67 |
with gr.Row():
|
68 |
-
smiles_in = gr.Textbox(label="SMILES (
|
69 |
-
file_in = gr.File(label="CSV
|
70 |
|
71 |
-
|
72 |
|
73 |
with gr.Row():
|
74 |
-
output_msg
|
75 |
-
download_csv = gr.File(label="
|
76 |
|
77 |
-
|
78 |
fn=process_inputs,
|
79 |
inputs=[smiles_in, file_in],
|
80 |
outputs=[output_msg, download_csv]
|
|
|
4 |
import pandas as pd
|
5 |
import gradio as gr
|
6 |
|
7 |
+
# 1) Adjust path before importing the loader
|
8 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
9 |
INFERENCE_PATH = os.path.join(BASE_DIR, "smi-ted", "inference")
|
10 |
sys.path.insert(0, INFERENCE_PATH)
|
11 |
|
12 |
from smi_ted_light.load import load_smi_ted
|
13 |
|
14 |
+
# 2) Load the SMI-TED Light model
|
15 |
MODEL_DIR = os.path.join(INFERENCE_PATH, "smi_ted_light")
|
16 |
model = load_smi_ted(
|
17 |
folder=MODEL_DIR,
|
|
|
19 |
vocab_filename="bert_vocab_curated.txt",
|
20 |
)
|
21 |
|
22 |
+
# 3) Single function to process either a single SMILES or a CSV of SMILES
|
23 |
def process_inputs(smiles: str, file_obj):
|
24 |
+
# If a CSV file is provided, process in batch
|
25 |
if file_obj is not None:
|
26 |
try:
|
27 |
df_in = pd.read_csv(file_obj.name)
|
|
|
30 |
for sm in smiles_list:
|
31 |
vec = model.encode(sm, return_torch=True)[0].tolist()
|
32 |
embeddings.append(vec)
|
33 |
+
# Build output DataFrame
|
34 |
out_df = pd.DataFrame(embeddings)
|
35 |
out_df.insert(0, "smiles", smiles_list)
|
36 |
out_df.to_csv("embeddings.csv", index=False)
|
37 |
+
msg = f"Processed batch of {len(smiles_list)} SMILES. Download embeddings.csv."
|
38 |
return msg, gr.update(value="embeddings.csv", visible=True)
|
39 |
except Exception as e:
|
40 |
+
return f"Error processing batch: {e}", gr.update(visible=False)
|
41 |
|
42 |
+
# Otherwise, process a single SMILES
|
43 |
smiles = smiles.strip()
|
44 |
if not smiles:
|
45 |
+
return "Please enter a SMILES or upload a CSV file.", gr.update(visible=False)
|
46 |
try:
|
47 |
vec = model.encode(smiles, return_torch=True)[0].tolist()
|
48 |
+
# Save CSV with header
|
49 |
cols = ["smiles"] + [f"dim_{i}" for i in range(len(vec))]
|
50 |
df_out = pd.DataFrame([[smiles] + vec], columns=cols)
|
51 |
df_out.to_csv("embeddings.csv", index=False)
|
52 |
return json.dumps(vec), gr.update(value="embeddings.csv", visible=True)
|
53 |
except Exception as e:
|
54 |
+
return f"Error generating embedding: {e}", gr.update(visible=False)
|
55 |
|
56 |
+
# 4) Build the Gradio Blocks interface
|
57 |
with gr.Blocks() as demo:
|
58 |
gr.Markdown(
|
59 |
"""
|
60 |
# SMI-TED Embedding Generator
|
61 |
+
**Single mode:** paste a SMILES string in the left box.
|
62 |
+
**Batch mode:** upload a CSV file where each row has a SMILES in the first column.
|
63 |
+
In both cases, an `embeddings.csv` file will be generated for download, with the first column as SMILES and the embedding values in the following columns.
|
64 |
"""
|
65 |
)
|
66 |
|
67 |
with gr.Row():
|
68 |
+
smiles_in = gr.Textbox(label="SMILES (single mode)", placeholder="e.g. CCO")
|
69 |
+
file_in = gr.File(label="SMILES CSV (batch mode)", file_types=[".csv"])
|
70 |
|
71 |
+
generate_btn = gr.Button("Generate Embeddings")
|
72 |
|
73 |
with gr.Row():
|
74 |
+
output_msg = gr.Textbox(label="Message / Embedding (JSON)", interactive=False, lines=2)
|
75 |
+
download_csv = gr.File(label="Download embeddings.csv", visible=False)
|
76 |
|
77 |
+
generate_btn.click(
|
78 |
fn=process_inputs,
|
79 |
inputs=[smiles_in, file_in],
|
80 |
outputs=[output_msg, download_csv]
|