MooseML committed on
Commit
a1efe76
·
1 Parent(s): b12a328

Compact info expander and change permissions in Dockerfile

Browse files
Files changed (2) hide show
  1. Dockerfile +19 -70
  2. app.py +101 -223
Dockerfile CHANGED
@@ -1,90 +1,39 @@
1
- # Dockerfile for Hugging Face Space: Streamlit + RDKit + PyG
2
-
3
  FROM python:3.10-slim
4
 
5
- # System libraries (needed by RDKit / Pillow)
6
- RUN apt-get update && \
7
- apt-get install -y --no-install-recommends \
8
- build-essential \
9
- libxrender1 \
10
- libxext6 \
11
- libsm6 \
12
- libx11-6 \
13
- libglib2.0-0 \
14
- libfreetype6 \
15
- libpng-dev \
16
- wget && \
17
  rm -rf /var/lib/apt/lists/*
18
-
19
- # Create a non-root user to run the application
20
  RUN useradd -m appuser
21
 
22
- # Python packages
23
  RUN pip install --no-cache-dir --upgrade pip && \
24
  pip install --no-cache-dir \
25
- streamlit==1.45.0 \
26
- rdkit-pypi==2022.9.5 \
27
- pandas==2.2.3 \
28
- numpy==1.26.4 \
29
- torch==2.2.0 \
30
- torch-geometric==2.5.2 \
31
- ogb==1.3.6 \
32
- pillow==10.3.0
33
-
34
- # Set up working directory
35
- WORKDIR /app
36
 
37
- # Copy application files
 
38
  COPY . .
39
 
40
- # Fix permissions for temporary directories
41
- RUN mkdir -p /tmp/streamlit && \
42
- chmod -R 777 /tmp && \
43
- chmod -R 777 /tmp/streamlit && \
44
- # Also ensure the SQLite database directory is writable
45
- mkdir -p /data && \
46
- chmod -R 777 /data && \
47
- # Make sure the app files are readable
48
- chmod -R 755 /app
49
-
50
- # Ensure temp directories exist and are writable
51
- RUN mkdir -p /tmp/csv_uploads && \
52
- chmod -R 777 /tmp/csv_uploads
53
 
54
- # Set environment variables
55
  ENV DB_DIR=/data \
56
- TMPDIR=/tmp \
57
  STREAMLIT_SERVER_HEADLESS=true \
58
  STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
59
  STREAMLIT_SERVER_PORT=7860 \
60
  STREAMLIT_TELEMETRY_DISABLED=true \
61
  STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
62
- # Increase file upload size limit to accommodate larger CSVs
63
- STREAMLIT_SERVER_MAX_UPLOAD_SIZE=50
64
-
65
- # Expose the port Streamlit will run on
66
- EXPOSE 7860
67
-
68
- # Set entrypoint script
69
- COPY <<EOF /app/entrypoint.sh
70
- #!/bin/bash
71
- echo "Starting Streamlit app with debug info"
72
- echo "Current directory: $(pwd)"
73
- echo "Files in current directory: $(ls -la)"
74
- echo "Python version: $(python --version)"
75
- echo "Temp directory: $TMPDIR"
76
- echo "Temp directory exists: $([ -d $TMPDIR ] && echo 'Yes' || echo 'No')"
77
- echo "Temp directory permissions: $(ls -ld $TMPDIR)"
78
-
79
- # Run the app
80
- streamlit run app.py
81
- EOF
82
 
83
- # Make the entrypoint script executable
84
- RUN chmod +x /app/entrypoint.sh
85
 
86
- # Switch to the non-root user for better security
87
  USER appuser
88
-
89
- # Launch using the entrypoint script
90
- CMD ["/app/entrypoint.sh"]
 
1
+ # Dockerfile: Streamlit/RDKit/PyG (Hugging Face Spaces)
 
2
  FROM python:3.10-slim
3
 
4
+ # OS libs for RDKit drawing
5
+ RUN apt-get update && apt-get install -y --no-install-recommends \
6
+ build-essential libxrender1 libxext6 libsm6 libx11-6 \
7
+ libglib2.0-0 libfreetype6 libpng-dev wget && \
 
 
 
 
 
 
 
 
8
  rm -rf /var/lib/apt/lists/*
9
+
10
+ # Non-root user
11
  RUN useradd -m appuser
12
 
13
+ # Python packages
14
  RUN pip install --no-cache-dir --upgrade pip && \
15
  pip install --no-cache-dir \
16
+ streamlit==1.45.0 rdkit-pypi==2022.9.5 pandas==2.2.3 \
17
+ numpy==1.26.4 torch==2.2.0 torch-geometric==2.5.2 \
18
+ ogb==1.3.6 pillow==10.3.0
 
 
 
 
 
 
 
 
19
 
20
+ # Workdir and code
21
+ WORKDIR /app
22
  COPY . .
23
 
24
+ # Writable dirs with 775 perms
25
+ RUN install -d -m 775 /tmp/streamlit /data
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ # Environment
28
  ENV DB_DIR=/data \
 
29
  STREAMLIT_SERVER_HEADLESS=true \
30
  STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
31
  STREAMLIT_SERVER_PORT=7860 \
32
  STREAMLIT_TELEMETRY_DISABLED=true \
33
  STREAMLIT_BROWSER_GATHER_USAGE_STATS=false \
34
+ STREAMLIT_SERVER_MAX_UPLOAD_SIZE=50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ EXPOSE 7860
 
37
 
 
38
  USER appuser
39
+ CMD ["streamlit", "run", "app.py"]
 
 
app.py CHANGED
@@ -1,87 +1,53 @@
1
- import streamlit as st
 
 
 
2
  import pandas as pd
 
3
  import torch
4
- import sqlite3
5
- from datetime import datetime
6
  from rdkit import Chem
7
  from rdkit.Chem import Draw
8
- import os, pathlib, sys
9
- import tempfile
10
- from io import StringIO, BytesIO
11
  from model import load_model
12
  from utils import smiles_to_data
13
- from torch_geometric.loader import DataLoader
14
 
15
- # Config
16
- DEVICE = "cpu"
17
- RDKIT_DIM = 6
18
- MODEL_PATH = "best_hybridgnn.pt"
19
- MAX_DISPLAY = 10
20
-
21
- # Debug sidebar
22
- with st.sidebar:
23
- st.title("Debug Tools")
24
- if st.button("Show Environment Info"):
25
- st.write("### System Info")
26
- st.write(f"Python version: {sys.version}")
27
- st.write(f"Current working directory: {os.getcwd()}")
28
- st.write(f"Temp directory: {tempfile.gettempdir()}")
29
- st.write(f"Temp dir exists: {os.path.exists(tempfile.gettempdir())}")
30
- st.write(f"Temp dir writable: {os.access(tempfile.gettempdir(), os.W_OK)}")
31
- st.write(f"Current user: {os.getenv('USER', 'unknown')}")
32
-
33
- try:
34
- st.write("### Directory Contents")
35
- st.write(f"Files in current directory: {os.listdir('.')}")
36
- st.write(f"Files in /tmp: {os.listdir('/tmp')}")
37
- except Exception as e:
38
- st.error(f"Error listing directories: {e}")
39
-
40
- st.write("### Environment Variables")
41
- for key, value in os.environ.items():
42
- if not key.startswith(('AWS', 'SECRET')): # Skip sensitive vars
43
- st.write(f"{key}: {value}")
44
-
45
- # Load Model
46
  @st.cache_resource
47
- def load_cached_model():
48
- try:
49
- return load_model(rdkit_dim=RDKIT_DIM, path=MODEL_PATH, device=DEVICE)
50
- except Exception as e:
51
- st.error(f"Error loading model: {e}")
52
- return None
53
 
54
- model = load_cached_model()
55
 
56
- # SQLite Setup
57
- DB_DIR = os.getenv("DB_DIR", "/tmp")
58
- pathlib.Path(DB_DIR).mkdir(parents=True, exist_ok=True)
59
 
60
  @st.cache_resource
61
  def init_db():
62
- try:
63
- db_file = os.path.join(DB_DIR, "predictions.db")
64
- conn = sqlite3.connect(db_file, check_same_thread=False)
65
- c = conn.cursor()
66
- c.execute("""
67
- CREATE TABLE IF NOT EXISTS predictions (
68
- id INTEGER PRIMARY KEY AUTOINCREMENT,
69
- smiles TEXT,
70
- prediction REAL,
71
- timestamp TEXT
72
- )
73
- """)
74
- conn.commit()
75
- return conn
76
- except Exception as e:
77
- st.error(f"Database initialization error: {e}")
78
- return None
79
-
80
- conn = init_db()
81
- if conn:
82
- cursor = conn.cursor()
83
-
84
- # Streamlit UI
85
  st.title("HOMO-LUMO Gap Predictor")
86
  st.markdown("""
87
  This app predicts the HOMO-LUMO energy gap for molecules using a trained Graph Neural Network (GNN).
@@ -94,160 +60,72 @@ This app predicts the HOMO-LUMO energy gap for molecules using a trained Graph N
94
  - The app will display predictions and molecule images (up to 10 shown at once).
95
  """)
96
 
97
- # File handling with caching
98
- @st.cache_data
99
- def read_csv_file(file_content):
100
- """Cache the file reading operation"""
101
- try:
102
- # Try to read as string first
103
- if isinstance(file_content, str):
104
- df = pd.read_csv(StringIO(file_content), comment="#")
105
- else:
106
- # If it's bytes, decode it
107
- df = pd.read_csv(StringIO(file_content.decode('utf-8')), comment="#")
108
- return df, None
109
- except Exception as e:
110
- return None, str(e)
111
-
112
- # Debug container for file upload messages
113
- file_debug = st.container()
114
-
115
- # File uploader outside the form
116
- uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
117
- if uploaded_file is not None:
118
- with file_debug:
119
- st.write(f"File name: {uploaded_file.name}")
120
- st.write(f"File type: {uploaded_file.type}")
121
- st.write(f"File size: {uploaded_file.size} bytes")
122
-
123
- with st.form("input_form"):
124
- smiles_input = st.text_area(
125
- "Enter SMILES string(s)",
126
- placeholder="C1=CC=CC=C1, CC(=O)Oc1ccccc1C(=O)O",
127
- height=120
128
- )
129
- run_button = st.form_submit_button("Submit")
130
 
 
131
  smiles_list = []
132
-
133
- # Process after the button press
134
- if run_button:
135
- # CSV path
136
- if uploaded_file is not None:
137
- with file_debug:
138
- st.write("### Processing CSV file")
139
- try:
140
- # Save file temporarily for debugging
141
- temp_file = os.path.join(tempfile.gettempdir(), uploaded_file.name)
142
- with open(temp_file, 'wb') as f:
143
- f.write(uploaded_file.getvalue())
144
- st.write(f"Saved temporary file at: {temp_file}")
145
- st.write(f"File exists: {os.path.exists(temp_file)}")
146
- st.write(f"File size on disk: {os.path.getsize(temp_file)} bytes")
147
-
148
- # Read file content
149
- file_content = uploaded_file.getvalue()
150
- st.write(f"Read {len(file_content)} bytes from file")
151
-
152
- # Try to decode first few bytes
153
- preview = file_content[:100] if len(file_content) > 100 else file_content
154
- try:
155
- decoded_preview = preview.decode('utf-8')
156
- st.write(f"File preview (decoded): {decoded_preview}")
157
- except:
158
- st.write(f"File preview (hex): {preview.hex()}")
159
-
160
- # Use cached reading function
161
- df, error = read_csv_file(file_content)
162
-
163
- if error:
164
- st.error(f"CSV reading error: {error}")
165
- elif df is not None:
166
- st.write(f"CSV loaded with {df.shape[0]} rows and {df.shape[1]} columns")
167
- st.write("CSV columns:", df.columns.tolist())
168
- st.write("First few rows:", df.head())
169
-
170
- # choose the SMILES column
171
- if df.shape[1] == 1:
172
- smiles_col = df.iloc[:, 0]
173
- st.write("Using the only column for SMILES")
174
- elif "smiles" in [c.lower() for c in df.columns]:
175
- col_name = [c for c in df.columns if c.lower() == "smiles"][0]
176
- smiles_col = df[col_name]
177
- st.write(f"Using column '{col_name}' for SMILES")
178
- else:
179
- st.error(f"CSV must have a single column or a column named 'SMILES'. Found columns: {', '.join(df.columns)}")
180
- st.write("Using first column as fallback")
181
- smiles_col = df.iloc[:, 0]
182
-
183
- smiles_list = smiles_col.dropna().astype(str).tolist()
184
- st.success(f"{len(smiles_list)} SMILES loaded from CSV")
185
- if smiles_list:
186
- st.write("First few SMILES:", smiles_list[:5])
187
- else:
188
- st.error("Failed to process CSV: DataFrame is None")
189
- except Exception as e:
190
- st.error(f"Critical error processing CSV: {str(e)}")
191
- st.exception(e) # This shows the full traceback
192
-
193
- # Textarea path
194
- elif smiles_input.strip():
195
- raw_input = smiles_input.replace("\n", ",")
196
- smiles_list = [s.strip() for s in raw_input.split(",") if s.strip()]
197
- st.success(f"{len(smiles_list)} SMILES parsed from text")
198
- if smiles_list:
199
- st.write("First few SMILES:", smiles_list[:5])
200
-
201
- # Run Inference
202
- if smiles_list:
203
- with st.spinner("Processing molecules..."):
204
  try:
205
- data_list = smiles_to_data(smiles_list, device=DEVICE)
206
-
207
- # Filter only valid molecules and keep aligned SMILES
208
- valid_pairs = [(smi, data) for smi, data in zip(smiles_list, data_list) if data is not None]
209
-
210
- if not valid_pairs:
211
- st.warning("No valid molecules found")
212
  else:
213
- valid_smiles, valid_data = zip(*valid_pairs)
214
- loader = DataLoader(valid_data, batch_size=64)
215
- predictions = []
216
-
217
- for batch in loader:
218
- batch = batch.to(DEVICE)
219
- with torch.no_grad():
220
- pred = model(batch).view(-1).cpu().numpy()
221
- predictions.extend(pred.tolist())
222
-
223
- # Display Results
224
- st.subheader(f"Predictions (showing up to {MAX_DISPLAY} molecules):")
225
-
226
- for i, (smi, pred) in enumerate(zip(valid_smiles, predictions)):
227
- if i >= MAX_DISPLAY:
228
- st.info(f"...only showing the first {MAX_DISPLAY} molecules")
229
- break
230
-
231
- mol = Chem.MolFromSmiles(smi)
232
- if mol:
233
- st.image(Draw.MolToImage(mol, size=(250, 250)))
234
- st.write(f"**SMILES**: `{smi}`")
235
- st.write(f"**Predicted HOMO-LUMO Gap**: `{pred:.4f} eV`")
236
-
237
- # Log to SQLite if connection exists
238
- if conn:
239
- cursor.execute("INSERT INTO predictions (smiles, prediction, timestamp) VALUES (?, ?, ?)",
240
- (smi, pred, str(datetime.now())))
241
- conn.commit()
242
-
243
- # Download Results
244
- result_df = pd.DataFrame({"SMILES": valid_smiles,
245
- "Predicted HOMO-LUMO Gap (eV)": [round(p, 4) for p in predictions]})
246
-
247
- st.download_button(label="Download Predictions as CSV",
248
- data=result_df.to_csv(index=False).encode('utf-8'),
249
- file_name="homolumo_predictions.csv",
250
- mime="text/csv")
251
  except Exception as e:
252
- st.error(f"Error during inference: {str(e)}")
253
- st.exception(e) # This shows the full traceback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, pathlib, sqlite3, sys, tempfile
2
+ from datetime import datetime
3
+ from io import StringIO
4
+
5
  import pandas as pd
6
+ import streamlit as st
7
  import torch
 
 
8
  from rdkit import Chem
9
  from rdkit.Chem import Draw
10
+ from torch_geometric.loader import DataLoader
11
+
 
12
  from model import load_model
13
  from utils import smiles_to_data
 
14
 
15
+ # Config
16
+ DEVICE, RDKIT_DIM, MODEL_PATH, MAX_DISPLAY = "cpu", 6, "best_hybridgnn.pt", 10
17
+
18
+ # Model & DB (cached)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  @st.cache_resource
20
+ def get_model():
21
+ return load_model(rdkit_dim=RDKIT_DIM, path=MODEL_PATH, device=DEVICE)
 
 
 
 
22
 
23
+ model = get_model()
24
 
25
+ DB_DIR = pathlib.Path(os.getenv("DB_DIR", "/tmp"))
26
+ DB_DIR.mkdir(parents=True, exist_ok=True)
 
27
 
28
  @st.cache_resource
29
  def init_db():
30
+ conn = sqlite3.connect(DB_DIR / "predictions.db", check_same_thread=False)
31
+ conn.execute(
32
+ """CREATE TABLE IF NOT EXISTS predictions(
33
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
34
+ smiles TEXT, prediction REAL, timestamp TEXT)"""
35
+ )
36
+ conn.commit()
37
+ return conn
38
+
39
+ conn = init_db()
40
+ cursor = conn.cursor()
41
+
42
+ # debug and info panel
43
+ with st.sidebar.expander("Info & Env", expanded=False):
44
+ st.write(f"Python {sys.version.split()[0]}")
45
+ st.write(f"Temp dir: `{tempfile.gettempdir()}` "
46
+ f"({'writable' if os.access(tempfile.gettempdir(), os.W_OK) else 'read-only'})")
47
+ if "csv_bytes" in st.session_state:
48
+ st.write(f"Last upload: **{len(st.session_state['csv_bytes'])/1024:.1f} KB**")
49
+
50
+ # Header
 
 
51
  st.title("HOMO-LUMO Gap Predictor")
52
  st.markdown("""
53
  This app predicts the HOMO-LUMO energy gap for molecules using a trained Graph Neural Network (GNN).
 
60
  - The app will display predictions and molecule images (up to 10 shown at once).
61
  """)
62
 
63
+ # File uploader (outside form)
64
+ csv_file = st.file_uploader("CSV with SMILES", type=["csv"])
65
+ if csv_file is not None:
66
+ st.session_state["csv_bytes"] = csv_file.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
+ # Input form
69
  smiles_list = []
70
+ with st.form("main_form"):
71
+ smiles_text = st.text_area("…or paste SMILES (comma/newline separated)",
72
+ placeholder="CC(=O)Oc1ccccc1C(=O)O",
73
+ height=120)
74
+ run = st.form_submit_button("Run Prediction")
75
+
76
+ # Parse input
77
+ if run:
78
+ if "csv_bytes" in st.session_state:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  try:
80
+ df = pd.read_csv(StringIO(st.session_state["csv_bytes"].decode("utf-8")), comment="#")
81
+ col = df.columns[0] if df.shape[1] == 1 else next((c for c in df.columns if c.lower() == "smiles"), None)
82
+ if col is None:
83
+ st.error("CSV needs one column or a 'SMILES' column")
 
 
 
84
  else:
85
+ smiles_list = df[col].dropna().astype(str).tolist()
86
+ st.success(f"{len(smiles_list)} SMILES loaded from CSV")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  except Exception as e:
88
+ st.error(f"CSV error: {e}")
89
+
90
+ elif smiles_text.strip():
91
+ smiles_list = [s.strip() for s in smiles_text.replace("\n", ",").split(",") if s.strip()]
92
+ st.success(f"{len(smiles_list)} SMILES parsed from textbox")
93
+ else:
94
+ st.warning("No input provided")
95
+
96
+ # Inference & display
97
+ if smiles_list:
98
+ data_list = smiles_to_data(smiles_list, device=DEVICE)
99
+ valid = [(s, d) for s, d in zip(smiles_list, data_list) if d is not None]
100
+
101
+ if not valid:
102
+ st.warning("No valid molecules")
103
+ else:
104
+ vsmi, vdata = zip(*valid)
105
+ preds = []
106
+ for batch in DataLoader(vdata, batch_size=64):
107
+ with torch.no_grad():
108
+ preds.extend(get_model()(batch.to(DEVICE)).view(-1).cpu().numpy().tolist())
109
+
110
+ st.subheader(f"Results (first {MAX_DISPLAY})")
111
+ for i, (smi, pred) in enumerate(zip(vsmi, preds)):
112
+ if i >= MAX_DISPLAY:
113
+ st.info("…truncated")
114
+ break
115
+ mol = Chem.MolFromSmiles(smi)
116
+ if mol:
117
+ st.image(Draw.MolToImage(mol, size=(250, 250)))
118
+ st.write(f"`{smi}` → **{pred:.4f} eV**")
119
+
120
+ cursor.execute(
121
+ "INSERT INTO predictions(smiles, prediction, timestamp) VALUES (?,?,?)",
122
+ (smi, float(pred), datetime.now().isoformat()),
123
+ )
124
+ conn.commit()
125
+
126
+ st.download_button("Download CSV",
127
+ pd.DataFrame(
128
+ {"SMILES": vsmi, "Gap (eV)": [round(p, 4) for p in preds]}
129
+ ).to_csv(index=False).encode(),
130
+ "homolumo_predictions.csv",
131
+ "text/csv")