Spaces:

LeMaterial
/

materials_explorer

Running

App Files Files Community

Ramlaoui committed on Dec 5, 2024

Commit

2dd66b7

1 Parent(s): a91474f

Faster search and Table

Browse files

Files changed (4) hide show

Dockerfile +5 -0
app.py +101 -42
create_index.py +52 -0
requirements.txt +1 -0

Dockerfile CHANGED Viewed

@@ -19,6 +19,9 @@ RUN pip install --no-cache-dir -r requirements.txt
 # Copy the application code
 COPY app.py .
 # Expose the port Dash will run on
 EXPOSE 7860
@@ -29,6 +32,8 @@ RUN --mount=type=secret,id=MATERIALS_PROJECT_API_KEY \
 # Create the cache directory and set permissions
 RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
 # Set an environment variable for Hugging Face cache
 ENV HF_HOME=/app/.cache

 # Copy the application code
 COPY app.py .
+# Copy the preprocessing script
+COPY create_index.py .
 # Expose the port Dash will run on
 EXPOSE 7860
 # Create the cache directory and set permissions
 RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
+# Create the index
+RUN python create_index.py
 # Set an environment variable for Hugging Face cache
 ENV HF_HOME=/app/.cache

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from pymatgen.core import Structure
 from pymatgen.ext.matproj import MPRester
 HF_TOKEN = os.environ.get("HF_TOKEN")
 # Load only the train split of the dataset
 dataset = load_dataset(
@@ -40,9 +41,40 @@ dataset = load_dataset(
     ],
 )
-# Convert the train split to a pandas DataFrame
-train_df = dataset.to_pandas()
-del dataset
 # Initialize the Dash app
 app = dash.Dash(__name__, assets_folder=SETTINGS.ASSETS_PATH)
@@ -58,11 +90,11 @@ layout = html.Div(
                     [
                         html.H3("Search for materials by elements (eg. 'Ac,Cd,Ge')"),
                         dmp.MaterialsInput(
-                            allowedInputTypes=["elements"],
                             hidePeriodicTable=False,
                             periodicTableMode="toggle",
                             showSubmitButton=True,
-                            submitButtonText="Submit",
                             type="elements",
                             id="materials-input",
                         ),
@@ -79,10 +111,24 @@ layout = html.Div(
         html.Div(
             [
                 html.Label("Select Material"),
-                dcc.Dropdown(
-                    id="material-dropdown",
-                    options=[],  # Empty options initially
-                    value=None,
                 ),
             ],
             style={"margin-bottom": "20px"},
@@ -118,40 +164,51 @@ layout = html.Div(
 )
-# Function to search for materials
 def search_materials(query):
-    element_list = [el.strip() for el in query.split(",")]
-    isubset = lambda x: set(x).issubset(element_list)
-    isintersection = lambda x: len(set(x).intersection(element_list)) > 0
-    entries_df = train_df[
-        [isintersection(l) and isubset(l) for l in train_df.elements.values.tolist()]
-    ]
-    options = [
-        {
-            "label": f"{res.chemical_formula_reduced} ({res.immutable_id}) Calculated with {res.functional}",
-            "value": n,
-        }
-        for n, res in entries_df.iterrows()
-    ]
-    del entries_df
     return options
-# Callback to update the material dropdown based on search
 @app.callback(
-    Output("material-dropdown", "options"),
-    Output("material-dropdown", "value"),
     Input("materials-input", "submitButtonClicks"),
     Input("materials-input", "value"),
 )
 def on_submit_materials_input(n_clicks, query):
     if n_clicks is None or not query:
-        return [], None
-    options = search_materials(query)
-    if not options:
-        return [], None
-    return options, options[0]["value"]
 # Callback to display the selected material
@@ -161,12 +218,14 @@ def on_submit_materials_input(n_clicks, query):
         Output("properties-container", "children"),
     ],
     Input("display-button", "n_clicks"),
-    State("material-dropdown", "value"),
 )
-def display_material(n_clicks, material_id):
-    if n_clicks is None or not material_id:
         return "", ""
-    row = train_df.iloc[material_id]
     structure = Structure(
         [x for y in row["lattice_vectors"] for x in y],
@@ -180,11 +239,11 @@ def display_material(n_clicks, material_id):
     # Extract key properties
     properties = {
-        "Material ID": row.immutable_id,
-        "Formula": row.chemical_formula_descriptive,
-        "Energy per atom (eV/atom)": row.energy / len(row.species_at_sites),
-        "Band Gap (eV)": row.band_gap_direct or row.band_gap_indirect,
-        "Total Magnetization (μB/f.u.)": row.total_magnetization,
     }
     # Format properties as an HTML table

 from pymatgen.ext.matproj import MPRester
 HF_TOKEN = os.environ.get("HF_TOKEN")
+top_k = 100
 # Load only the train split of the dataset
 dataset = load_dataset(
     ],
 )
# Dataset columns shown in the results table, in display order.
display_columns = [
    "chemical_formula_descriptive",
    "functional",
    "immutable_id",
    "energy",
]

# Human-readable table header for each entry of ``display_columns``.
display_names = {
    "chemical_formula_descriptive": "Formula",
    "functional": "Functional",
    "immutable_id": "Material ID",
    "energy": "Energy (eV)",
}

# Maps a row position in the results table to the matching row position in
# ``dataset``.  It is cleared and refilled by every call to
# ``search_materials`` and read back when a table cell is selected.
# NOTE(review): this is module-level state, so it looks shared by every
# session served by this process -- confirm that is acceptable.
mapping_table_idx_dataset_idx = {}

import numpy as np
import periodictable

# Element symbol -> position of that element when iterating
# ``periodictable.elements``; used to address columns of ``dataset_index``.
map_periodic_table = {v.symbol: k for k, v in enumerate(periodictable.elements)}

# Per-row element-count matrix.  It is produced by create_index.py (run during
# the Docker build) rather than rebuilt here, which keeps start-up fast.
dataset_index = np.load("dataset_index.npy")
 # Initialize the Dash app
 app = dash.Dash(__name__, assets_folder=SETTINGS.ASSETS_PATH)
                     [
                         html.H3("Search for materials by elements (eg. 'Ac,Cd,Ge')"),
                         dmp.MaterialsInput(
+                            allowedInputTypes=["elements", "formula"],
                             hidePeriodicTable=False,
                             periodicTableMode="toggle",
                             showSubmitButton=True,
+                            submitButtonText="Search",
                             type="elements",
                             id="materials-input",
                         ),
         html.Div(
             [
                 html.Label("Select Material"),
+                # dcc.Dropdown(
+                #     id="material-dropdown",
+                #     options=[],  # Empty options initially
+                #     value=None,
+                # ),
+                dash.dash_table.DataTable(
+                    id="table",
+                    columns=[
+                        {"name": display_names[col], "id": col}
+                        for col in display_columns
+                    ],
+                    data=[{}],
+                    style_table={
+                        "overflowX": "auto",
+                        "height": "400px",
+                        "overflowY": "auto",
+                    },
+                    style_cell={"textAlign": "left"},
                 ),
             ],
             style={"margin-bottom": "20px"},
 )
 def search_materials(query):
     """Return up to ``top_k`` dataset rows ranked by similarity to ``query``.

     ``query`` is either a comma-separated list of element symbols
     (``"Ac,Cd,Ge"``) or a chemical formula (``"Fe2O3"``).  It is converted to
     an element-count vector and compared with every row of ``dataset_index``
     using cosine similarity.

     Side effect: ``mapping_table_idx_dataset_idx`` is cleared and refilled so
     that result position ``i`` maps to the row number of that result in
     ``dataset``.

     Returns an empty list when the query contains no recognised element.
     """
     import re

     n_features = dataset_index.shape[1]
     query_vector = np.zeros(n_features)

     def column_of(symbol):
         # Column of ``symbol`` in the index, or None when the symbol is not a
         # known element or lies outside the columns of ``dataset_index``.
         position = map_periodic_table.get(symbol)
         if position is None or position >= n_features:
             return None
         return position

     if "," in query:
         # Element list: presence only, so duplicates still count once.
         for symbol in (token.strip() for token in query.split(",")):
             column = column_of(symbol)
             if column is not None:
                 query_vector[column] = 1
     else:
         # Formula: accumulate counts so repeated elements (e.g. "CH3COOH")
         # add up instead of overwriting each other.
         for symbol, count in re.findall(r"([A-Z][a-z]{0,2})(\d*)", query):
             column = column_of(symbol)
             if column is not None:
                 query_vector[column] += int(count) if count else 1

     # Drop the previous search's mapping even when this one finds nothing.
     mapping_table_idx_dataset_idx.clear()

     query_norm = np.linalg.norm(query_vector)
     if query_norm == 0:
         return []

     # Cosine similarity needs one norm per row; the norm of the whole matrix
     # is a single scalar and would reduce the ranking to a raw dot product.
     row_norms = np.linalg.norm(dataset_index, axis=1)
     dot_products = dataset_index @ query_vector
     similarity = np.divide(
         dot_products,
         row_norms * query_norm,
         out=np.zeros_like(dot_products, dtype=float),
         where=row_norms > 0,  # all-zero rows score 0 instead of NaN
     )

     indices = np.argsort(similarity)[::-1][:top_k]
     options = [dataset[int(i)] for i in indices]
     for i, idx in enumerate(indices):
         mapping_table_idx_dataset_idx[int(i)] = int(idx)
     return options
+# Callback to update the table based on search
 @app.callback(
     Output("table", "data"),
     Input("materials-input", "submitButtonClicks"),
     Input("materials-input", "value"),
 )
 def on_submit_materials_input(n_clicks, query):
     """Fill the results table with the matches for the current query.

     Returns an empty table until the search button has been clicked at least
     once and the query is non-empty.  Otherwise returns one dict per match,
     restricted to the keys listed in ``display_columns``.
     """
     # NOTE(review): the input value is declared as an ``Input``, so after the
     # first click this presumably re-runs on every value change -- confirm
     # that is intended.
     if n_clicks is None or not query:
         return []
     return [
         {col: entry[col] for col in display_columns}
         for entry in search_materials(query)
     ]
 # Callback to display the selected material
         Output("properties-container", "children"),
     ],
     Input("display-button", "n_clicks"),
+    Input("table", "active_cell"),
 )
+def display_material(n_clicks, active_cell):
+    if n_clicks is None or not active_cell:
         return "", ""
+    idx_active = active_cell["row"]
+    row = dataset[mapping_table_idx_dataset_idx[idx_active]]
     structure = Structure(
         [x for y in row["lattice_vectors"] for x in y],
     # Extract key properties
     properties = {
+        "Material ID": row["immutable_id"],
+        "Formula": row["chemical_formula_descriptive"],
+        "Energy per atom (eV/atom)": row["energy"] / len(row["species_at_sites"]),
+        "Band Gap (eV)": row["band_gap_direct"] or row["band_gap_indirect"],
+        "Total Magnetization (μB/f.u.)": row["total_magnetization"],
     }
     # Format properties as an HTML table

create_index.py ADDED Viewed

	@@ -0,0 +1,52 @@

"""Precompute the element-count index loaded by app.py.

Every dataset row becomes one row of a dense ``(len(dataset), 118)`` array.
Columns are addressed by the position of each element when iterating
``periodictable.elements``; values are the element counts parsed from the
``chemical_formula_descriptive`` column.  The array is written to
``dataset_index.npy`` in the current working directory.
"""
import os
import re

import numpy as np
import periodictable
import tqdm
from datasets import load_dataset

HF_TOKEN = os.environ.get("HF_TOKEN")

# Width of the index; app.py builds query vectors of the same width.
INDEX_WIDTH = 118
FORMULA_COLUMN = "chemical_formula_descriptive"
# One element symbol followed by an optional integer count.
ELEMENT_COUNT_PATTERN = re.compile(r"([A-Z][a-z]?)(\d*)")

# Load only the train split of the dataset
# NOTE(review): "functional" is listed twice below; the list is kept exactly
# as it was in case it has to mirror the one in app.py -- confirm.
dataset = load_dataset(
    "LeMaterial/leDataset",
    token=HF_TOKEN,
    split="train",
    columns=[
        "lattice_vectors",
        "species_at_sites",
        "cartesian_site_positions",
        "energy",
        "energy_corrected",
        "immutable_id",
        "elements",
        "functional",
        "stress_tensor",
        "magnetic_moments",
        "forces",
        "band_gap_direct",
        "band_gap_indirect",
        "dos_ef",
        "charges",
        "functional",
        "chemical_formula_reduced",
        "chemical_formula_descriptive",
        "total_magnetization",
    ],
)

# Element symbol -> column position in the index.
map_periodic_table = {v.symbol: k for k, v in enumerate(periodictable.elements)}

dataset_index = np.zeros((len(dataset), INDEX_WIDTH))

# Only the formula column is read, so avoid decoding the other columns of
# every row while iterating.
formulas = dataset.select_columns([FORMULA_COLUMN])

for i, row in tqdm.tqdm(enumerate(formulas), total=len(formulas)):
    # Scan the whole formula instead of the first match of each
    # whitespace-separated token: this also copes with empty tokens and with
    # formulas written without spaces.
    for symbol, count in ELEMENT_COUNT_PATTERN.findall(row[FORMULA_COLUMN]):
        column = map_periodic_table.get(symbol)
        if column is None or column >= INDEX_WIDTH:
            # Fail with context rather than a bare KeyError/IndexError.
            raise ValueError(
                f"Row {i}: cannot index element {symbol!r} "
                f"from formula {row[FORMULA_COLUMN]!r}"
            )
        # Accumulate so a symbol that appears twice is not overwritten.
        dataset_index[i, column] += int(count) if count else 1

np.save("dataset_index.npy", dataset_index)

requirements.txt CHANGED Viewed

@@ -9,3 +9,4 @@ pandas
 dash-bootstrap-components
 datasets
 dash-mp-components

 dash-bootstrap-components
 datasets
 dash-mp-components
+periodictable