Spaces:

broadfield-dev
/

Embedding-Atlas

Sleeping

App Files Community

broadfield-dev committed 9 days ago

Commit

808b711

verified ·

1 Parent(s): 23f8201

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -35

app.py CHANGED Viewed

@@ -46,6 +46,7 @@ def get_dataset_splits(dataset_id: str):
         return gr.update(choices=[], value=None, interactive=False)
     try:
         splits = get_dataset_split_names(dataset_id)
         return gr.update(choices=splits, value=splits[0] if splits else None, interactive=True)
     except Exception as e:
         gr.Warning(f"Could not fetch splits for dataset '{dataset_id}'. Error: {e}")
@@ -56,9 +57,7 @@ def get_split_columns(dataset_id: str, split: str):
     if not dataset_id or not split:
         return gr.update(choices=[], value=None, interactive=False)
     try:
-        # --- THIS IS THE FIX ---
-        # Instead of iterating, we get the .features property from the dataset info.
-        # This is much faster and more reliable as it only fetches metadata.
         features = load_dataset(dataset_id, split=split, streaming=True).features
         columns = list(features.keys())
@@ -67,9 +66,7 @@ def get_split_columns(dataset_id: str, split: str):
         best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
         return gr.update(choices=columns, value=best_col, interactive=True)
     except Exception as e:
-        # Adding a print statement here can help debug in the terminal
-        print(f"Error fetching columns for {dataset_id}/{split}: {e}")
-        gr.Warning(f"Could not fetch columns for split '{split}'. Check if the dataset requires special access. Error: {e}")
         return gr.update(choices=[], value=None, interactive=False)
 # --- Main Atlas Generation Logic ---
@@ -103,7 +100,7 @@ def generate_atlas(
     if text_column not in df.columns:
         raise gr.Error(f"Column '{text_column}' not found. Please select a valid column.")
-    progress(0.2, desc="Computing embeddings and UMAP. This may take a while...")
     x_col = find_column_name(df.columns, "projection_x")
     y_col = find_column_name(df.columns, "projection_y")
@@ -115,15 +112,13 @@ def generate_atlas(
             umap_args={"n_neighbors": umap_neighbors, "min_dist": umap_min_dist, "metric": "cosine", "random_state": 42},
         )
     except Exception as e:
-        raise gr.Error(f"Failed to compute embeddings. Check model name or try a smaller sample. Error: {e}")
     progress(0.8, desc="Preparing Atlas data source...")
     id_col = find_column_name(df.columns, "_row_index")
     df[id_col] = range(df.shape[0])
-    metadata = {
-        "columns": {"id": id_col, "text": text_column, "embedding": {"x": x_col, "y": y_col}, "neighbors": neighbors_col},
-    }
     hasher = Hasher()
     hasher.update(f"{dataset_name}-{split}-{text_column}-{sample_size}-{model_name}")
     identifier = hasher.hexdigest()
@@ -143,10 +138,7 @@ def generate_atlas(
 # --- Gradio UI Definition ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
     gr.Markdown("# Embedding Atlas Explorer")
-    gr.Markdown(
-        "Interactively select and visualize any text-based dataset from the Hugging Face Hub. "
-        "The app computes embeddings and projects them into a 2D map for exploration."
-    )
     with gr.Row():
         with gr.Column(scale=1):
@@ -171,34 +163,23 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
             output_html = gr.HTML("<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>")
     # --- Chained Event Listeners for Dynamic UI ---
-    hf_user_input.submit(
-        fn=get_user_datasets,
-        inputs=[hf_user_input],
-        outputs=[dataset_input]
-    )
-    dataset_input.select(
-        fn=get_dataset_splits,
-        inputs=[dataset_input],
-        outputs=[split_input]
-    )
-    split_input.select(
-        fn=get_split_columns,
-        inputs=[dataset_input, split_input],
-        outputs=[text_column_input]
-    )
     # --- Button Click Event ---
     generate_button.click(
         fn=generate_atlas,
-        inputs=[
-            dataset_input, split_input, text_column_input,
-            sample_size_input, model_input, umap_neighbors_input, umap_min_dist_input
-        ],
         outputs=[output_html],
     )
     # Load initial example data on app load
-    app.load(fn=get_user_datasets, inputs=[hf_user_input], outputs=[dataset_input])
 if __name__ == "__main__":
     app.launch(debug=True)

         return gr.update(choices=[], value=None, interactive=False)
     try:
         splits = get_dataset_split_names(dataset_id)
+        # Set the first split as the default value to trigger the next event
         return gr.update(choices=splits, value=splits[0] if splits else None, interactive=True)
     except Exception as e:
         gr.Warning(f"Could not fetch splits for dataset '{dataset_id}'. Error: {e}")
     if not dataset_id or not split:
         return gr.update(choices=[], value=None, interactive=False)
     try:
+        # Get the .features property from the dataset info.
         features = load_dataset(dataset_id, split=split, streaming=True).features
         columns = list(features.keys())
         best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
         return gr.update(choices=columns, value=best_col, interactive=True)
     except Exception as e:
+        gr.Warning(f"Could not fetch columns for split '{split}'. Error: {e}")
         return gr.update(choices=[], value=None, interactive=False)
 # --- Main Atlas Generation Logic ---
     if text_column not in df.columns:
         raise gr.Error(f"Column '{text_column}' not found. Please select a valid column.")
+    progress(0.2, desc="Computing embeddings and UMAP...")
     x_col = find_column_name(df.columns, "projection_x")
     y_col = find_column_name(df.columns, "projection_y")
             umap_args={"n_neighbors": umap_neighbors, "min_dist": umap_min_dist, "metric": "cosine", "random_state": 42},
         )
     except Exception as e:
+        raise gr.Error(f"Failed to compute embeddings. Check model name or sample size. Error: {e}")
     progress(0.8, desc="Preparing Atlas data source...")
     id_col = find_column_name(df.columns, "_row_index")
     df[id_col] = range(df.shape[0])
+    metadata = {"columns": {"id": id_col, "text": text_column, "embedding": {"x": x_col, "y": y_col}, "neighbors": neighbors_col}}
     hasher = Hasher()
     hasher.update(f"{dataset_name}-{split}-{text_column}-{sample_size}-{model_name}")
     identifier = hasher.hexdigest()
 # --- Gradio UI Definition ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
     gr.Markdown("# Embedding Atlas Explorer")
+    gr.Markdown("Interactively select and visualize any text-based dataset from the Hugging Face Hub.")
     with gr.Row():
         with gr.Column(scale=1):
             output_html = gr.HTML("<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>")
     # --- Chained Event Listeners for Dynamic UI ---
+    # When the user submits a name, get their datasets
+    hf_user_input.submit(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
+    # --- THIS IS THE FIX ---
+    # Use .change() so that when a dataset is selected (by user OR another function), it triggers the next step.
+    dataset_input.change(fn=get_dataset_splits, inputs=dataset_input, outputs=split_input)
+    split_input.change(fn=get_split_columns, inputs=[dataset_input, split_input], outputs=text_column_input)
     # --- Button Click Event ---
     generate_button.click(
         fn=generate_atlas,
+        inputs=[dataset_input, split_input, text_column_input, sample_size_input, model_input, umap_neighbors_input, umap_min_dist_input],
         outputs=[output_html],
     )
     # Load initial example data on app load
+    app.load(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
 if __name__ == "__main__":
     app.launch(debug=True)