Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,14 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
-
from datasets import load_dataset, get_dataset_split_names
|
4 |
from huggingface_hub import HfApi
|
5 |
import os
|
6 |
import pathlib
|
7 |
import uuid
|
|
|
|
|
|
|
|
|
8 |
|
9 |
# --- Embedding Atlas Imports ---
|
10 |
from embedding_atlas.data_source import DataSource
|
@@ -30,45 +34,54 @@ hf_api = HfApi()
|
|
30 |
|
31 |
def get_user_datasets(username: str):
|
32 |
"""Fetches all public datasets for a given username or organization."""
|
|
|
33 |
if not username:
|
34 |
return gr.update(choices=[], value=None, interactive=False)
|
35 |
try:
|
36 |
datasets = hf_api.list_datasets(author=username, full=True)
|
37 |
dataset_ids = [d.id for d in datasets if not d.private]
|
|
|
38 |
return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
|
39 |
except Exception as e:
|
40 |
-
|
|
|
41 |
return gr.update(choices=[], value=None, interactive=False)
|
42 |
|
43 |
def get_dataset_splits(dataset_id: str):
|
44 |
"""Gets all available splits for a selected dataset."""
|
|
|
45 |
if not dataset_id:
|
46 |
return gr.update(choices=[], value=None, interactive=False)
|
47 |
try:
|
48 |
-
# --- FIX: Removed trust_remote_code=True ---
|
49 |
splits = get_dataset_split_names(dataset_id)
|
|
|
50 |
return gr.update(choices=splits, value=splits[0] if splits else None, interactive=True)
|
51 |
except Exception as e:
|
52 |
-
|
|
|
53 |
return gr.update(choices=[], value=None, interactive=False)
|
54 |
|
55 |
-
def get_split_columns(dataset_id: str):
|
56 |
-
"""Gets all columns for a selected
|
57 |
-
|
|
|
58 |
return gr.update(choices=[], value=None, interactive=False)
|
59 |
try:
|
60 |
-
#
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
columns
|
65 |
|
|
|
66 |
preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
|
67 |
best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
|
|
|
68 |
|
69 |
return gr.update(choices=columns, value=best_col, interactive=True)
|
70 |
except Exception as e:
|
71 |
-
|
|
|
72 |
return gr.update(choices=[], value=None, interactive=False)
|
73 |
|
74 |
# --- Main Atlas Generation Logic ---
|
@@ -90,9 +103,7 @@ def generate_atlas(
|
|
90 |
|
91 |
progress(0, desc=f"Loading dataset '{dataset_name}' [{split}]...")
|
92 |
try:
|
93 |
-
|
94 |
-
# It's less likely to crash here than in the metadata functions.
|
95 |
-
dataset = load_dataset(dataset_name, split=split, trust_remote_code=True)
|
96 |
df = dataset.to_pandas()
|
97 |
except Exception as e:
|
98 |
raise gr.Error(f"Failed to load data. Error: {e}")
|
@@ -166,11 +177,13 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
|
|
166 |
gr.Markdown("### 3. Explore Atlas")
|
167 |
output_html = gr.HTML("<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>")
|
168 |
|
169 |
-
# --- Chained Event Listeners for Dynamic UI ---
|
170 |
hf_user_input.submit(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
|
171 |
|
172 |
dataset_input.change(fn=get_dataset_splits, inputs=dataset_input, outputs=split_input)
|
173 |
-
|
|
|
|
|
174 |
|
175 |
# --- Button Click Event ---
|
176 |
generate_button.click(
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
+
from datasets import load_dataset, get_dataset_split_names
|
4 |
from huggingface_hub import HfApi
|
5 |
import os
|
6 |
import pathlib
|
7 |
import uuid
|
8 |
+
import logging
|
9 |
+
|
10 |
+
# --- Setup Logging ---
|
11 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
12 |
|
13 |
# --- Embedding Atlas Imports ---
|
14 |
from embedding_atlas.data_source import DataSource
|
|
|
34 |
|
35 |
def get_user_datasets(username: str):
    """Fetch all public datasets for a given username or organization.

    Args:
        username: Hugging Face user or organization name.

    Returns:
        A ``gr.update`` for the dataset dropdown: populated and interactive
        on success; cleared and disabled on empty input or API failure.
    """
    # Guard first so an empty textbox does not trigger an API call or log noise.
    if not username:
        return gr.update(choices=[], value=None, interactive=False)
    # Lazy %-style args: the message is only formatted if the record is emitted.
    logging.info("Fetching datasets for user: %s", username)
    try:
        datasets = hf_api.list_datasets(author=username, full=True)
        dataset_ids = [d.id for d in datasets if not d.private]
        logging.info("Found %d datasets for %s.", len(dataset_ids), username)
        return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
    except Exception:
        # logging.exception records the full traceback, unlike logging.error.
        logging.exception("Failed to fetch datasets for %s", username)
        gr.Warning(f"Could not fetch datasets for user '{username}'.")
        return gr.update(choices=[], value=None, interactive=False)
|
49 |
|
50 |
def get_dataset_splits(dataset_id: str):
    """Get all available splits for a selected dataset.

    Args:
        dataset_id: Fully qualified dataset id (e.g. ``"user/dataset"``).

    Returns:
        A ``gr.update`` for the split dropdown: first split pre-selected on
        success; cleared and disabled on empty input or lookup failure.
    """
    # Guard first so clearing the dataset dropdown does not log or hit the Hub.
    if not dataset_id:
        return gr.update(choices=[], value=None, interactive=False)
    # Lazy %-style args: the message is only formatted if the record is emitted.
    logging.info("Fetching splits for dataset: %s", dataset_id)
    try:
        splits = get_dataset_split_names(dataset_id)
        logging.info("Found splits for %s: %s", dataset_id, splits)
        return gr.update(choices=splits, value=splits[0] if splits else None, interactive=True)
    except Exception:
        # logging.exception records the full traceback, unlike logging.error.
        logging.exception("Failed to fetch splits for %s", dataset_id)
        gr.Warning(f"Could not fetch splits for dataset '{dataset_id}'.")
        return gr.update(choices=[], value=None, interactive=False)
|
63 |
|
64 |
+
def get_split_columns(dataset_id: str, split: str):
    """Get all columns for a selected split by streaming a single row.

    Args:
        dataset_id: Fully qualified dataset id (e.g. ``"user/dataset"``).
        split: Name of the split to inspect.

    Returns:
        A ``gr.update`` for the text-column dropdown with a heuristically
        chosen default; cleared and disabled on missing input or failure.
    """
    if not dataset_id or not split:
        return gr.update(choices=[], value=None, interactive=False)
    # Lazy %-style args: the message is only formatted if the record is emitted.
    logging.info("Fetching columns for: %s, split: %s", dataset_id, split)
    try:
        # Most robust method: stream one row and read its keys — avoids
        # downloading the whole split just to learn the schema.
        dataset_sample = load_dataset(dataset_id, split=split, streaming=True)
        # next(..., None) avoids an uninformative bare StopIteration when the
        # split has no rows; raise a clear error for the user instead.
        first_row = next(iter(dataset_sample), None)
        if first_row is None:
            raise ValueError(f"Split '{split}' contains no rows.")
        columns = list(first_row.keys())
        logging.info("Found columns: %s", columns)

        # Heuristically pick the most likely free-text column as the default.
        preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
        best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
        logging.info("Best default column chosen: %s", best_col)

        return gr.update(choices=columns, value=best_col, interactive=True)
    except Exception as e:
        # logging.exception records the full traceback (supersedes exc_info=True).
        logging.exception("Failed to get columns for %s/%s", dataset_id, split)
        gr.Warning(f"Could not fetch columns for split '{split}'. Error: {e}")
        return gr.update(choices=[], value=None, interactive=False)
|
86 |
|
87 |
# --- Main Atlas Generation Logic ---
|
|
|
103 |
|
104 |
progress(0, desc=f"Loading dataset '{dataset_name}' [{split}]...")
|
105 |
try:
|
106 |
+
dataset = load_dataset(dataset_name, split=split)
|
|
|
|
|
107 |
df = dataset.to_pandas()
|
108 |
except Exception as e:
|
109 |
raise gr.Error(f"Failed to load data. Error: {e}")
|
|
|
177 |
gr.Markdown("### 3. Explore Atlas")
|
178 |
output_html = gr.HTML("<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>")
|
179 |
|
180 |
+
# --- Chained Event Listeners for Dynamic UI (CORRECTED LOGIC) ---
|
181 |
hf_user_input.submit(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
|
182 |
|
183 |
dataset_input.change(fn=get_dataset_splits, inputs=dataset_input, outputs=split_input)
|
184 |
+
|
185 |
+
# This is the critical fix: The columns are populated only AFTER a split is chosen.
|
186 |
+
split_input.change(fn=get_split_columns, inputs=[dataset_input, split_input], outputs=text_column_input)
|
187 |
|
188 |
# --- Button Click Event ---
|
189 |
generate_button.click(
|