Spaces:

broadfield-dev
/

Embedding-Atlas

Sleeping

App Files Community

broadfield-dev committed on 9 days ago

Commit

23f8201

verified ·

1 Parent(s): e195296

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -9

app.py CHANGED Viewed

@@ -33,10 +33,7 @@ def get_user_datasets(username: str):
     if not username:
         return gr.update(choices=[], value=None, interactive=False)
     try:
-        # --- THIS IS THE FIX ---
-        # Replace deprecated 'cardData=True' with 'full=True'
         datasets = hf_api.list_datasets(author=username, full=True)
         dataset_ids = [d.id for d in datasets if not d.private]
         return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
     except Exception as e:
@@ -55,20 +52,24 @@ def get_dataset_splits(dataset_id: str):
         return gr.update(choices=[], value=None, interactive=False)
 def get_split_columns(dataset_id: str, split: str):
-    """Gets all columns for a selected split by loading one row."""
     if not dataset_id or not split:
         return gr.update(choices=[], value=None, interactive=False)
     try:
-        # Stream one row to get column names without downloading the whole dataset
-        dataset_sample = load_dataset(dataset_id, split=split, streaming=True)
-        first_row = next(iter(dataset_sample))
-        columns = list(first_row.keys())
         # Heuristically find the best text column
         preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
         best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
         return gr.update(choices=columns, value=best_col, interactive=True)
     except Exception as e:
-        gr.Warning(f"Could not fetch columns for split '{split}'. Error: {e}")
         return gr.update(choices=[], value=None, interactive=False)
 # --- Main Atlas Generation Logic ---

     if not username:
         return gr.update(choices=[], value=None, interactive=False)
     try:
         datasets = hf_api.list_datasets(author=username, full=True)
         dataset_ids = [d.id for d in datasets if not d.private]
         return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
     except Exception as e:
         return gr.update(choices=[], value=None, interactive=False)
 def get_split_columns(dataset_id: str, split: str):
+    """Gets all columns for a selected split by loading its metadata."""
     if not dataset_id or not split:
         return gr.update(choices=[], value=None, interactive=False)
     try:
+        # --- THIS IS THE FIX ---
+        # Instead of iterating, we get the .features property from the dataset info.
+        # This is much faster and more reliable as it only fetches metadata.
+        features = load_dataset(dataset_id, split=split, streaming=True).features
+        columns = list(features.keys())
         # Heuristically find the best text column
         preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
         best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
         return gr.update(choices=columns, value=best_col, interactive=True)
     except Exception as e:
+        # Adding a print statement here can help debug in the terminal
+        print(f"Error fetching columns for {dataset_id}/{split}: {e}")
+        gr.Warning(f"Could not fetch columns for split '{split}'. Check if the dataset requires special access. Error: {e}")
         return gr.update(choices=[], value=None, interactive=False)
 # --- Main Atlas Generation Logic ---