broadfield-dev committed on
Commit
23f8201
·
verified ·
1 Parent(s): e195296

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -33,10 +33,7 @@ def get_user_datasets(username: str):
33
  if not username:
34
  return gr.update(choices=[], value=None, interactive=False)
35
  try:
36
- # --- THIS IS THE FIX ---
37
- # Replace deprecated 'cardData=True' with 'full=True'
38
  datasets = hf_api.list_datasets(author=username, full=True)
39
-
40
  dataset_ids = [d.id for d in datasets if not d.private]
41
  return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
42
  except Exception as e:
@@ -55,20 +52,24 @@ def get_dataset_splits(dataset_id: str):
55
  return gr.update(choices=[], value=None, interactive=False)
56
 
57
  def get_split_columns(dataset_id: str, split: str):
58
- """Gets all columns for a selected split by loading one row."""
59
  if not dataset_id or not split:
60
  return gr.update(choices=[], value=None, interactive=False)
61
  try:
62
- # Stream one row to get column names without downloading the whole dataset
63
- dataset_sample = load_dataset(dataset_id, split=split, streaming=True)
64
- first_row = next(iter(dataset_sample))
65
- columns = list(first_row.keys())
 
 
66
  # Heuristically find the best text column
67
  preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
68
  best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
69
  return gr.update(choices=columns, value=best_col, interactive=True)
70
  except Exception as e:
71
- gr.Warning(f"Could not fetch columns for split '{split}'. Error: {e}")
 
 
72
  return gr.update(choices=[], value=None, interactive=False)
73
 
74
  # --- Main Atlas Generation Logic ---
 
33
  if not username:
34
  return gr.update(choices=[], value=None, interactive=False)
35
  try:
 
 
36
  datasets = hf_api.list_datasets(author=username, full=True)
 
37
  dataset_ids = [d.id for d in datasets if not d.private]
38
  return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
39
  except Exception as e:
 
52
  return gr.update(choices=[], value=None, interactive=False)
53
 
54
  def get_split_columns(dataset_id: str, split: str):
55
+ """Gets all columns for a selected split by loading its metadata."""
56
  if not dataset_id or not split:
57
  return gr.update(choices=[], value=None, interactive=False)
58
  try:
59
+ # --- THIS IS THE FIX ---
60
+ # Instead of iterating, we get the .features property from the dataset info.
61
+ # This is much faster and more reliable as it only fetches metadata.
62
+ features = load_dataset(dataset_id, split=split, streaming=True).features
63
+ columns = list(features.keys())
64
+
65
  # Heuristically find the best text column
66
  preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
67
  best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
68
  return gr.update(choices=columns, value=best_col, interactive=True)
69
  except Exception as e:
70
+ # Adding a print statement here can help debug in the terminal
71
+ print(f"Error fetching columns for {dataset_id}/{split}: {e}")
72
+ gr.Warning(f"Could not fetch columns for split '{split}'. Check if the dataset requires special access. Error: {e}")
73
  return gr.update(choices=[], value=None, interactive=False)
74
 
75
  # --- Main Atlas Generation Logic ---