Spaces:

broadfield-dev
/

Embedding-Atlas

Sleeping

broadfield-dev committed on 9 days ago

Commit

f8c307f

verified ·

1 Parent(s): 2fa8d09

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,13 +4,29 @@ from datasets import load_dataset
 import os
 import pathlib
 import uuid
 # --- Embedding Atlas Imports ---
 # We will import the necessary components directly from the library
 from embedding_atlas.data_source import DataSource
 from embedding_atlas.server import make_server
 from embedding_atlas.projection import compute_text_projection
-from embedding_atlas.utils import find_column_name, Hasher
 # --- Global State ---
 # We need to keep track of the mounted app to avoid errors on re-runs.
@@ -49,7 +65,7 @@ def generate_atlas(
     # --- 2. Sample Data (if requested) ---
     if sample_size > 0 and sample_size < len(df):
         progress(0.1, desc=f"Sampling {sample_size} rows...")
-        df = df.sample(n=sample_size, random_state=42)
     # Check if the text column exists
     if text_column not in df.columns:

 import os
 import pathlib
 import uuid
+import hashlib
 # --- Embedding Atlas Imports ---
 # We will import the necessary components directly from the library
 from embedding_atlas.data_source import DataSource
 from embedding_atlas.server import make_server
 from embedding_atlas.projection import compute_text_projection
+# Hasher is correctly located in the utils module
+from embedding_atlas.utils import Hasher
+# --- Helper function from embedding_atlas/cli.py ---
+# To make the script self-contained, we copy this small helper function here.
+def find_column_name(existing_names, candidate):  # existing_names only needs to support `in`
+    """Finds a unique column name, appending '_1', '_2', etc. if the candidate name already exists."""
+    if candidate not in existing_names:  # fast path: the requested name is free, use it unchanged
+        return candidate
+    else:
+        index = 1  # numeric suffixes start at 1
+        while True:  # unbounded probe; the only exit is the return below
+            s = f"{candidate}_{index}"  # next name to try: "<candidate>_<index>"
+            if s not in existing_names:
+                return s  # first suffixed name that does not collide
+            index += 1
 # --- Global State ---
 # We need to keep track of the mounted app to avoid errors on re-runs.
     # --- 2. Sample Data (if requested) ---
     if sample_size > 0 and sample_size < len(df):
         progress(0.1, desc=f"Sampling {sample_size} rows...")
+        df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
     # Check if the text column exists
     if text_column not in df.columns: