broadfield-dev committed on
Commit
f8c307f
·
verified ·
1 Parent(s): 2fa8d09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -2
app.py CHANGED
@@ -4,13 +4,29 @@ from datasets import load_dataset
4
  import os
5
  import pathlib
6
  import uuid
 
7
 
8
  # --- Embedding Atlas Imports ---
9
  # We will import the necessary components directly from the library
10
  from embedding_atlas.data_source import DataSource
11
  from embedding_atlas.server import make_server
12
  from embedding_atlas.projection import compute_text_projection
13
- from embedding_atlas.utils import find_column_name, Hasher
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # --- Global State ---
16
  # We need to keep track of the mounted app to avoid errors on re-runs.
@@ -49,7 +65,7 @@ def generate_atlas(
49
  # --- 2. Sample Data (if requested) ---
50
  if sample_size > 0 and sample_size < len(df):
51
  progress(0.1, desc=f"Sampling {sample_size} rows...")
52
- df = df.sample(n=sample_size, random_state=42)
53
 
54
  # Check if the text column exists
55
  if text_column not in df.columns:
 
4
  import os
5
  import pathlib
6
  import uuid
7
+ import hashlib
8
 
9
  # --- Embedding Atlas Imports ---
10
  # We will import the necessary components directly from the library
11
  from embedding_atlas.data_source import DataSource
12
  from embedding_atlas.server import make_server
13
  from embedding_atlas.projection import compute_text_projection
14
+ # Hasher is correctly located in the utils module
15
+ from embedding_atlas.utils import Hasher
16
+
17
+ # --- Helper function from embedding_atlas/cli.py ---
18
+ # To make the script self-contained, we copy this small helper function here.
19
def find_column_name(existing_names, candidate: str) -> str:
    """Return a column name based on *candidate* that is not in *existing_names*.

    If *candidate* itself is not present in *existing_names* it is returned
    unchanged. Otherwise the suffixes ``_1``, ``_2``, ... are tried in order
    and the first name not found in *existing_names* is returned.

    Args:
        existing_names: Any container supporting the ``in`` operator that
            holds the names already in use.
        candidate: The preferred column name.

    Returns:
        ``candidate`` or ``f"{candidate}_{k}"`` for the smallest positive
        integer ``k`` whose suffixed name is not already taken.
    """
    # Guard clause: the preferred name is free, nothing else to do.
    if candidate not in existing_names:
        return candidate

    # Probe numbered suffixes starting at 1 until an unused name is found.
    index = 1
    while True:
        suffixed = f"{candidate}_{index}"
        if suffixed not in existing_names:
            return suffixed
        index += 1
30
 
31
  # --- Global State ---
32
  # We need to keep track of the mounted app to avoid errors on re-runs.
 
65
  # --- 2. Sample Data (if requested) ---
66
  if sample_size > 0 and sample_size < len(df):
67
  progress(0.1, desc=f"Sampling {sample_size} rows...")
68
+ df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
69
 
70
  # Check if the text column exists
71
  if text_column not in df.columns: