Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,13 +4,29 @@ from datasets import load_dataset
|
|
4 |
import os
|
5 |
import pathlib
|
6 |
import uuid
|
|
|
7 |
|
8 |
# --- Embedding Atlas Imports ---
|
9 |
# We will import the necessary components directly from the library
|
10 |
from embedding_atlas.data_source import DataSource
|
11 |
from embedding_atlas.server import make_server
|
12 |
from embedding_atlas.projection import compute_text_projection
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# --- Global State ---
|
16 |
# We need to keep track of the mounted app to avoid errors on re-runs.
|
@@ -49,7 +65,7 @@ def generate_atlas(
|
|
49 |
# --- 2. Sample Data (if requested) ---
|
50 |
if sample_size > 0 and sample_size < len(df):
|
51 |
progress(0.1, desc=f"Sampling {sample_size} rows...")
|
52 |
-
df = df.sample(n=sample_size, random_state=42)
|
53 |
|
54 |
# Check if the text column exists
|
55 |
if text_column not in df.columns:
|
|
|
4 |
import os
|
5 |
import pathlib
|
6 |
import uuid
|
7 |
+
import hashlib
|
8 |
|
9 |
# --- Embedding Atlas Imports ---
|
10 |
# We will import the necessary components directly from the library
|
11 |
from embedding_atlas.data_source import DataSource
|
12 |
from embedding_atlas.server import make_server
|
13 |
from embedding_atlas.projection import compute_text_projection
|
14 |
+
# Hasher is imported from the embedding_atlas.utils module.
|
15 |
+
from embedding_atlas.utils import Hasher
|
16 |
+
|
17 |
+
# --- Helper function from embedding_atlas/cli.py ---
|
18 |
+
# To make the script self-contained, we copy this small helper function here.
|
19 |
+
def find_column_name(existing_names, candidate):
    """Return a column name based on ``candidate`` that is not already taken.

    If ``candidate`` is not in ``existing_names`` it is returned unchanged.
    Otherwise the suffixes ``_1``, ``_2``, ... are tried in increasing order
    and the first name that is not in ``existing_names`` is returned.

    Args:
        existing_names: Any object supporting the ``in`` operator for the
            names already in use (for example a list or set of strings).
        candidate: The preferred column name.

    Returns:
        ``candidate`` itself, or ``f"{candidate}_{k}"`` for the smallest
        positive integer ``k`` whose name is unused.
    """
    # Fast path: the preferred name is free, so no suffix is needed.
    if candidate not in existing_names:
        return candidate

    # Probe numbered suffixes starting at 1 until an unused name is found.
    index = 1
    while f"{candidate}_{index}" in existing_names:
        index += 1
    return f"{candidate}_{index}"
|
30 |
|
31 |
# --- Global State ---
|
32 |
# We need to keep track of the mounted app to avoid errors on re-runs.
|
|
|
65 |
# --- 2. Sample Data (if requested) ---
|
66 |
if sample_size > 0 and sample_size < len(df):
|
67 |
progress(0.1, desc=f"Sampling {sample_size} rows...")
|
68 |
+
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
|
69 |
|
70 |
# Check if the text column exists
|
71 |
if text_column not in df.columns:
|