broadfield-dev committed on
Commit
47eeac1
·
verified ·
1 Parent(s): fd3f0f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -179
app.py CHANGED
@@ -176,184 +176,5 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
176
  )
177
  app.load(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
178
 
179
- if __name__ == "__main__":
180
- app.launch(debug=True)import gradio as gr
181
- import pandas as pd
182
- from datasets import load_dataset, get_dataset_split_names
183
- from huggingface_hub import HfApi
184
- import os
185
- import pathlib
186
- import uuid
187
- import logging
188
- import threading
189
- import time
190
- import socket
191
- import uvicorn
192
-
193
- # --- Setup Logging ---
194
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
195
-
196
- # --- Embedding Atlas Imports ---
197
- from embedding_atlas.data_source import DataSource
198
- from embedding_atlas.server import make_server
199
- from embedding_atlas.projection import compute_text_projection
200
- from embedding_atlas.utils import Hasher
201
-
202
- # --- Helper functions ---
203
- def find_column_name(existing_names, candidate):
204
- if candidate not in existing_names:
205
- return candidate
206
- index = 1
207
- while True:
208
- s = f"{candidate}_{index}"
209
- if s not in existing_names:
210
- return s
211
- index += 1
212
-
213
- def find_available_port(start_port: int, max_attempts: int = 100):
214
- """Finds an available TCP port on the host."""
215
- for port in range(start_port, start_port + max_attempts):
216
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
217
- if s.connect_ex(('127.0.0.1', port)) != 0:
218
- logging.info(f"Found available port: {port}")
219
- return port
220
- raise RuntimeError("Could not find an available port.")
221
-
222
- def run_atlas_server(app, port):
223
- """Target function for the background thread to run the Uvicorn server."""
224
- logging.info(f"Starting Atlas server on http://127.0.0.1:{port}")
225
- uvicorn.run(app, host="127.0.0.1", port=port, log_level="warning")
226
-
227
- # --- Hugging Face API Helpers ---
228
- hf_api = HfApi()
229
-
230
- def get_user_datasets(username: str):
231
- logging.info(f"Fetching datasets for user: {username}")
232
- if not username: return gr.update(choices=[], value=None, interactive=False)
233
- try:
234
- datasets = hf_api.list_datasets(author=username, full=True)
235
- dataset_ids = [d.id for d in datasets if not d.private]
236
- logging.info(f"Found {len(dataset_ids)} datasets.")
237
- return gr.update(choices=sorted(dataset_ids), value=None, interactive=True)
238
- except Exception as e:
239
- logging.error(f"Failed to fetch datasets: {e}")
240
- return gr.update(choices=[], value=None, interactive=False)
241
-
242
- def get_dataset_splits(dataset_id: str):
243
- logging.info(f"Fetching splits for: {dataset_id}")
244
- if not dataset_id: return gr.update(choices=[], value=None, interactive=False)
245
- try:
246
- splits = get_dataset_split_names(dataset_id)
247
- logging.info(f"Found splits: {splits}")
248
- return gr.update(choices=splits, value=splits[0] if splits else None, interactive=True)
249
- except Exception as e:
250
- logging.error(f"Failed to fetch splits: {e}")
251
- return gr.update(choices=[], value=None, interactive=False)
252
-
253
- def get_split_columns(dataset_id: str, split: str):
254
- logging.info(f"Fetching columns for: {dataset_id}/{split}")
255
- if not dataset_id or not split: return gr.update(choices=[], value=None, interactive=False)
256
- try:
257
- dataset_sample = load_dataset(dataset_id, split=split, streaming=True)
258
- first_row = next(iter(dataset_sample))
259
- columns = list(first_row.keys())
260
- logging.info(f"Found columns: {columns}")
261
- preferred_cols = ['text', 'content', 'instruction', 'question', 'document', 'prompt']
262
- best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
263
- return gr.update(choices=columns, value=best_col, interactive=True)
264
- except Exception as e:
265
- logging.error(f"Failed to get columns: {e}", exc_info=True)
266
- return gr.update(choices=[], value=None, interactive=False)
267
-
268
- # --- Main Atlas Generation Logic ---
269
- def generate_atlas(
270
- dataset_name: str,
271
- split: str,
272
- text_column: str,
273
- sample_size: int,
274
- model_name: str,
275
- umap_neighbors: int,
276
- umap_min_dist: float,
277
- progress=gr.Progress(track_tqdm=True)
278
- ):
279
- if not all([dataset_name, split, text_column]):
280
- raise gr.Error("Please ensure a Dataset, Split, and Text Column are selected.")
281
-
282
- progress(0, desc="Loading dataset...")
283
- df = load_dataset(dataset_name, split=split).to_pandas()
284
- if sample_size > 0 and sample_size < len(df):
285
- df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
286
-
287
- progress(0.2, desc="Computing embeddings and UMAP...")
288
- x_col = find_column_name(df.columns, "projection_x")
289
- y_col = find_column_name(df.columns, "projection_y")
290
- neighbors_col = find_column_name(df.columns, "__neighbors")
291
- compute_text_projection(
292
- df, text_column, x=x_col, y=y_col, neighbors=neighbors_col, model=model_name,
293
- umap_args={"n_neighbors": umap_neighbors, "min_dist": umap_min_dist, "metric": "cosine", "random_state": 42},
294
- )
295
-
296
- progress(0.8, desc="Preparing Atlas data source...")
297
- id_col = find_column_name(df.columns, "_row_index")
298
- df[id_col] = range(df.shape[0])
299
- metadata = {"columns": {"id": id_col, "text": text_column, "embedding": {"x": x_col, "y": y_col}, "neighbors": neighbors_col}}
300
- hasher = Hasher()
301
- hasher.update(f"{dataset_name}-{split}-{text_column}-{sample_size}-{model_name}-{uuid.uuid4()}")
302
- identifier = hasher.hexdigest()
303
- atlas_dataset = DataSource(identifier, df, metadata)
304
-
305
- progress(0.9, desc="Starting Atlas server...")
306
- static_path = str((pathlib.Path(__import__('embedding_atlas').__file__).parent / "static").resolve())
307
- atlas_app = make_server(atlas_dataset, static_path=static_path, duckdb_uri="wasm")
308
-
309
- # Find an open port and run the server in a background thread
310
- port = find_available_port(start_port=8001)
311
- thread = threading.Thread(target=run_atlas_server, args=(atlas_app, port), daemon=True)
312
- thread.start()
313
-
314
- # Give the server a moment to start up
315
- time.sleep(2)
316
-
317
- iframe_html = f"<iframe src='http://127.0.0.1:{port}' width='100%' height='800px' frameborder='0'></iframe>"
318
- return gr.HTML(iframe_html)
319
-
320
- # --- Gradio UI Definition ---
321
- with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
322
- # UI elements...
323
- gr.Markdown("# Embedding Atlas Explorer")
324
- # ... (rest of the UI is the same as before) ...
325
- with gr.Row():
326
- with gr.Column(scale=1):
327
- gr.Markdown("### 1. Select Data")
328
- hf_user_input = gr.Textbox(label="Hugging Face User or Org Name", value="Trendyol", placeholder="e.g., 'gradio' or 'google'")
329
- dataset_input = gr.Dropdown(label="Select a Dataset", interactive=False)
330
- split_input = gr.Dropdown(label="Select a Split", interactive=False)
331
- text_column_input = gr.Dropdown(label="Select a Text Column", interactive=False)
332
-
333
- gr.Markdown("### 2. Configure Visualization")
334
- sample_size_input = gr.Slider(label="Number of Samples", minimum=0, maximum=10000, value=2000, step=100)
335
-
336
- with gr.Accordion("Advanced Settings", open=False):
337
- model_input = gr.Dropdown(label="Embedding Model", choices=["all-MiniLM-L6-v2", "all-mpnet-base-v2", "multi-qa-MiniLM-L6-cos-v1"], value="all-MiniLM-L6-v2")
338
- umap_neighbors_input = gr.Slider(label="UMAP Neighbors", minimum=2, maximum=100, value=15, step=1, info="Controls local vs. global structure.")
339
- umap_min_dist_input = gr.Slider(label="UMAP Min Distance", minimum=0.0, maximum=0.99, value=0.1, step=0.01, info="Controls how tightly points are packed.")
340
-
341
- generate_button = gr.Button("Generate Atlas", variant="primary")
342
-
343
- with gr.Column(scale=3):
344
- gr.Markdown("### 3. Explore Atlas")
345
- output_html = gr.HTML("<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>")
346
-
347
- # --- Event Listeners ---
348
- hf_user_input.submit(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
349
- dataset_input.change(fn=get_dataset_splits, inputs=dataset_input, outputs=split_input)
350
- split_input.change(fn=get_split_columns, inputs=[dataset_input, split_input], outputs=text_column_input)
351
- generate_button.click(
352
- fn=generate_atlas,
353
- inputs=[dataset_input, split_input, text_column_input, sample_size_input, model_input, umap_neighbors_input, umap_min_dist_input],
354
- outputs=[output_html],
355
- )
356
- app.load(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
357
-
358
  if __name__ == "__main__":
359
  app.launch(debug=True)
 
176
  )
177
  app.load(fn=get_user_datasets, inputs=hf_user_input, outputs=dataset_input)
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  if __name__ == "__main__":
180
  app.launch(debug=True)