Spaces:

broadfield-dev
/

Embedding-Atlas

Sleeping

App Files Files Community

broadfield-dev committed 9 days ago

Commit

701c41c

verified ·

1 Parent(s): f8c307f

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -121

app.py CHANGED Viewed

@@ -1,21 +1,18 @@
 import gradio as gr
 import pandas as pd
-from datasets import load_dataset
 import os
 import pathlib
 import uuid
-import hashlib
 # --- Embedding Atlas Imports ---
-# We will import the necessary components directly from the library
 from embedding_atlas.data_source import DataSource
 from embedding_atlas.server import make_server
 from embedding_atlas.projection import compute_text_projection
-# Hasher is correctly located in the utils module
 from embedding_atlas.utils import Hasher
 # --- Helper function from embedding_atlas/cli.py ---
-# To make the script self-contained, we copy this small helper function here.
 def find_column_name(existing_names, candidate):
     """Finds a unique column name, appending '_1', '_2', etc. if the candidate name already exists."""
     if candidate not in existing_names:
@@ -28,20 +25,54 @@ def find_column_name(existing_names, candidate):
                 return s
             index += 1
-# --- Global State ---
-# We need to keep track of the mounted app to avoid errors on re-runs.
-# A dictionary to store unique app instances for each run.
-mounted_apps = {}
-def get_atlas_static_path():
-    """Finds the path to the static files for the embedding-atlas frontend."""
-    import embedding_atlas
-    return str((pathlib.Path(embedding_atlas.__file__).parent / "static").resolve())
 def generate_atlas(
     dataset_name: str,
-    text_column: str,
     split: str,
     sample_size: int,
     model_name: str,
     umap_neighbors: int,
@@ -51,169 +82,123 @@ def generate_atlas(
     """
     Loads data, computes embeddings, and serves the Embedding Atlas UI.
     """
-    global mounted_apps
-    # --- 1. Load Data ---
-    progress(0, desc=f"Loading dataset '{dataset_name}'...")
     try:
-        # Load the dataset from Hugging Face
         dataset = load_dataset(dataset_name, split=split)
         df = dataset.to_pandas()
     except Exception as e:
-        raise gr.Error(f"Failed to load dataset. Please check the name and split. Error: {e}")
-    # --- 2. Sample Data (if requested) ---
     if sample_size > 0 and sample_size < len(df):
         progress(0.1, desc=f"Sampling {sample_size} rows...")
         df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
-    # Check if the text column exists
     if text_column not in df.columns:
-        raise gr.Error(f"Column '{text_column}' not found in the dataset. Available columns: {', '.join(df.columns)}")
-    # --- 3. Compute Embeddings & UMAP Projection ---
-    progress(0.2, desc="Computing embeddings and UMAP projection. This may take a while...")
-    x_column = find_column_name(df.columns, "projection_x")
-    y_column = find_column_name(df.columns, "projection_y")
-    neighbors_column = find_column_name(df.columns, "__neighbors")
     try:
         compute_text_projection(
-            df,
-            text_column,
-            x=x_column,
-            y=y_column,
-            neighbors=neighbors_column,
-            model=model_name,
-            umap_args={
-                "n_neighbors": umap_neighbors,
-                "min_dist": umap_min_dist,
-                "metric": "cosine",
-                "random_state": 42,
-            },
         )
     except Exception as e:
         raise gr.Error(f"Failed to compute embeddings. Check model name or try a smaller sample. Error: {e}")
-    # --- 4. Prepare Atlas DataSource ---
     progress(0.8, desc="Preparing Atlas data source...")
-    id_column = find_column_name(df.columns, "_row_index")
-    df[id_column] = range(df.shape[0])
     metadata = {
-        "columns": {
-            "id": id_column,
-            "text": text_column,
-            "embedding": {"x": x_column, "y": y_column},
-            "neighbors": neighbors_column,
-        },
     }
-    # Create a unique identifier for the dataset to avoid conflicts
     hasher = Hasher()
-    hasher.update(f"{dataset_name}-{text_column}-{sample_size}-{model_name}")
     identifier = hasher.hexdigest()
     atlas_dataset = DataSource(identifier, df, metadata)
-    static_path = get_atlas_static_path()
-    # --- 5. Create and Mount the FastAPI App ---
     progress(0.9, desc="Mounting visualization UI...")
-    # Generate a unique path for this instance to avoid conflicts on remounting
     mount_path = f"/{uuid.uuid4().hex}"
-    # Create the server instance
     atlas_app = make_server(atlas_dataset, static_path=static_path, duckdb_uri="wasm")
-    # The `blocks` object is global in the Gradio script context.
-    # We mount the atlas server onto the main Gradio FastAPI app.
-    gr.mount_gradio_app(app, atlas_app, path=mount_path)
-    mounted_apps[mount_path] = atlas_app # Store it for potential cleanup later
     progress(1.0, desc="Done!")
-    # --- 6. Return an IFrame pointing to the mounted path ---
     iframe_html = f"<iframe src='{mount_path}' width='100%' height='800px' frameborder='0'></iframe>"
     return gr.HTML(iframe_html)
 # --- Gradio UI Definition ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
     gr.Markdown("# Embedding Atlas Explorer")
     gr.Markdown(
-        "Visualize any text column from a Hugging Face dataset. This app loads the data, "
-        "computes embeddings using Sentence Transformers, reduces dimensionality with UMAP, "
-        "and displays the result in an interactive Embedding Atlas."
     )
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### 1. Configuration")
-            dataset_input = gr.Textbox(
-                label="Hugging Face Dataset Name",
-                value="Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset"
-            )
-            text_column_input = gr.Textbox(
-                label="Text Column to Visualize",
-                value="instruction"
-            )
-            split_input = gr.Textbox(label="Dataset Split", value="train")
-            sample_size_input = gr.Slider(
-                label="Number of Samples (0 for all)",
-                minimum=0,
-                maximum=5000,
-                value=2000,
-                step=100
-            )
             with gr.Accordion("Advanced Settings", open=False):
-                model_input = gr.Dropdown(
-                    label="Embedding Model",
-                    choices=["all-MiniLM-L6-v2", "all-mpnet-base-v2", "multi-qa-MiniLM-L6-cos-v1"],
-                    value="all-MiniLM-L6-v2",
-                )
-                umap_neighbors_input = gr.Slider(
-                    label="UMAP Neighbors",
-                    minimum=2,
-                    maximum=100,
-                    value=15,
-                    step=1,
-                    info="Controls how UMAP balances local vs. global structure."
-                )
-                umap_min_dist_input = gr.Slider(
-                    label="UMAP Min Distance",
-                    minimum=0.0,
-                    maximum=0.99,
-                    value=0.1,
-                    step=0.01,
-                    info="Controls how tightly UMAP packs points together."
-                )
             generate_button = gr.Button("Generate Atlas", variant="primary")
         with gr.Column(scale=3):
-            gr.Markdown("### 2. Visualization")
-            output_html = gr.HTML(
-                "<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>"
-            )
     generate_button.click(
         fn=generate_atlas,
         inputs=[
-            dataset_input,
-            text_column_input,
-            split_input,
-            sample_size_input,
-            model_input,
-            umap_neighbors_input,
-            umap_min_dist_input,
         ],
         outputs=[output_html],
     )
 if __name__ == "__main__":
     app.launch()

 import gradio as gr
 import pandas as pd
+from datasets import load_dataset, get_dataset_split_names
+from huggingface_hub import HfApi, HfFolder
 import os
 import pathlib
 import uuid
 # --- Embedding Atlas Imports ---
 from embedding_atlas.data_source import DataSource
 from embedding_atlas.server import make_server
 from embedding_atlas.projection import compute_text_projection
 from embedding_atlas.utils import Hasher
 # --- Helper function from embedding_atlas/cli.py ---
 def find_column_name(existing_names, candidate):
     """Finds a unique column name, appending '_1', '_2', etc. if the candidate name already exists."""
     if candidate not in existing_names:
                 return s
             index += 1
+# --- Hugging Face API Helpers for Dynamic UI ---
+hf_api = HfApi()
+def get_user_datasets(username: str):
+    """Fetches all public datasets for a given username or organization."""
+    if not username:
+        return gr.Dropdown.update(choices=[], value=None, interactive=False)
+    try:
+        datasets = hf_api.list_datasets(author=username, cardData=True)
+        dataset_ids = [d.id for d in datasets if not d.private]
+        return gr.Dropdown.update(choices=sorted(dataset_ids), value=None, interactive=True)
+    except Exception as e:
+        gr.Warning(f"Could not fetch datasets for user '{username}'. Error: {e}")
+        return gr.Dropdown.update(choices=[], value=None, interactive=False)
+def get_dataset_splits(dataset_id: str):
+    """Gets all available splits for a selected dataset."""
+    if not dataset_id:
+        return gr.Dropdown.update(choices=[], value=None, interactive=False)
+    try:
+        splits = get_dataset_split_names(dataset_id)
+        return gr.Dropdown.update(choices=splits, value=splits[0] if splits else None, interactive=True)
+    except Exception as e:
+        gr.Warning(f"Could not fetch splits for dataset '{dataset_id}'. Error: {e}")
+        return gr.Dropdown.update(choices=[], value=None, interactive=False)
+def get_split_columns(dataset_id: str, split: str):
+    """Gets all columns for a selected split by loading one row."""
+    if not dataset_id or not split:
+        return gr.Dropdown.update(choices=[], value=None, interactive=False)
+    try:
+        # Stream one row to get column names without downloading the whole dataset
+        dataset_sample = load_dataset(dataset_id, split=split, streaming=True)
+        first_row = next(iter(dataset_sample))
+        columns = list(first_row.keys())
+        # Heuristically find the best text column
+        preferred_cols = ['text', 'content', 'instruction', 'question', 'document']
+        best_col = next((col for col in preferred_cols if col in columns), columns[0] if columns else None)
+        return gr.Dropdown.update(choices=columns, value=best_col, interactive=True)
+    except Exception as e:
+        gr.Warning(f"Could not fetch columns for split '{split}'. Error: {e}")
+        return gr.Dropdown.update(choices=[], value=None, interactive=False)
+# --- Main Atlas Generation Logic ---
 def generate_atlas(
     dataset_name: str,
     split: str,
+    text_column: str,
     sample_size: int,
     model_name: str,
     umap_neighbors: int,
     """
     Loads data, computes embeddings, and serves the Embedding Atlas UI.
     """
+    if not all([dataset_name, split, text_column]):
+        raise gr.Error("Please ensure a Dataset, Split, and Text Column are selected.")
+    progress(0, desc=f"Loading dataset '{dataset_name}' [{split}]...")
     try:
         dataset = load_dataset(dataset_name, split=split)
         df = dataset.to_pandas()
     except Exception as e:
+        raise gr.Error(f"Failed to load data. Error: {e}")
     if sample_size > 0 and sample_size < len(df):
         progress(0.1, desc=f"Sampling {sample_size} rows...")
         df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
     if text_column not in df.columns:
+        raise gr.Error(f"Column '{text_column}' not found. Please select a valid column.")
+    progress(0.2, desc="Computing embeddings and UMAP. This may take a while...")
+    x_col = find_column_name(df.columns, "projection_x")
+    y_col = find_column_name(df.columns, "projection_y")
+    neighbors_col = find_column_name(df.columns, "__neighbors")
     try:
         compute_text_projection(
+            df, text_column, x=x_col, y=y_col, neighbors=neighbors_col, model=model_name,
+            umap_args={"n_neighbors": umap_neighbors, "min_dist": umap_min_dist, "metric": "cosine", "random_state": 42},
         )
     except Exception as e:
         raise gr.Error(f"Failed to compute embeddings. Check model name or try a smaller sample. Error: {e}")
     progress(0.8, desc="Preparing Atlas data source...")
+    id_col = find_column_name(df.columns, "_row_index")
+    df[id_col] = range(df.shape[0])
     metadata = {
+        "columns": {"id": id_col, "text": text_column, "embedding": {"x": x_col, "y": y_col}, "neighbors": neighbors_col},
     }
     hasher = Hasher()
+    hasher.update(f"{dataset_name}-{split}-{text_column}-{sample_size}-{model_name}")
     identifier = hasher.hexdigest()
     atlas_dataset = DataSource(identifier, df, metadata)
     progress(0.9, desc="Mounting visualization UI...")
+    static_path = str((pathlib.Path(__import__('embedding_atlas').__file__).parent / "static").resolve())
     mount_path = f"/{uuid.uuid4().hex}"
     atlas_app = make_server(atlas_dataset, static_path=static_path, duckdb_uri="wasm")
+    # --- THIS IS THE FIX ---
+    # Call mount_gradio_app on the Blocks instance `app`
+    app.mount_gradio_app(atlas_app, path=mount_path)
     progress(1.0, desc="Done!")
     iframe_html = f"<iframe src='{mount_path}' width='100%' height='800px' frameborder='0'></iframe>"
     return gr.HTML(iframe_html)
 # --- Gradio UI Definition ---
 with gr.Blocks(theme=gr.themes.Soft(), title="Embedding Atlas Explorer") as app:
     gr.Markdown("# Embedding Atlas Explorer")
     gr.Markdown(
+        "Interactively select and visualize any text-based dataset from the Hugging Face Hub. "
+        "The app computes embeddings and projects them into a 2D map for exploration."
     )
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### 1. Select Data")
+            hf_user_input = gr.Textbox(label="Hugging Face User or Org Name", value="Trendyol", placeholder="e.g., 'gradio' or 'google'")
+            dataset_input = gr.Dropdown(label="Select a Dataset", interactive=False)
+            split_input = gr.Dropdown(label="Select a Split", interactive=False)
+            text_column_input = gr.Dropdown(label="Select a Text Column", interactive=False)
+            gr.Markdown("### 2. Configure Visualization")
+            sample_size_input = gr.Slider(label="Number of Samples", minimum=0, maximum=10000, value=2000, step=100)
             with gr.Accordion("Advanced Settings", open=False):
+                model_input = gr.Dropdown(label="Embedding Model", choices=["all-MiniLM-L6-v2", "all-mpnet-base-v2", "multi-qa-MiniLM-L6-cos-v1"], value="all-MiniLM-L6-v2")
+                umap_neighbors_input = gr.Slider(label="UMAP Neighbors", minimum=2, maximum=100, value=15, step=1, info="Controls local vs. global structure.")
+                umap_min_dist_input = gr.Slider(label="UMAP Min Distance", minimum=0.0, maximum=0.99, value=0.1, step=0.01, info="Controls how tightly points are packed.")
             generate_button = gr.Button("Generate Atlas", variant="primary")
         with gr.Column(scale=3):
+            gr.Markdown("### 3. Explore Atlas")
+            output_html = gr.HTML("<div style='display:flex; justify-content:center; align-items:center; height:800px; border: 1px solid #ddd; border-radius: 5px;'><p>Atlas will be displayed here after generation.</p></div>")
+    # --- Chained Event Listeners for Dynamic UI ---
+    hf_user_input.submit(
+        fn=get_user_datasets,
+        inputs=[hf_user_input],
+        outputs=[dataset_input]
+    )
+    dataset_input.select(
+        fn=get_dataset_splits,
+        inputs=[dataset_input],
+        outputs=[split_input]
+    )
+    split_input.select(
+        fn=get_split_columns,
+        inputs=[dataset_input, split_input],
+        outputs=[text_column_input]
+    )
+    # --- Button Click Event ---
     generate_button.click(
         fn=generate_atlas,
         inputs=[
+            dataset_input, split_input, text_column_input,
+            sample_size_input, model_input, umap_neighbors_input, umap_min_dist_input
         ],
         outputs=[output_html],
     )
+    # Load initial example data on app load
+    app.load(fn=get_user_datasets, inputs=[hf_user_input], outputs=[dataset_input])
 if __name__ == "__main__":
+    # To run locally, you might need to log in to Hugging Face Hub
+    # HfFolder.save_token("YOUR_HF_TOKEN")
     app.launch()