Spaces:

sklearn-docs
/

GradientBoostingClassifier

Running

App Files Community

ZennyKenny committed on Jan 31

Commit

91cbc46

verified ·

1 Parent(s): 349af26

try new dataset handling

Browse files

Files changed (1) hide show

app.py +115 -100

app.py CHANGED Viewed

@@ -1,79 +1,102 @@
 import gradio as gr
 import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt
-import pandas as pd
 from datasets import load_dataset
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, confusion_matrix
-matplotlib.use('Agg')  # Avoid issues in some remote environments
-# Pre-populate a short list of "recommended" Hugging Face datasets
-# (Replace "datasorg/iris" etc. with real dataset IDs you want to showcase)
 SUGGESTED_DATASETS = [
-    "datasorg/iris",         # hypothetical ID
-    "uciml/wine_quality-red", # example from the HF Hub
-    "SKIP/ENTER_CUSTOM"      # We'll treat this as a "separator" or "prompt" for custom
 ]
-def load_and_prepare_dataset(dataset_id, label_column, feature_columns):
     """
-    Loads a dataset from the Hugging Face Hub,
-    converts it to a pandas DataFrame,
-    returns X, y as NumPy arrays for modeling.
     """
-    # Load only the "train" split for simplicity
-    # Many datasets have "train", "test", "validation" splits
-    ds = load_dataset(dataset_id, split="train")
-    # Convert to a DataFrame for easy manipulation
-    df = pd.DataFrame(ds)
-    # Subset to selected columns
-    if label_column not in df.columns:
-        raise ValueError(f"Label column '{label_column}' not in dataset columns: {df.columns.to_list()}")
-    for col in feature_columns:
-        if col not in df.columns:
-            raise ValueError(f"Feature column '{col}' not in dataset columns: {df.columns.to_list()}")
-    # Split into X and y
-    X = df[feature_columns].values
-    y = df[label_column].values
-    return X, y, df.columns.tolist()
-def train_model(dataset_id, custom_dataset_id, label_column, feature_columns,
                 learning_rate, n_estimators, max_depth, test_size):
     """
-    1. Determine final dataset ID (either from dropdown or custom text).
-    2. Load dataset -> DataFrame -> X, y.
-    3. Train a GradientBoostingClassifier.
-    4. Generate plots & metrics (accuracy and confusion matrix).
     """
-    # Decide which dataset ID to use
     if dataset_id != "SKIP/ENTER_CUSTOM":
         final_id = dataset_id
     else:
-        # Use the user-supplied "custom_dataset_id"
         final_id = custom_dataset_id.strip()
-    # Prepare data
-    X, y, columns_available = load_and_prepare_dataset(
-        final_id,
-        label_column,
-        feature_columns
-    )
-    # Train/test split
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=test_size, random_state=42
     )
     # Train model
     clf = GradientBoostingClassifier(
         learning_rate=learning_rate,
@@ -82,13 +105,15 @@ def train_model(dataset_id, custom_dataset_id, label_column, feature_columns,
         random_state=42
     )
     clf.fit(X_train, y_train)
-    # Evaluate
     y_pred = clf.predict(X_test)
     accuracy = accuracy_score(y_test, y_pred)
     cm = confusion_matrix(y_test, y_pred)
-    # Plot figure
     fig, axs = plt.subplots(1, 2, figsize=(10, 4))
     # Subplot 1: Feature Importances
@@ -103,90 +128,80 @@ def train_model(dataset_id, custom_dataset_id, label_column, feature_columns,
     im = axs[1].imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
     axs[1].set_title("Confusion Matrix")
     plt.colorbar(im, ax=axs[1])
-    # Labeling
     axs[1].set_xlabel("Predicted")
     axs[1].set_ylabel("True")
-    # If you want to annotate each cell:
     thresh = cm.max() / 2.0
     for i in range(cm.shape[0]):
         for j in range(cm.shape[1]):
             color = "white" if cm[i, j] > thresh else "black"
-            axs[1].text(j, i, format(cm[i, j], "d"), ha="center", va="center", color=color)
     plt.tight_layout()
-    output_text = f"**Dataset used:** {final_id}\n\n"
-    output_text += f"**Accuracy:** {accuracy:.3f}\n\n"
-    output_text += "**Confusion Matrix** (raw counts above)."
-    return output_text, fig, columns_available
-def update_columns(dataset_id, dataset_config, custom_dataset_id):
-    """
-    Load the dataset from HF hub, using either the suggested one or the custom user-specified,
-    plus an optional config.
-    """
-    if dataset_id != "SKIP/ENTER_CUSTOM":
-        final_id = dataset_id
-        final_config = dataset_config.strip() if dataset_config else None
-    else:
-        # Use the user-supplied text
-        final_id = custom_dataset_id.strip()
-        final_config = None  # or parse from text if you like
-    try:
-        if final_config:
-            ds = load_dataset(final_id, final_config, split="train")
-        else:
-            ds = load_dataset(final_id, split="train")
-        df = pd.DataFrame(ds)
-        cols = df.columns.tolist()
-        return gr.update(choices=cols), gr.update(choices=cols), f"Columns found: {cols}"
-    except Exception as e:
-        return gr.update(choices=[]), gr.update(choices=[]), f"Error loading {final_id}: {e}"
 with gr.Blocks() as demo:
-    gr.Markdown("## Train GradientBoostingClassifier on a Hugging Face dataset of your choice")
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
             choices=SUGGESTED_DATASETS,
-            value=SUGGESTED_DATASETS[0],
-            label="Choose a dataset"
         )
-        custom_dataset_id = gr.Textbox(label="Or enter HF dataset (user/dataset)", value="",
-                                       placeholder="e.g. 'username/my_custom_dataset'")
-    # Button to load columns from the chosen dataset
-    load_cols_btn = gr.Button("Load columns")
     load_cols_info = gr.Markdown()
     with gr.Row():
         label_col = gr.Dropdown(choices=[], label="Label column (choose 1)")
         feature_cols = gr.CheckboxGroup(choices=[], label="Feature columns (choose 1 or more)")
-    # Once columns are chosen, we can set hyperparams
     learning_rate_slider = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="learning_rate")
     n_estimators_slider = gr.Slider(50, 300, value=100, step=50, label="n_estimators")
     max_depth_slider = gr.Slider(1, 10, value=3, step=1, label="max_depth")
-    test_size_slider = gr.Slider(0.1, 0.9, value=0.3, step=0.1, label="test_size (fraction)")
     train_button = gr.Button("Train & Evaluate")
     output_text = gr.Markdown()
     output_plot = gr.Plot()
-    # We might also want to show the columns for reference post-training
-    columns_return = gr.Markdown()
-    # When "Load columns" is clicked, we call update_columns to fetch the dataset columns
     load_cols_btn.click(
         fn=update_columns,
         inputs=[dataset_dropdown, custom_dataset_id],
-        outputs=[label_col, feature_cols, load_cols_info]
     )
-    # When "Train & Evaluate" is clicked, we train the model
     train_button.click(
         fn=train_model,
         inputs=[
@@ -199,7 +214,7 @@ with gr.Blocks() as demo:
             max_depth_slider,
             test_size_slider
         ],
-        outputs=[output_text, output_plot, columns_return]
     )
 demo.launch()

+# app.py
 import gradio as gr
 import numpy as np
+import pandas as pd
 import matplotlib
 import matplotlib.pyplot as plt
 from datasets import load_dataset
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score, confusion_matrix
+# In some remote environments, Matplotlib needs to be set to 'Agg' backend
+matplotlib.use('Agg')
+################################################################################
+# SUGGESTED_DATASETS: Must actually exist on huggingface.co/datasets.
+#
+# "scikit-learn/iris" -> a tabular Iris dataset with a "train" split of 150 rows.
+# "uci/wine"          -> a tabular Wine dataset with a "train" split of 178 rows.
+################################################################################
 SUGGESTED_DATASETS = [
+    "scikit-learn/iris",
+    "uci/wine",
+    "SKIP/ENTER_CUSTOM"  # a placeholder meaning "use custom_dataset_id"
 ]
+def update_columns(dataset_id, custom_dataset_id):
     """
+    Loads the chosen dataset (train split) and returns its column names,
+    to populate the Label Column & Feature Columns selectors.
     """
+    # If user picked a suggested dataset (not SKIP), use that
+    if dataset_id != "SKIP/ENTER_CUSTOM":
+        final_id = dataset_id
+    else:
+        # Use the user-supplied dataset ID
+        final_id = custom_dataset_id.strip()
+    try:
+        # Load just the "train" split; many HF datasets have train/test/validation
+        ds = load_dataset(final_id, split="train")
+        df = pd.DataFrame(ds)
+        cols = df.columns.tolist()
+        message = f"**Loaded dataset**: {final_id}\n\n**Columns found**: {cols}"
+        # Return list of columns for both label & features
+        return (
+            gr.update(choices=cols, value=None),   # label_col dropdown
+            gr.update(choices=cols, value=[]),     # feature_cols checkbox group
+            message
+        )
+    except Exception as e:
+        # If load fails or dataset doesn't exist
+        err_msg = f"**Error loading** `{final_id}`: {e}"
+        return (
+            gr.update(choices=[], value=None),
+            gr.update(choices=[], value=[]),
+            err_msg
+        )
+def train_model(dataset_id, custom_dataset_id, label_column, feature_columns,
                 learning_rate, n_estimators, max_depth, test_size):
     """
+    1. Determine the final dataset ID (from dropdown or custom text).
+    2. Load the dataset -> create dataframe -> X, y.
+    3. Train GradientBoostingClassifier.
+    4. Return metrics (accuracy) and a Matplotlib figure with:
+       - Feature importance bar chart
+       - Confusion matrix heatmap
     """
     if dataset_id != "SKIP/ENTER_CUSTOM":
         final_id = dataset_id
     else:
         final_id = custom_dataset_id.strip()
+    # Load dataset
+    ds = load_dataset(final_id, split="train")
+    df = pd.DataFrame(ds)
+    # Basic validation
+    if label_column not in df.columns:
+        raise ValueError(f"Label column '{label_column}' not found in dataset columns.")
+    for fc in feature_columns:
+        if fc not in df.columns:
+            raise ValueError(f"Feature column '{fc}' not found in dataset columns.")
+    # Build X, y arrays
+    X = df[feature_columns].values
+    y = df[label_column].values
+    # Split
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=test_size, random_state=42
     )
     # Train model
     clf = GradientBoostingClassifier(
         learning_rate=learning_rate,
         random_state=42
     )
     clf.fit(X_train, y_train)
+    # Predictions & metrics
     y_pred = clf.predict(X_test)
     accuracy = accuracy_score(y_test, y_pred)
     cm = confusion_matrix(y_test, y_pred)
+    # Build a single figure with 2 subplots:
+    #   1) Feature importances
+    #   2) Confusion matrix heatmap
     fig, axs = plt.subplots(1, 2, figsize=(10, 4))
     # Subplot 1: Feature Importances
     im = axs[1].imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
     axs[1].set_title("Confusion Matrix")
     plt.colorbar(im, ax=axs[1])
     axs[1].set_xlabel("Predicted")
     axs[1].set_ylabel("True")
+    # Optionally annotate each cell with the count
     thresh = cm.max() / 2.0
     for i in range(cm.shape[0]):
         for j in range(cm.shape[1]):
             color = "white" if cm[i, j] > thresh else "black"
+            axs[1].text(j, i, str(cm[i, j]), ha="center", va="center", color=color)
     plt.tight_layout()
+    # Build textual summary
+    text_summary = (
+        f"**Dataset used**: `{final_id}`\n\n"
+        f"**Label column**: `{label_column}`\n\n"
+        f"**Feature columns**: `{feature_columns}`\n\n"
+        f"**Accuracy**: {accuracy:.3f}\n\n"
+    )
+    return text_summary, fig
+# Build the Gradio Blocks UI
 with gr.Blocks() as demo:
+    gr.Markdown("# Train a GradientBoostingClassifier on any HF Dataset\n")
+    gr.Markdown(
+        "1. Choose a suggested dataset from the dropdown **or** enter a custom dataset ID in the format `user/dataset`.\n"
+        "2. Click **Load Columns** to inspect the columns.\n"
+        "3. Pick a **Label column** and **Feature columns**.\n"
+        "4. Adjust hyperparameters and click **Train & Evaluate**.\n"
+        "5. Observe accuracy, feature importances, and a confusion matrix heatmap.\n\n"
+        "*(Note: the dataset must have a `train` split!)*"
+    )
+    # Row 1: Dataset selection
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
+            label="Choose suggested dataset",
             choices=SUGGESTED_DATASETS,
+            value=SUGGESTED_DATASETS[0]  # default
+        )
+        custom_dataset_id = gr.Textbox(
+            label="Or enter a custom dataset ID",
+            placeholder="e.g. username/my_custom_dataset"
         )
+    load_cols_btn = gr.Button("Load Columns")
     load_cols_info = gr.Markdown()
+    # Row 2: label & feature columns
     with gr.Row():
         label_col = gr.Dropdown(choices=[], label="Label column (choose 1)")
         feature_cols = gr.CheckboxGroup(choices=[], label="Feature columns (choose 1 or more)")
+    # Hyperparameters
     learning_rate_slider = gr.Slider(0.01, 1.0, value=0.1, step=0.01, label="learning_rate")
     n_estimators_slider = gr.Slider(50, 300, value=100, step=50, label="n_estimators")
     max_depth_slider = gr.Slider(1, 10, value=3, step=1, label="max_depth")
+    test_size_slider = gr.Slider(0.1, 0.9, value=0.3, step=0.1, label="test_size fraction (0.1-0.9)")
     train_button = gr.Button("Train & Evaluate")
     output_text = gr.Markdown()
     output_plot = gr.Plot()
+    # Link the "Load Columns" button -> update_columns function
     load_cols_btn.click(
         fn=update_columns,
         inputs=[dataset_dropdown, custom_dataset_id],
+        outputs=[label_col, feature_cols, load_cols_info],
     )
+    # Link "Train & Evaluate" -> train_model function
     train_button.click(
         fn=train_model,
         inputs=[
             max_depth_slider,
             test_size_slider
         ],
+        outputs=[output_text, output_plot],
     )
 demo.launch()