Spaces:

sklearn-docs
/

GradientBoostingClassifier

Runtime error

App Files Files Community

GradientBoostingClassifier / app.py

ZennyKenny

improve docs

e72fe9d verified 11 months ago

raw

history blame

9.76 kB

	import gradio as gr
	import numpy as np
	import pandas as pd
	import matplotlib
	import matplotlib.pyplot as plt

	from datasets import load_dataset
	from sklearn.ensemble import GradientBoostingClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, confusion_matrix

	# Select the Agg backend: it renders figures off-screen, so plotting works
	# in a server process that has no display / GUI toolkit available.
	matplotlib.use('Agg')

	################################################################################
	# SUGGESTED_DATASETS: These must actually exist on huggingface.co/datasets
	#
	# "scikit-learn/iris" -> A small, classic Iris dataset with a "train" split
	# "uci/wine" -> Another small dataset with a "train" split
	# "SKIP/ENTER_CUSTOM" -> Placeholder to let the user enter a custom dataset ID
	################################################################################
	# Choices shown in the dataset dropdown. The final entry is not a dataset:
	# the callbacks compare against it to decide whether to read the custom
	# dataset ID textbox instead.
	SUGGESTED_DATASETS = [
	    "scikit-learn/iris",
	    "uci/wine",
	    "SKIP/ENTER_CUSTOM",
	]

	def update_columns(dataset_id, custom_dataset_id):
	    """
	    Load the selected dataset and refresh the Label / Feature selectors.

	    The dataset ID comes from the dropdown unless the dropdown is set to the
	    "SKIP/ENTER_CUSTOM" placeholder, in which case the custom textbox value
	    is used. The dataset's "train" split is loaded, converted to a DataFrame,
	    and its column names become the choices of both selectors.

	    Args:
	        dataset_id: Value of the suggested-dataset dropdown.
	        custom_dataset_id: Value of the custom dataset ID textbox; may be
	            empty or None.

	    Returns:
	        A 3-tuple of (update for the label_col dropdown, update for the
	        feature_cols checkbox group, Markdown status message). On any
	        failure both selectors are cleared and the message describes the
	        error; this function does not raise for loading problems.
	    """
	    if dataset_id != "SKIP/ENTER_CUSTOM":
	        final_id = dataset_id
	    else:
	        # Guard against a missing textbox value before stripping.
	        final_id = (custom_dataset_id or "").strip()

	    # Nothing to load: report it directly instead of asking the Hub for "".
	    if not final_id:
	        return (
	            gr.update(choices=[], value=None),
	            gr.update(choices=[], value=[]),
	            "Error: no dataset selected. Choose a suggested dataset or "
	            "enter a custom dataset ID.",
	        )

	    try:
	        ds = load_dataset(final_id, split="train")
	        df = pd.DataFrame(ds)
	        cols = df.columns.tolist()
	    except Exception as e:
	        # Deliberately broad: this is the UI boundary, and any loading
	        # failure should be shown to the user rather than crash the callback.
	        err_msg = f"Error loading `{final_id}`: {e}"
	        return (
	            gr.update(choices=[], value=None),
	            gr.update(choices=[], value=[]),
	            err_msg
	        )

	    message = (
	        f"Loaded dataset: `{final_id}`\n\n"
	        f"Columns found: {cols}"
	    )
	    return (
	        gr.update(choices=cols, value=None),  # label_col dropdown
	        gr.update(choices=cols, value=[]),  # feature_cols checkbox group
	        message
	    )

	def train_model(dataset_id, custom_dataset_id, label_column, feature_columns,
	                learning_rate, n_estimators, max_depth, test_size):
	    """
	    Train a GradientBoostingClassifier on a Hub dataset and report results.

	    Steps:
	    1. Decide which dataset ID to load (from dropdown or custom textbox).
	    2. Check that a dataset, a label and at least one feature were chosen.
	    3. Load that dataset's 'train' split into a DataFrame and extract
	       X (features) and y (label).
	    4. Split into train/test and fit a GradientBoostingClassifier.
	    5. Compute accuracy and the confusion matrix on the test part.
	    6. Plot feature importances + confusion matrix heatmap.

	    Args:
	        dataset_id: Value of the suggested-dataset dropdown.
	        custom_dataset_id: Custom dataset ID, used only when the dropdown is
	            set to "SKIP/ENTER_CUSTOM"; may be empty or None.
	        label_column: Name of the target column.
	        feature_columns: Names of the input columns.
	        learning_rate: Passed to GradientBoostingClassifier.
	        n_estimators: Passed to GradientBoostingClassifier after int().
	        max_depth: Passed to GradientBoostingClassifier after int().
	        test_size: Passed to train_test_split as ``test_size``.

	    Returns:
	        (text_summary, fig): a Markdown summary string and a Matplotlib
	        Figure with two subplots.

	    Raises:
	        ValueError: If no dataset ID, label column or feature column was
	            provided, or if a chosen column is not in the dataset.
	    """
	    # Resolve final dataset ID
	    if dataset_id != "SKIP/ENTER_CUSTOM":
	        final_id = dataset_id
	    else:
	        final_id = (custom_dataset_id or "").strip()

	    # Fail fast on incomplete UI selections, before any download happens.
	    if not final_id:
	        raise ValueError(
	            "No dataset selected. Choose a suggested dataset or enter a "
	            "custom dataset ID."
	        )
	    if not label_column:
	        raise ValueError("No label column selected. Load the columns and choose one.")
	    feature_columns = list(feature_columns or [])
	    if not feature_columns:
	        raise ValueError("No feature columns selected. Choose at least one.")

	    # Load dataset -> df
	    ds = load_dataset(final_id, split="train")
	    df = pd.DataFrame(ds)

	    # Validate columns
	    if label_column not in df.columns:
	        raise ValueError(f"Label column '{label_column}' not found in dataset columns.")
	    for fc in feature_columns:
	        if fc not in df.columns:
	            raise ValueError(f"Feature column '{fc}' not found in dataset columns.")

	    # Convert to NumPy arrays
	    X = df[feature_columns].values
	    y = df[label_column].values

	    # Train/test split (fixed seed so repeated runs are comparable)
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=test_size, random_state=42
	    )

	    # Instantiate and train GradientBoostingClassifier
	    clf = GradientBoostingClassifier(
	        learning_rate=learning_rate,
	        n_estimators=int(n_estimators),
	        max_depth=int(max_depth),
	        random_state=42
	    )
	    clf.fit(X_train, y_train)

	    # Evaluate
	    y_pred = clf.predict(X_test)
	    accuracy = accuracy_score(y_test, y_pred)
	    # Pin the label order explicitly so the matrix rows/columns and the tick
	    # labels drawn below are guaranteed to line up.
	    class_labels = np.unique(np.concatenate([y_test, y_pred]))
	    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

	    # Build the figure without pyplot: pyplot keeps a global reference to
	    # every figure it creates, which leaks one figure per click in a
	    # long-running server. `matplotlib.figure` is loaded by the file's
	    # `import matplotlib.pyplot`.
	    fig = matplotlib.figure.Figure(figsize=(10, 4))
	    importance_ax, cm_ax = fig.subplots(1, 2)

	    # Subplot 1: Feature Importances
	    importances = clf.feature_importances_
	    positions = range(len(feature_columns))
	    importance_ax.barh(positions, importances, color='skyblue')
	    importance_ax.set_yticks(positions)
	    importance_ax.set_yticklabels(feature_columns)
	    importance_ax.set_xlabel("Importance")
	    importance_ax.set_title("Feature Importances")

	    # Subplot 2: Confusion Matrix Heatmap
	    im = cm_ax.imshow(cm, interpolation='nearest', cmap="Blues")
	    cm_ax.set_title("Confusion Matrix")
	    fig.colorbar(im, ax=cm_ax)
	    cm_ax.set_xlabel("Predicted")
	    cm_ax.set_ylabel("True")

	    # Show the actual class values on both axes instead of bare indices.
	    tick_positions = range(len(class_labels))
	    cm_ax.set_xticks(tick_positions)
	    cm_ax.set_xticklabels(class_labels, rotation=45, ha="right")
	    cm_ax.set_yticks(tick_positions)
	    cm_ax.set_yticklabels(class_labels)

	    # Annotate each cell with its count; light text on dark cells.
	    thresh = cm.max() / 2.0
	    for (i, j), count in np.ndenumerate(cm):
	        color = "white" if count > thresh else "black"
	        cm_ax.text(j, i, str(count), ha="center", va="center", color=color)

	    fig.tight_layout()

	    # Textual summary
	    text_summary = (
	        f"Dataset used: `{final_id}`\n\n"
	        f"Label column: `{label_column}`\n\n"
	        f"Feature columns: `{feature_columns}`\n\n"
	        f"Accuracy: {accuracy:.3f}\n\n"
	    )

	    return text_summary, fig

	###############################################################################
	# Gradio UI
	###############################################################################
	# Build the Blocks layout: intro text, dataset pickers, column selectors,
	# hyperparameter sliders, outputs, and the two button callbacks.
	with gr.Blocks() as demo:

	    # High-level title and description
	    gr.Markdown(
	        """
	# Introduction to Gradient Boosting

	This Space demonstrates how to train a [GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#gradientboostingclassifier) from scikit-learn on tabular datasets hosted on the [Hugging Face Hub](https://huggingface.co/datasets).

	Gradient Boosting is an ensemble machine learning technique that combines many weak learners (usually small decision trees) in an iterative, stage-wise fashion to create a stronger overall model.
	In each step, the algorithm fits a new weak learner to the current errors of the combined ensemble, effectively allowing the model to focus on the hardest-to-predict data points.
	By repeatedly adding these specialized trees, Gradient Boosting can capture complex patterns and deliver high predictive accuracy, especially on tabular data.

	Put simply, Gradient Boosting makes a big deal out of small anomalies!

	Purpose:
	- Easily explore hyperparameters (_learning_rate, n_estimators, max_depth_) and quickly train an ML model on real data.
	- Visualise model performance via confusion matrix heatmap and a feature importance plot.

	Notes:
	- The dataset must have a "train" split with tabular columns (i.e., no nested structures).
	- Large datasets may take time to download/train.
	- The confusion matrix helps you see how predictions compare to ground-truth labels. The diagonal cells show correct predictions; off-diagonal cells indicate misclassifications.
	- The feature importance plot shows which features the model relies on the most for its predictions.

	---

	Usage:
	1. Select one of the suggested datasets from the dropdown _or_ enter any valid dataset from the [Hugging Face Hub](https://huggingface.co/datasets).
	2. Click Load Columns to retrieve the column names from the dataset's train split.
	3. Choose exactly _one_ Label column (the target) and one or more Feature columns (the inputs).
	4. Adjust hyperparameters (learning_rate, n_estimators, max_depth, test_size).
	5. Click Train & Evaluate to train a Gradient Boosting model and see its accuracy, feature importances, and confusion matrix.

	You are now a machine learning engineer, congratulations 🤗

	---
	"""
	    )

	    # Dataset selection: suggested dropdown next to a free-text custom ID
	    with gr.Row():
	        dataset_dropdown = gr.Dropdown(
	            label="Choose suggested dataset",
	            choices=SUGGESTED_DATASETS,
	            value=SUGGESTED_DATASETS[0]
	        )
	        custom_dataset_id = gr.Textbox(
	            label="Or enter a custom dataset ID",
	            placeholder="e.g. user/my_custom_dataset"
	        )

	    load_cols_btn = gr.Button("Load Columns")
	    load_cols_info = gr.Markdown()

	    # Column selectors start empty; update_columns fills in their choices
	    with gr.Row():
	        label_col = gr.Dropdown(choices=[], label="Label column (choose 1)")
	        feature_cols = gr.CheckboxGroup(choices=[], label="Feature columns (choose 1 or more)")

	    # Model Hyperparameters
	    learning_rate_slider = gr.Slider(
	        minimum=0.01, maximum=1.0, value=0.1, step=0.01,
	        label="learning_rate"
	    )
	    n_estimators_slider = gr.Slider(
	        minimum=50, maximum=300, value=100, step=50,
	        label="n_estimators"
	    )
	    max_depth_slider = gr.Slider(
	        minimum=1, maximum=10, value=3, step=1,
	        label="max_depth"
	    )
	    test_size_slider = gr.Slider(
	        minimum=0.1, maximum=0.9, value=0.3, step=0.1,
	        label="test_size fraction (0.1-0.9)"
	    )

	    train_button = gr.Button("Train & Evaluate")

	    output_text = gr.Markdown()
	    output_plot = gr.Plot()

	    # Link the "Load Columns" button -> update_columns function
	    load_cols_btn.click(
	        fn=update_columns,
	        inputs=[dataset_dropdown, custom_dataset_id],
	        outputs=[label_col, feature_cols, load_cols_info],
	    )

	    # Link "Train & Evaluate" -> train_model function
	    # (input order must match train_model's positional parameters)
	    train_button.click(
	        fn=train_model,
	        inputs=[
	            dataset_dropdown,
	            custom_dataset_id,
	            label_col,
	            feature_cols,
	            learning_rate_slider,
	            n_estimators_slider,
	            max_depth_slider,
	            test_size_slider
	        ],
	        outputs=[output_text, output_plot],
	    )

	demo.launch()