Spaces:

MaxNoichl
/

clustering_explorer

Sleeping

Maximilian Noichl

Update app.py

2c551c4 verified over 1 year ago

13.3 kB

	# -*- coding: utf-8 -*-
	"""01_clustering_methods.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1mqAGInsaItbKYVUlP9muYz3fpdGBWFz5
	"""



	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	import sklearn.cluster as cluster


	import colormaps as cmaps
	import opinionated
	# Activate the matplotlib style sheet named "opinionated_rc".
	# NOTE(review): presumably this style is registered by the `opinionated`
	# import above -- confirm before reordering these lines.
	plt.style.use("opinionated_rc")
	from opinionated.core import download_googlefont
	# Make the Quicksand font available before selecting it as the default family.
	# NOTE(review): this looks like a network download at import time -- confirm.
	download_googlefont('Quicksand', add_to_cache=True)
	plt.rc('font', family='Quicksand')

	#wget https://github.com/scikit-learn-contrib/hdbscan/raw/master/notebooks/clusterable_data.npy
	#!wget https://github.com/mwaskom/seaborn-data/raw/master/penguins.csv


	import requests

	# Remote locations of the two example data files fetched at start-up.
	clusterable_data_url = (
	    "https://github.com/scikit-learn-contrib/hdbscan/raw/master/"
	    "notebooks/clusterable_data.npy"
	)
	penguins_csv_url = (
	    "https://github.com/mwaskom/seaborn-data/raw/master/penguins.csv"
	)

	def download_file(url, local_filename, timeout=30):
	    """Stream the resource at ``url`` into the file ``local_filename``.

	    The response body is written in 8 KiB chunks, so the whole download is
	    never held in memory at once.

	    Args:
	        url: Address of the file to fetch.
	        local_filename: Path of the file to (over)write with the response body.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` waits indefinitely, which would hang the
	            application at start-up if the host stops responding.

	    Raises:
	        requests.HTTPError: If the server answers with an error status
	            (raised by ``raise_for_status``).
	        requests.Timeout: If the server does not respond within ``timeout``.
	    """
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        response.raise_for_status()
	        with open(local_filename, 'wb') as out_file:
	            for chunk in response.iter_content(chunk_size=8192):
	                # Skip empty keep-alive chunks; there is nothing to write.
	                if chunk:
	                    out_file.write(chunk)

	# Fetch both example data files into the working directory.
	for _source_url, _target_path in (
	    (clusterable_data_url, "clusterable_data.npy"),
	    (penguins_csv_url, "penguins.csv"),
	):
	    download_file(_source_url, _target_path)

	print("Files downloaded successfully.")




	# Load the HDBSCAN example points from the downloaded .npy file.
	hdbscan_example_data = np.load('clusterable_data.npy')

	# Keep three numeric penguin measurements and drop rows with missing values.
	_penguin_columns = ['bill_length_mm','bill_depth_mm','flipper_length_mm']
	_penguin_measurements = pd.read_csv('penguins.csv')[_penguin_columns]
	penguins_dataset = _penguin_measurements.dropna().values

	from sklearn.preprocessing import StandardScaler

	# Rescale the penguin columns with a StandardScaler fitted on the same data.
	scaler = StandardScaler()
	penguins_dataset_standardized = scaler.fit_transform(penguins_dataset)













	import gradio as gr
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.datasets import make_blobs, make_moons, load_iris
	import seaborn as sns
	import pandas as pd
	import matplotlib.colors as mcolors


	from sklearn.cluster import KMeans
	from sklearn.cluster import AgglomerativeClustering
	from sklearn.mixture import GaussianMixture
	import hdbscan


	import genieclust





	# Synthetic toy datasets, generated with fixed seeds so they are reproducible.
	blobs_X = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)[0]
	moons_X = make_moons(n_samples=300, noise=0.05, random_state=0)[0]

	# Iris feature matrix. It is loaded here but is not registered in `datasets`;
	# the "Penguins" entry below uses the standardized penguin measurements.
	iris_X = load_iris(return_X_y=True)[0]

	# Registry mapping the dataset names offered in the UI to their feature arrays.
	datasets = {
	    "Blobs": blobs_X,
	    "Moons": moons_X,
	    "Penguins": penguins_dataset_standardized,
	    "hDBSCAN sample": hdbscan_example_data,
	}

	def plot_unclustered(dataset_name):
	    """Plot the raw, unlabelled points of one of the registered datasets.

	    Args:
	        dataset_name: Key into the module-level ``datasets`` dictionary.

	    Returns:
	        A matplotlib figure: a seaborn pair plot when the data has more than
	        two columns, otherwise a scatter plot of the first two columns.

	    Raises:
	        KeyError: If ``dataset_name`` is not a key of ``datasets``.
	    """
	    X = datasets[dataset_name]

	    if X.shape[1] > 2:
	        # More than two features: show every pairwise combination.
	        df = pd.DataFrame(X)
	        fig = sns.pairplot(df, plot_kws={'color': 'grey','alpha':0.7}, diag_kws={'color': 'grey'}).fig
	    else:
	        fig, ax = plt.subplots(figsize=(8, 6))
	        ax.scatter(X[:, 0], X[:, 1], color='gray', marker='.',alpha=.7)
	        ax.set_xlabel("Feature 1")
	        ax.set_ylabel("Feature 2")
	        ax.grid(True)

	    # Work on the explicit figure handle instead of pyplot's global "current
	    # figure", and close the figure on both branches so pyplot does not keep
	    # a reference to every plot ever produced.
	    fig.tight_layout()
	    plt.close(fig)

	    return fig

	def _cluster_colormap(n_clusters):
	    """Return the colormap that maps integer cluster ids to colours.

	    Up to ten clusters get the first ``n_clusters`` entries of
	    ``cmaps.greenorange_12``; larger counts use ``cmaps.cet_g_bw_minc`` as is.
	    """
	    if n_clusters <= 10:
	        # Calling the palette with integers picks its leading entries.
	        colors = cmaps.greenorange_12(list(range(n_clusters)))
	        return mcolors.ListedColormap(colors)
	    return cmaps.cet_g_bw_minc


	def _draw_cluster_pair_grid(X, labels, n_clusters, cluster_cmap, point_colors):
	    """Draw an n-by-n grid of plots for data with more than two columns.

	    Off-diagonal cells are scatter plots of one column against another,
	    coloured per point. Diagonal cells are histograms of a single column,
	    stacked by cluster; points with a negative label are left out of them.

	    Args:
	        X: 2-D array of samples (rows) by features (columns).
	        labels: NumPy array holding one integer cluster id per row of ``X``.
	        n_clusters: Number of distinct non-negative labels.
	        cluster_cmap: Colormap called with a cluster id to get its colour.
	        point_colors: One colour per row of ``X`` for the scatter plots.

	    Returns:
	        The matplotlib figure containing the grid.
	    """
	    df = pd.DataFrame(X)

	    # Shared bin edges per column so the stacked bars of all clusters line up.
	    n_bins = 10
	    bins = {column: np.linspace(df[column].min(), df[column].max(), n_bins + 1)
	            for column in df.columns}

	    n = len(df.columns)
	    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=(n * 2.3, n * 2.3))

	    for i, row_column in enumerate(df.columns):
	        for j, col_column in enumerate(df.columns):
	            ax = axes[i, j]
	            ax.grid(True, which='both', linestyle='--', linewidth=0.5)

	            if i != j:
	                ax.scatter(df[col_column], df[row_column], c=point_colors,
	                           alpha=0.8, marker='o', s=10)
	            else:
	                # Diagonal: stack one histogram per cluster on top of the
	                # previous ones by carrying the running bar heights along.
	                data = df[row_column]
	                bottom = np.zeros(n_bins)
	                for cluster_id in range(n_clusters):
	                    hist, _ = np.histogram(data[labels == cluster_id], bins=bins[row_column])
	                    ax.bar(range(n_bins), hist, width=1, align='center',
	                           bottom=bottom, color=cluster_cmap(cluster_id))
	                    bottom = bottom + hist

	            # Explicit axis lines at the bottom and left only.
	            ax.spines['top'].set_visible(False)
	            ax.spines['right'].set_visible(False)
	            ax.spines['bottom'].set_visible(True)
	            ax.spines['left'].set_visible(True)

	            # Tick labels only on the bottom row and the first column.
	            if i < n - 1:
	                ax.tick_params(labelbottom=False)
	            else:
	                ax.tick_params(axis='x', labelsize=8)
	            if j > 0:
	                ax.tick_params(labelleft=False)
	            else:
	                ax.tick_params(axis='y', labelsize=8)

	            # Axis titles for the outer plots only.
	            if i == n - 1:
	                ax.set_xlabel(col_column, rotation=0, fontsize=12)
	            if j == 0:
	                ax.set_ylabel(row_column, fontsize=12)

	    return fig


	def plot_clustered(dataset_name, clustering_method, kmeans_n_clusters, agg_n_clusters, agg_linkage, gmm_n_clusters, covariance_type,
	                   genie_n_clusters, gini_threshold, M,hdbscan_min_cluster_size, hdbscan_min_samples):
	    """Cluster a registered dataset and plot the points coloured by cluster.

	    Only the parameters that belong to the selected ``clustering_method`` are
	    used; the others are accepted so the function can be wired to one fixed
	    list of UI inputs.

	    Args:
	        dataset_name: Key into the module-level ``datasets`` dictionary.
	        clustering_method: One of "K-Means", "Agglomerative",
	            "Gaussian Mixture", "Genie" or "h-DBSCAN".
	        kmeans_n_clusters: ``n_clusters`` for ``KMeans``.
	        agg_n_clusters: ``n_clusters`` for ``AgglomerativeClustering``.
	        agg_linkage: ``linkage`` for ``AgglomerativeClustering``.
	        gmm_n_clusters: ``n_components`` for ``GaussianMixture``.
	        covariance_type: ``covariance_type`` for ``GaussianMixture``.
	        genie_n_clusters: ``n_clusters`` for ``genieclust.Genie``.
	        gini_threshold: ``gini_threshold`` for ``genieclust.Genie``.
	        M: ``M`` for ``genieclust.Genie``.
	        hdbscan_min_cluster_size: ``min_cluster_size`` for ``hdbscan.HDBSCAN``.
	        hdbscan_min_samples: ``min_samples`` for ``hdbscan.HDBSCAN``.

	    Returns:
	        A matplotlib figure: a single scatter plot for two-column data, or a
	        grid of pairwise scatter plots and stacked histograms otherwise.
	        Points with a negative label are drawn in grey.

	    Raises:
	        KeyError: If ``dataset_name`` is not a key of ``datasets``.
	        ValueError: If ``clustering_method`` is not one of the known names.
	    """
	    X = datasets[dataset_name]

	    # Fit the selected model and collect one label per sample.
	    if clustering_method == "K-Means":
	        model = KMeans(n_clusters=kmeans_n_clusters)
	        model.fit(X)
	        labels = model.labels_

	    elif clustering_method == "Agglomerative":
	        model = AgglomerativeClustering(n_clusters=agg_n_clusters, linkage=agg_linkage)
	        model.fit(X)
	        labels = model.labels_

	    elif clustering_method == "Gaussian Mixture":
	        model = GaussianMixture(n_components=gmm_n_clusters, covariance_type=covariance_type)
	        model.fit(X)
	        # The mixture model exposes labels through predict().
	        labels = model.predict(X)

	    elif clustering_method == "Genie":
	        model = genieclust.Genie(n_clusters=genie_n_clusters, gini_threshold=gini_threshold, M=M)
	        labels = model.fit_predict(X)

	    elif clustering_method == "h-DBSCAN":
	        clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples=hdbscan_min_samples).fit(X)
	        labels = clusterer.labels_

	    else:
	        # Fail with a clear message instead of an unbound-variable error below.
	        raise ValueError(f"Unknown clustering method: {clustering_method!r}")

	    labels = np.asarray(labels)

	    # Negative labels are not counted as clusters.
	    n_clusters = np.unique(labels[labels >= 0]).size
	    new_cmap = _cluster_colormap(n_clusters)

	    noise_color = (0.5, 0.5, 0.5, 1.0)
	    cluster_colors = [new_cmap(label) if label >= 0 else noise_color
	                      for label in labels]

	    if X.shape[1] > 2:
	        fig = _draw_cluster_pair_grid(X, labels, n_clusters, new_cmap, cluster_colors)
	    else:
	        fig, ax = plt.subplots(figsize=(8, 6))
	        ax.scatter(X[:, 0], X[:, 1], c=cluster_colors, marker='.')
	        ax.grid(True)

	    # Lay out and close through the figure handle on both branches so that
	    # pyplot does not keep a reference to every figure ever produced.
	    fig.tight_layout()
	    plt.close(fig)

	    return fig

	# Markdown text passed to gr.Markdown at the top of the interface.
	intro_md = """
	# Cluster-algorithm-explorer

	_by [Max Noichl](https://homepage.univie.ac.at/maximilian.noichl/), for the clustering & data-visualization-workshop, Bremen, 2024_

	Below you can test a number of clustering-algorithms on several easier and harder datasets.

	"""



	# Build the Gradio UI: dataset and algorithm controls plus two plot panels.
	with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
	with gr.Column():
	gr.Markdown(intro_md)
	with gr.Row():

	with gr.Column():
	gr.Markdown("# Choose your dataset:")
	dataset_dropdown = gr.Dropdown(label="Select a dataset", choices=list(datasets.keys()), value="Blobs")



	gr.Markdown("# Choose your Clustering algorithm & Parameters:")


	# Algorithm selector; its value is passed to plot_clustered as clustering_method.
	clustering_method_dropdown = gr.Dropdown(label="Select a clustering method", choices=["K-Means", "Agglomerative", "Gaussian Mixture", "Genie", "h-DBSCAN"], value="K-Means")

	# K-Means parameters (the only group visible initially, matching the default method)
	with gr.Group(visible=True) as kmeans_params_group:
	kmeans_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (K-Means)", value=4)

	# Agglomerative Clustering parameters
	with gr.Group(visible=False) as agglomerative_params_group:
	agg_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (Agglomerative)", value=4)
	agg_linkage_dropdown = gr.Dropdown(label="Linkage Type", choices=["ward", "complete", "average", "single"], value="ward")

	# Gaussian Mixture Model parameters
	with gr.Group(visible=False) as gmm_params_group:
	gmm_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Components (GMM)", value=4)
	covariance_type_dropdown = gr.Dropdown(label="Covariance Type", choices=["full", "tied", "diag", "spherical"], value="full")

	# GenieClust parameters
	with gr.Group(visible=False) as genie_params_group:
	genie_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (Genie)", value=4)
	# NOTE(review): maximum=1.05 lets the slider exceed 1.0, although a Gini index
	# lies in [0, 1] -- confirm genieclust accepts such values or lower the maximum.
	gini_threshold_slider = gr.Slider(minimum=0.0, maximum=1.05, step=0.05, label="Gini Threshold (Genie)", value=.3)
	# NOTE(review): this slider yields fractional values starting at 0.5; M is
	# presumably an integer >= 1 in genieclust -- verify against its documentation.
	M_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="M Parameter (Genie)", value=1.0)

	# HDBSCAN parameters
	with gr.Group(visible=False) as hdbscan_params_group:
	hdbscan_min_cluster_size = gr.Slider(minimum=2, maximum=200, step=1, label="Minimal Cluster Size (hDBSCAN)", value=10)
	hdbscan_min_samples = gr.Slider(minimum=2, maximum=200, step=1, label="Min. Samples (hDBSCAN)", value=10)



	# Show only the parameter group that belongs to the selected clustering method.
	def update_method_params(clustering_method):
	return {
	kmeans_params_group: gr.Group(visible=clustering_method == "K-Means"),
	agglomerative_params_group: gr.Group(visible=clustering_method == "Agglomerative"),
	gmm_params_group: gr.Group(visible=clustering_method == "Gaussian Mixture"),
	genie_params_group: gr.Group(visible=clustering_method == "Genie"),
	hdbscan_params_group: gr.Group(visible=clustering_method == "h-DBSCAN"),


	}


	# Re-evaluate the group visibility whenever another method is selected.
	clustering_method_dropdown.change(update_method_params, inputs=[clustering_method_dropdown], outputs=[kmeans_params_group, agglomerative_params_group,
	gmm_params_group, genie_params_group,hdbscan_params_group])

	button = gr.Button("Run Clustering!")


	# Output panels: the raw data and the clustering result.
	with gr.Column():
	unclustered_plot_output = gr.Plot(label=None)
	clustered_plot_output = gr.Plot(label=None)


	# Redraw the raw-data plot when the dataset changes and once when the page loads.
	dataset_dropdown.change(plot_unclustered, inputs=[dataset_dropdown], outputs=[unclustered_plot_output])
	demo.load(plot_unclustered, inputs=[dataset_dropdown], outputs=[unclustered_plot_output])
	# Run the clustering on button click; the order of `inputs` must match the
	# positional parameters of plot_clustered.
	button.click(
	plot_clustered,
	inputs=[
	dataset_dropdown,
	clustering_method_dropdown,
	kmeans_n_clusters_slider,
	agg_n_clusters_slider,
	agg_linkage_dropdown,
	gmm_n_clusters_slider,
	covariance_type_dropdown,
	genie_n_clusters_slider, # Add Genie parameters
	gini_threshold_slider,
	M_slider,
	hdbscan_min_cluster_size,
	hdbscan_min_samples
	],
	outputs=[clustered_plot_output]
	)

	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch(debug=True)