Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	| # -*- coding: utf-8 -*- | |
| """01_clustering_methods.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1mqAGInsaItbKYVUlP9muYz3fpdGBWFz5 | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import sklearn.cluster as cluster | |
| import colormaps as cmaps | |
| import opinionated | |
from opinionated.core import download_googlefont
import requests

# Figure styling: activate the "opinionated_rc" matplotlib style sheet, then
# make Quicksand (obtained through opinionated's download_googlefont helper)
# the font family for all figure text.
plt.style.use("opinionated_rc")
download_googlefont('Quicksand', add_to_cache=True)
plt.rc('font', family='Quicksand')

# Remote locations of the two example datasets. The notebook version fetched
# them with shell commands:
#   wget https://github.com/scikit-learn-contrib/hdbscan/raw/master/notebooks/clusterable_data.npy
#   wget https://github.com/mwaskom/seaborn-data/raw/master/penguins.csv
clusterable_data_url = "https://github.com/scikit-learn-contrib/hdbscan/raw/master/notebooks/clusterable_data.npy"
penguins_csv_url = "https://github.com/mwaskom/seaborn-data/raw/master/penguins.csv"
def download_file(url, local_filename, timeout=30):
    """Stream the resource at *url* to disk as *local_filename*.

    Args:
        url: Address of the file to fetch.
        local_filename: Destination path. It is opened in binary write mode,
            so an existing file with that name is overwritten.
        timeout: Value forwarded to ``requests.get(timeout=...)``. Without a
            timeout a stalled connection would block start-up indefinitely.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the server
            answers with an error status.
        requests.RequestException: Any other request failure, including a
            timeout.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # Write the body in 8 KiB pieces instead of loading it whole.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
from sklearn.preprocessing import StandardScaler

# Fetch both example files into the current working directory.
for _source_url, _target_name in (
    (clusterable_data_url, "clusterable_data.npy"),
    (penguins_csv_url, "penguins.csv"),
):
    download_file(_source_url, _target_name)
print("Files downloaded successfully.")

# Sample data from the hdbscan notebooks, loaded as saved.
hdbscan_example_data = np.load('clusterable_data.npy')

# Penguins: keep the three measurement columns, drop rows with missing values
# and continue with the bare NumPy array.
_penguin_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm']
penguins_dataset = pd.read_csv('penguins.csv')[_penguin_columns].dropna().values

# Standardise the penguin features with scikit-learn's StandardScaler.
scaler = StandardScaler()
penguins_dataset_standardized = scaler.fit_transform(penguins_dataset)
| import gradio as gr | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from sklearn.datasets import make_blobs, make_moons, load_iris | |
| import seaborn as sns | |
| import pandas as pd | |
| import matplotlib.colors as mcolors | |
| from sklearn.cluster import KMeans | |
| from sklearn.cluster import AgglomerativeClustering | |
| from sklearn.mixture import GaussianMixture | |
| import hdbscan | |
| import genieclust | |
# Synthetic 2-D point clouds; only the coordinates are kept, the generated
# class labels are discarded.
blobs_X = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)[0]
moons_X = make_moons(n_samples=300, noise=0.05, random_state=0)[0]

# Iris features are loaded here but are not referenced by the dataset
# registry below.
iris_X, _ = load_iris(return_X_y=True)

# Registry of selectable datasets, keyed by the name shown in the UI.
# Insertion order is the order of the dropdown choices.
datasets = {}
datasets["Blobs"] = blobs_X
datasets["Moons"] = moons_X
datasets["Penguins"] = penguins_dataset_standardized  # standardised 3-feature penguin measurements
datasets["hDBSCAN sample"] = hdbscan_example_data
def plot_unclustered(dataset_name):
    """Return a grey, unlabelled plot of the dataset registered under *dataset_name*.

    Data with at most two columns is drawn as a single scatter plot of the
    first two columns; data with more columns is drawn as a seaborn pairplot.
    The figure is closed in pyplot before it is returned.
    """
    points = datasets[dataset_name]

    if points.shape[1] <= 2:
        # Plain 2-D scatter.
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(points[:, 0], points[:, 1], color='gray', marker='.', alpha=.7)
        ax.set_xlabel("Feature 1")
        ax.set_ylabel("Feature 2")
        ax.grid(True)
    else:
        # More than two features: seaborn's pairplot works on a DataFrame.
        frame = pd.DataFrame(points)
        pair_grid = sns.pairplot(
            frame,
            plot_kws={'color': 'grey', 'alpha': 0.7},
            diag_kws={'color': 'grey'},
        )
        fig = pair_grid.fig

    plt.tight_layout()
    plt.close(fig)
    return fig
def _contiguous_labels(labels):
    """Map non-negative cluster ids onto 0..k-1; return ``(labels, k)``.

    Negative ids (noise) all become -1. Remapping matters because the plotting
    code iterates ``range(k)`` and indexes a k-colour palette, which is only
    correct when ids have no gaps (a mixture component that receives no points
    would otherwise leave one).
    """
    labels = np.asarray(labels)
    remapped = np.full(labels.shape, -1, dtype=int)
    assigned = labels >= 0
    if not assigned.any():
        return remapped, 0
    unique_ids, inverse = np.unique(labels[assigned], return_inverse=True)
    remapped[assigned] = inverse
    return remapped, len(unique_ids)


def _draw_cluster_pair_grid(X, labels, n_clusters, palette, point_colors, n_bins=10):
    """Draw an n x n grid: pairwise scatters off the diagonal, stacked per-cluster histograms on it."""
    n = X.shape[1]
    # One shared set of bin edges per feature so the cluster histograms stack.
    bin_edges = [np.linspace(X[:, k].min(), X[:, k].max(), n_bins + 1) for k in range(n)]

    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=(n * 2.3, n * 2.3))
    for i in range(n):
        for j in range(n):
            ax = axes[i, j]
            ax.grid(True, which='both', linestyle='--', linewidth=0.5)

            if i != j:
                ax.scatter(X[:, j], X[:, i], c=point_colors, alpha=0.8, marker='o', s=10)
            else:
                # Diagonal: stacked bars over bin indices 0..n_bins-1. Noise
                # points (label -1) are not part of any bar.
                bottom = np.zeros(n_bins)
                for cluster_id in range(n_clusters):
                    hist, _ = np.histogram(X[labels == cluster_id, i], bins=bin_edges[i])
                    ax.bar(range(n_bins), hist, width=1, align='center',
                           bottom=bottom, color=palette(cluster_id))
                    bottom = bottom + hist

            # Explicit axis lines at the bottom and left only.
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['bottom'].set_visible(True)
            ax.spines['left'].set_visible(True)

            # Tick labels only on the bottom row / first column, in a small size.
            if i < n - 1:
                ax.tick_params(labelbottom=False)
            else:
                ax.tick_params(axis='x', labelsize=8)
            if j > 0:
                ax.tick_params(labelleft=False)
            else:
                ax.tick_params(axis='y', labelsize=8)

            # Axis titles on the outer plots only; features are named by column index.
            if i == n - 1:
                ax.set_xlabel(str(j), rotation=0, fontsize=12)
            if j == 0:
                ax.set_ylabel(str(i), fontsize=12)
    return fig


def plot_clustered(dataset_name, clustering_method, kmeans_n_clusters, agg_n_clusters, agg_linkage, gmm_n_clusters, covariance_type,
                   genie_n_clusters, gini_threshold, M, hdbscan_min_cluster_size, hdbscan_min_samples):
    """Cluster the selected dataset and return a figure coloured by cluster.

    Args:
        dataset_name: Key into the module-level ``datasets`` registry.
        clustering_method: One of "K-Means", "Agglomerative",
            "Gaussian Mixture", "Genie" or "h-DBSCAN".
        kmeans_n_clusters: ``n_clusters`` for K-Means.
        agg_n_clusters, agg_linkage: ``n_clusters`` / ``linkage`` for
            agglomerative clustering.
        gmm_n_clusters, covariance_type: ``n_components`` /
            ``covariance_type`` for the Gaussian mixture.
        genie_n_clusters, gini_threshold, M: Parameters forwarded to
            ``genieclust.Genie``.
        hdbscan_min_cluster_size, hdbscan_min_samples: Parameters forwarded to
            ``hdbscan.HDBSCAN``.

    Returns:
        A matplotlib figure, already closed in pyplot: a single scatter plot
        for data with at most two columns, otherwise a pair grid with stacked
        per-cluster histograms on the diagonal. Points with a negative label
        are drawn in grey.

    Raises:
        ValueError: If *clustering_method* is not one of the supported names.
    """
    X = np.asarray(datasets[dataset_name])

    # Only the parameters belonging to the chosen method are used.
    if clustering_method == "K-Means":
        model = KMeans(n_clusters=kmeans_n_clusters)
        model.fit(X)
        labels = model.labels_
    elif clustering_method == "Agglomerative":
        model = AgglomerativeClustering(n_clusters=agg_n_clusters, linkage=agg_linkage)
        model.fit(X)
        labels = model.labels_
    elif clustering_method == "Gaussian Mixture":
        model = GaussianMixture(n_components=gmm_n_clusters, covariance_type=covariance_type)
        model.fit(X)
        labels = model.predict(X)  # mixtures expose labels through predict()
    elif clustering_method == "Genie":
        # NOTE(review): the UI slider for M moves in steps of 0.1; confirm that
        # genieclust accepts the fractional values it can produce.
        model = genieclust.Genie(n_clusters=genie_n_clusters, gini_threshold=gini_threshold, M=M)
        labels = model.fit_predict(X)
    elif clustering_method == "h-DBSCAN":
        clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size,
                                    min_samples=hdbscan_min_samples).fit(X)
        labels = clusterer.labels_
    else:
        # Previously an unknown name fell through and failed later on an
        # unbound `labels`.
        raise ValueError(f"Unknown clustering method: {clustering_method!r}")

    labels, n_clusters = _contiguous_labels(labels)

    if 0 < n_clusters <= 10:
        # Small palette: the colours at indices 0..n_clusters-1 of greenorange_12.
        palette = mcolors.ListedColormap(cmaps.greenorange_12(list(range(n_clusters))))
    else:
        # Many clusters (or none at all, in which case the palette is never indexed).
        palette = cmaps.cet_g_bw_minc

    noise_color = (0.5, 0.5, 0.5, 1.0)
    cluster_colors = [palette(label) if label >= 0 else noise_color for label in labels]

    if X.shape[1] > 2:
        fig = _draw_cluster_pair_grid(X, labels, n_clusters, palette, cluster_colors)
    else:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(X[:, 0], X[:, 1], c=cluster_colors, marker='.')
        ax.grid(True)

    plt.tight_layout()
    plt.close(fig)
    return fig
# Markdown shown at the top of the page.
intro_md = """
# Cluster-algorithm-explorer
_by [Max Noichl](https://homepage.univie.ac.at/maximilian.noichl/), for the clustering & data-visualization-workshop, Bremen, 2024_
Below you can test a number of clustering-algorithms on several easier and harder datasets.
"""

# Gradio UI: dataset and method selection with per-method parameter groups on
# the left, the unclustered and clustered plots on the right.
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    with gr.Column():
        gr.Markdown(intro_md)
    with gr.Row():
        with gr.Column():
            gr.Markdown("# Choose your dataset:")
            dataset_dropdown = gr.Dropdown(label="Select a dataset", choices=list(datasets.keys()), value="Blobs")
            gr.Markdown("# Choose your Clustering algorithm & Parameters:")
            clustering_method_dropdown = gr.Dropdown(
                label="Select a clustering method",
                choices=["K-Means", "Agglomerative", "Gaussian Mixture", "Genie", "h-DBSCAN"],
                value="K-Means",
            )

            # One parameter group per method; only the group of the selected
            # method is visible (K-Means initially).
            with gr.Group(visible=True) as kmeans_params_group:
                kmeans_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (K-Means)", value=4)

            with gr.Group(visible=False) as agglomerative_params_group:
                agg_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (Agglomerative)", value=4)
                agg_linkage_dropdown = gr.Dropdown(label="Linkage Type", choices=["ward", "complete", "average", "single"], value="ward")

            with gr.Group(visible=False) as gmm_params_group:
                gmm_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Components (GMM)", value=4)
                covariance_type_dropdown = gr.Dropdown(label="Covariance Type", choices=["full", "tied", "diag", "spherical"], value="full")

            with gr.Group(visible=False) as genie_params_group:
                genie_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (Genie)", value=4)
                # The Gini threshold is documented as a value in [0, 1]; the
                # slider previously went up to 1.05.
                gini_threshold_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.05, label="Gini Threshold (Genie)", value=.3)
                # NOTE(review): genieclust documents M as an integer smoothing
                # factor; this slider yields fractional values - confirm the
                # intended range.
                M_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="M Parameter (Genie)", value=1.0)

            with gr.Group(visible=False) as hdbscan_params_group:
                hdbscan_min_cluster_size = gr.Slider(minimum=2, maximum=200, step=1, label="Minimal Cluster Size (hDBSCAN)", value=10)
                hdbscan_min_samples = gr.Slider(minimum=2, maximum=200, step=1, label="Min. Samples (hDBSCAN)", value=10)

            def update_method_params(clustering_method):
                """Return visibility updates so only the selected method's group is shown."""
                return {
                    kmeans_params_group: gr.Group(visible=clustering_method == "K-Means"),
                    agglomerative_params_group: gr.Group(visible=clustering_method == "Agglomerative"),
                    gmm_params_group: gr.Group(visible=clustering_method == "Gaussian Mixture"),
                    genie_params_group: gr.Group(visible=clustering_method == "Genie"),
                    hdbscan_params_group: gr.Group(visible=clustering_method == "h-DBSCAN"),
                }

            clustering_method_dropdown.change(
                update_method_params,
                inputs=[clustering_method_dropdown],
                outputs=[kmeans_params_group, agglomerative_params_group,
                         gmm_params_group, genie_params_group, hdbscan_params_group],
            )
            button = gr.Button("Run Clustering!")

        with gr.Column():
            unclustered_plot_output = gr.Plot(label=None)
            clustered_plot_output = gr.Plot(label=None)

    # Redraw the raw data whenever the dataset changes, and once on page load.
    dataset_dropdown.change(plot_unclustered, inputs=[dataset_dropdown], outputs=[unclustered_plot_output])
    demo.load(plot_unclustered, inputs=[dataset_dropdown], outputs=[unclustered_plot_output])

    # The input order must match the positional parameters of plot_clustered.
    button.click(
        plot_clustered,
        inputs=[
            dataset_dropdown,
            clustering_method_dropdown,
            kmeans_n_clusters_slider,
            agg_n_clusters_slider,
            agg_linkage_dropdown,
            gmm_n_clusters_slider,
            covariance_type_dropdown,
            genie_n_clusters_slider,
            gini_threshold_slider,
            M_slider,
            hdbscan_min_cluster_size,
            hdbscan_min_samples,
        ],
        outputs=[clustered_plot_output],
    )

if __name__ == "__main__":
    demo.launch(debug=True)