|
from collections import defaultdict |
|
import streamlit as st |
|
from utils import load_and_preprocess_data |
|
import pandas as pd |
|
import numpy as np |
|
import altair as alt |
|
from sklearn.mixture import GaussianMixture |
|
import plotly.express as px |
|
import itertools |
|
from typing import Dict, List |
|
|
|
|
|
SIDEBAR_DESCRIPTION = """ |
|
# Client clustering |
|
|
|
To cluster a client, we adopt the RFM metrics. They stand for: |
|
|
|
- R = recency, that is the number of days since the last purchase |
|
in the store |
|
- F = frequency, that is the number of times a customer has ordered something |
|
- M = monetary value, that is how much a customer has spent buying |
|
from your business. |
|
|
|
Given these 3 metrics, we can cluster the customers and find a suitable |
|
"definition" based on the clusters they belong to. Since the dataset |
|
we're using right now has about 5000 distinct customers, we identify |
|
3 clusters for each metric. |
|
|
|
## How we compute the clusters |
|
|
|
We resort to a GaussianMixture algorithm. We can think of GaussianMixture |
|
as generalized k-means clustering that incorporates information about |
|
the covariance structure of the data as well as the centers of the clusters. |
|
""".lstrip() |
|
|
|
FREQUENCY_CLUSTERS_EXPLAIN = """ |
|
The **frequency** denotes how frequently a customer has ordered. |
|
|
|
There 3 available clusters for this metric: |
|
|
|
- cluster 0: denotes a customer that purchases one or few times (range [{}, {}]) |
|
- cluster 1: these customer have a discrete amount of orders (range [{}, {}]) |
|
- cluster 2: these customer purchases lots of times (range [{}, {}]) |
|
|
|
------- |
|
""".lstrip() |
|
|
|
RECENCY_CLUSTERS_EXPLAIN = """ |
|
The **recency** refers to how recently a customer has bought; |
|
|
|
There 3 available clusters for this metric: |
|
|
|
- cluster 0: the last order of these client is long time ago (range [{}, {}]) |
|
- cluster 1: these are clients that purchases something not very recently (range [{}, {}]) |
|
- cluster 2: the last order of these client is a few days/weeks ago (range [{}, {}]) |
|
|
|
------- |
|
""".lstrip() |
|
|
|
MONETARY_CLUSTERS_EXPLAIN = """ |
|
The **revenue** refers to how much a customer has spent buying |
|
from your business. |
|
|
|
There 3 available clusters for this metric: |
|
|
|
- cluster 0: these clients spent little money (range [{}, {}]) |
|
- cluster 1: these clients spent a considerable amount of money (range [{}, {}]) |
|
- cluster 2: these clients spent lots of money (range [{}, {}]) |
|
|
|
------- |
|
""".lstrip() |
|
|
|
EXPLANATION_DICT = { |
|
"Frequency_cluster": FREQUENCY_CLUSTERS_EXPLAIN, |
|
"Recency_cluster": RECENCY_CLUSTERS_EXPLAIN, |
|
"Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN, |
|
} |
|
|
|
|
|
def create_features(df: pd.DataFrame): |
|
"""Creates a new dataframe with the RFM features for each client.""" |
|
|
|
client_features = df.groupby("CustomerID")["InvoiceDate"].nunique().reset_index() |
|
client_features.columns = ["CustomerID", "Frequency"] |
|
|
|
|
|
client_takings = df.groupby("CustomerID")["Price"].sum() |
|
client_features["Revenue"] = client_takings.values |
|
|
|
|
|
max_date = df.groupby("CustomerID")["InvoiceDate"].max().reset_index() |
|
max_date.columns = ["CustomerID", "LastPurchaseDate"] |
|
|
|
client_features["Recency"] = ( |
|
max_date["LastPurchaseDate"].max() - max_date["LastPurchaseDate"] |
|
).dt.days |
|
|
|
return client_features |
|
|
|
|
|
@st.cache |
|
def cluster_clients(df: pd.DataFrame): |
|
"""Computes the RFM features and clusters for each user based on the RFM metrics.""" |
|
|
|
df_rfm = create_features(df) |
|
|
|
for to_cluster, order in zip( |
|
["Revenue", "Frequency", "Recency"], ["ascending", "ascending", "descending"] |
|
): |
|
kmeans = GaussianMixture(n_components=3, random_state=42) |
|
labels = kmeans.fit_predict(df_rfm[[to_cluster]]) |
|
df_rfm[f"{to_cluster}_cluster"] = _order_cluster(kmeans, labels, order) |
|
|
|
return df_rfm |
|
|
|
|
|
def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"): |
|
"""Orders the cluster by order.""" |
|
centroids = cluster_model.means_.sum(axis=1) |
|
|
|
if order.lower() == "descending": |
|
centroids *= -1 |
|
|
|
ascending_order = np.argsort(centroids) |
|
lookup_table = np.zeros_like(ascending_order) |
|
|
|
lookup_table[ascending_order] = np.arange(cluster_model.n_components) + 1 |
|
return lookup_table[clusters] |
|
|
|
|
|
def show_purhcase_history(user: int, df: pd.DataFrame): |
|
user_purchases = df.loc[df.CustomerID == user, ["Price", "InvoiceDate"]] |
|
expenses = user_purchases.groupby(user_purchases.InvoiceDate).sum() |
|
expenses.columns = ["Expenses"] |
|
expenses = expenses.reset_index() |
|
|
|
c = ( |
|
alt.Chart(expenses) |
|
.mark_line(point=True) |
|
.encode( |
|
x=alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date"), |
|
y="Expenses", |
|
) |
|
.properties(title="User expenses") |
|
) |
|
|
|
st.altair_chart(c) |
|
|
|
|
|
def show_user_info(user: int, df_rfm: pd.DataFrame): |
|
"""Prints some information about the user. |
|
|
|
The main information are the total expenses, how |
|
many times he purchases in the store, and the clusters |
|
he belongs to. |
|
""" |
|
|
|
user_row = df_rfm[df_rfm["CustomerID"] == user] |
|
if len(user_row) == 0: |
|
st.write(f"No user with id {user}") |
|
|
|
output = [] |
|
|
|
output.append(f"The user purchased **{user_row['Frequency'].squeeze()} times**.\n") |
|
output.append( |
|
f"She/he spent **{user_row['Revenue'].squeeze()} dollars** in total.\n" |
|
) |
|
output.append( |
|
f"The last time she/he bought something was **{user_row['Recency'].squeeze()} days ago**.\n" |
|
) |
|
output.append(f"She/he belongs to the clusters: ") |
|
for cluster in [column for column in user_row.columns if "_cluster" in column]: |
|
output.append(f"- {cluster} = {user_row[cluster].squeeze()}") |
|
|
|
st.write("\n".join(output)) |
|
|
|
return ( |
|
user_row["Recency_cluster"].squeeze(), |
|
user_row["Frequency_cluster"].squeeze(), |
|
user_row["Revenue_cluster"].squeeze(), |
|
) |
|
|
|
|
|
def explain_cluster(cluster_info): |
|
"""Displays a popup menu explinging the meanining of the clusters.""" |
|
|
|
with st.expander("Show information about the clusters"): |
|
st.write( |
|
"**Note**: these values are valid for these dataset." |
|
"Different dataset will have different number of clusters" |
|
" and values" |
|
) |
|
for cluster, info in cluster_info.items(): |
|
st.write(EXPLANATION_DICT[cluster].format(*info)) |
|
|
|
|
|
def categorize_user(recency_cluster, frequency_cluster, monetary_cluster): |
|
"""Describe the user with few words based on the cluster he belongs to.""" |
|
|
|
score = f"{recency_cluster}{frequency_cluster}{monetary_cluster}" |
|
|
|
|
|
|
|
description = "" |
|
|
|
if score == "111": |
|
description = "Tourist" |
|
elif score.startswith("2"): |
|
description = "Losing interest" |
|
elif score == "133": |
|
description = "Former lover" |
|
elif score == "123": |
|
description = "Former passionate client" |
|
elif score == "113": |
|
description = "Spent a lot, but never come back" |
|
elif score.startswith("1"): |
|
description = "About to dump" |
|
elif score == "313": |
|
description = "Potential lover" |
|
elif score == "312": |
|
description = "Interesting new client" |
|
elif score == "311": |
|
description = "New customer" |
|
elif score == "333": |
|
description = "Gold client" |
|
elif score == "322": |
|
description = "Lovers" |
|
else: |
|
description = "Average client" |
|
|
|
st.write(f"The customer can be described as: **{description}**") |
|
|
|
|
|
def plot_rfm_distribution(df_rfm: pd.DataFrame, cluster_info: Dict[str, List[int]]): |
|
"""Plots 3 histograms for the RFM metrics.""" |
|
|
|
for x in ("Revenue", "Frequency", "Recency"): |
|
fig = px.histogram( |
|
df_rfm, |
|
x=x, |
|
log_y=True, |
|
title=f"{x} metric", |
|
) |
|
|
|
|
|
values = cluster_info[f"{x}_cluster"] |
|
|
|
for n_cluster, i in enumerate(range(1, len(values)-1, 2)): |
|
fig.add_vline( |
|
x=values[i], |
|
annotation_text=f"End of cluster {n_cluster+1}", |
|
line_dash="dot", |
|
annotation=dict(textangle=90, font_color="red"), |
|
) |
|
|
|
fig.update_layout( |
|
yaxis_title="Count (log scale)", |
|
) |
|
|
|
st.plotly_chart(fig) |
|
|
|
|
|
def display_dataframe_heatmap(df_rfm: pd.DataFrame): |
|
"""Displays an heatmap of how many clients lay in the clusters. |
|
|
|
This method uses some black magic coming from the dataframe |
|
styling guide. |
|
""" |
|
|
|
|
|
|
|
|
|
count = ( |
|
df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[ |
|
"CustomerID" |
|
] |
|
.count() |
|
.reset_index() |
|
) |
|
count = count.rename(columns={"CustomerID": "Count"}) |
|
|
|
|
|
count = count.drop_duplicates( |
|
["Revenue_cluster", "Frequency_cluster", "Recency_cluster"] |
|
) |
|
|
|
|
|
count = count.pivot( |
|
index=["Revenue_cluster", "Frequency_cluster"], |
|
columns="Recency_cluster", |
|
values="Count", |
|
) |
|
|
|
|
|
cell_hover = { |
|
"selector": "td", |
|
"props": "font-size:1.5em", |
|
} |
|
index_names = { |
|
"selector": ".index_name", |
|
"props": "font-style: italic; color: Black; font-weight:normal;font-size:1.5em;", |
|
} |
|
headers = { |
|
"selector": "th:not(.index_name)", |
|
"props": "background-color: White; color: black; font-size:1.5em", |
|
} |
|
|
|
|
|
|
|
|
|
|
|
st.markdown("## Heatmap: how the client are distributed between clusters") |
|
st.write( |
|
count.style.format(thousands=" ", precision=0, na_rep="0") |
|
.set_table_styles([cell_hover, index_names, headers]) |
|
.background_gradient(cmap="coolwarm") |
|
.to_html(), |
|
unsafe_allow_html=True, |
|
) |
|
|
|
|
|
def main(): |
|
st.sidebar.markdown(SIDEBAR_DESCRIPTION) |
|
|
|
df, _, _ = load_and_preprocess_data() |
|
df_rfm = cluster_clients(df) |
|
|
|
st.markdown( |
|
"# Dataset " |
|
"\nThis is the processed dataset with information about the clients, such as" |
|
" the RFM values and the clusters they belong to." |
|
) |
|
st.dataframe(df_rfm.style.format(formatter={"Revenue": "{:.2f}"})) |
|
|
|
cluster_info_dict = defaultdict(list) |
|
|
|
with st.expander("Show more details about the clusters"): |
|
for cluster in [column for column in df_rfm.columns if "_cluster" in column]: |
|
st.write(cluster) |
|
cluster_info = ( |
|
df_rfm.groupby(cluster)[cluster.split("_")[0]] |
|
.describe() |
|
.reset_index(names="Cluster") |
|
) |
|
min_cluster = cluster_info["min"].astype(int) |
|
max_cluster = cluster_info["max"].astype(int) |
|
min_max_interlieved = list(itertools.chain(*zip(min_cluster, max_cluster))) |
|
cluster_info_dict[cluster].extend(min_max_interlieved) |
|
st.dataframe(cluster_info) |
|
|
|
st.markdown("## RFM metric distribution") |
|
|
|
plot_rfm_distribution(df_rfm, cluster_info_dict) |
|
|
|
display_dataframe_heatmap(df_rfm) |
|
|
|
st.markdown("## Interactive exploration") |
|
|
|
filter_by_cluster = st.checkbox( |
|
"Filter client: only one client per cluster type", |
|
value=True, |
|
) |
|
|
|
client_to_select = ( |
|
df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])["CustomerID"].first().values |
|
if filter_by_cluster |
|
else df["CustomerID"].unique() |
|
) |
|
|
|
|
|
user = st.selectbox( |
|
"Select a customer to show more information about him.", |
|
client_to_select, |
|
) |
|
|
|
show_purhcase_history(user, df) |
|
|
|
recency, frequency, revenue = show_user_info(user, df_rfm) |
|
|
|
categorize_user(recency, frequency, revenue) |
|
|
|
explain_cluster(cluster_info_dict) |
|
|
|
|
|
main() |
|
|