import streamlit as st
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from gensim.models import Word2Vec
# Set the page configuration
st.set_page_config(page_title="Customer Segmentation and Product Recommendation", layout="wide")

# Title and Description
st.title("Customer Segmentation & Product Recommendation App")
st.markdown("""
This application performs **Customer Segmentation** using RFM analysis and clustering,
and provides **Product Recommendations** based on purchase patterns.
Upload your dataset, analyze customer behavior, and visualize results interactively.
""")

# Sidebar for uploading data
st.sidebar.header("Upload Dataset")
uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=["csv"])
if uploaded_file:
    # Load data (read IDs as strings to avoid numeric coercion)
    df = pd.read_csv(uploaded_file, encoding="ISO-8859-1", dtype={'CustomerID': str, 'InvoiceNo': str})
    st.sidebar.success("Dataset uploaded successfully!")
else:
    st.sidebar.warning("Please upload a CSV file to start!")
    st.stop()
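# Optional caching sketch (assumes Streamlit >= 1.18, where st.cache_data is available):
# wrapping the CSV read in a cached function avoids re-parsing the file on every
# widget interaction.
#
# @st.cache_data
# def load_data(file):
#     return pd.read_csv(file, encoding="ISO-8859-1",
#                        dtype={"CustomerID": str, "InvoiceNo": str})
#
# df = load_data(uploaded_file)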
# Data Cleaning and Preprocessing
st.header("Data Cleaning and Preprocessing")

# Create 'Amount' column
df["Amount"] = df["Quantity"] * df["UnitPrice"]
st.markdown("### Initial Data Preview")
st.write(df.head())

# Keep UK customers, positive quantities, and rows with a known CustomerID
df = df[df["Country"] == "United Kingdom"]
df = df[df["Quantity"] > 0]
df.dropna(subset=['CustomerID'], inplace=True)
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df["date"] = df["InvoiceDate"].dt.date

# Cleaned data preview
st.markdown("### Cleaned Data Overview")
st.write(df.describe())
# Summary Statistics
st.subheader("Summary Statistics")
metrics = {
    "Number of Invoices": df['InvoiceNo'].nunique(),
    "Number of Products Bought": df['StockCode'].nunique(),
    "Number of Customers": df['CustomerID'].nunique(),
    "Average Quantity per Customer": round(df.groupby("CustomerID").Quantity.sum().mean(), 0),
    "Average Revenue per Customer (£)": round(df.groupby("CustomerID").Amount.sum().mean(), 2),
}
st.write(pd.DataFrame(metrics.items(), columns=["Metric", "Value"]))
# Monthly Transactions Analysis
st.subheader("Monthly Transactions Analysis")
df['month'] = df['InvoiceDate'].dt.month
monthly_counts = df.groupby('month').size()

# Plot using Plotly
fig_monthly = px.bar(
    monthly_counts,
    x=monthly_counts.index,
    y=monthly_counts.values,
    labels={"x": "Month", "y": "Transactions"},
    title="Transactions Per Month"
)
st.plotly_chart(fig_monthly)
# RFM Analysis
st.header("RFM Analysis")

# Recency Calculation (days since each customer's last purchase, relative to a fixed snapshot date)
now = pd.Timestamp("2011-12-09")
recency_df = df.groupby("CustomerID")["date"].max().reset_index()
recency_df["Recency"] = (now - pd.to_datetime(recency_df["date"])).dt.days

# Frequency Calculation (number of distinct invoices per customer)
frequency_df = df.groupby("CustomerID")["InvoiceNo"].nunique().reset_index()
frequency_df.rename(columns={"InvoiceNo": "Frequency"}, inplace=True)

# Monetary Calculation (total spend per customer)
monetary_df = df.groupby("CustomerID")["Amount"].sum().reset_index()
monetary_df.rename(columns={"Amount": "Monetary"}, inplace=True)

# Combine RFM
rfm = recency_df.merge(frequency_df, on="CustomerID").merge(monetary_df, on="CustomerID")
st.write("### RFM Data")
st.write(rfm.head())

# Visualize RFM Distributions
fig_rfm = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Monetary",
    size="Monetary",
    title="RFM Scatter Plot"
)
st.plotly_chart(fig_rfm)
# K-Means Clustering
st.header("K-Means Clustering")
st.sidebar.subheader("Clustering Parameters")
num_clusters = st.sidebar.slider("Number of Clusters", 2, 10, value=4)
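# Optional sketch for choosing the cluster count: compute the K-Means inertia for a
# range of k values and look for the "elbow" where the curve flattens (uses the same
# RFM features that are clustered below).
#
# inertias = {k: KMeans(n_clusters=k, random_state=42)
#             .fit(rfm[["Recency", "Frequency", "Monetary"]]).inertia_
#             for k in range(2, 11)}
# st.line_chart(pd.Series(inertias, name="Inertia"))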
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm[["Recency", "Frequency", "Monetary"]])
# Cluster Visualization
fig_cluster = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    title=f"Customer Segmentation with {num_clusters} Clusters",
    symbol="Cluster",
    size="Monetary",
)
st.plotly_chart(fig_cluster)
# Enhanced RFM Analysis
st.header("Enhanced RFM Analysis")

# RFM heatmap: average Recency, Frequency and Monetary per cluster
heatmap_data = rfm[["Recency", "Frequency", "Monetary", "Cluster"]].groupby("Cluster").mean()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Average RFM Values per Cluster", fontsize=16)
st.pyplot(fig)
# Animated RFM Scatter
st.subheader("Animated RFM Scatter Plot")
fig_rfm_animated = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    animation_frame="Cluster",  # Step through the clusters frame by frame
    title="RFM Scatter Animated by Cluster",
    size="Monetary",
)
st.plotly_chart(fig_rfm_animated)
# Product Recommendation
st.header("Product Recommendations")
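# Approach: each invoice is treated as a "sentence" and each product description as a
# "token", so Word2Vec learns embeddings in which products frequently bought together
# in the same basket end up close to one another (an item2vec-style model).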
# Train Word2Vec Model
st.subheader("Train Word2Vec Model")
with st.spinner("Training Word2Vec model..."):
    invoices = df.groupby("InvoiceNo")["Description"].apply(list)  # Group products by invoice
    model = Word2Vec(sentences=invoices, vector_size=50, window=5, min_count=1, workers=4, sg=1)
st.success("Word2Vec model trained successfully!")

# Display similar products
st.subheader("Find Similar Products")
selected_product = st.selectbox("Select a product to find recommendations:", df["Description"].unique())
if st.button("Recommend Products for Customers"): | |
try: | |
similar_products = model.wv.most_similar(selected_product, topn=5) # Top 5 recommendations | |
st.write("### Recommended Products") | |
for product, similarity in similar_products: | |
st.write(f"- **{product}** (Similarity: {similarity:.2f})") | |
except KeyError: | |
st.warning("The selected product is not in the vocabulary. Please choose another.") | |
# Recommendations for Cluster-Based Segmentation
st.subheader("Recommendations by Cluster")
cluster_to_recommend = st.selectbox("Select a cluster:", rfm["Cluster"].unique())
if st.button("Recommend for Cluster"):
    cluster_customers = rfm[rfm["Cluster"] == cluster_to_recommend]["CustomerID"]
    cluster_df = df[df["CustomerID"].isin(cluster_customers)]
    cluster_invoices = cluster_df.groupby("InvoiceNo")["Description"].apply(list)
    with st.spinner("Training cluster-specific Word2Vec model..."):
        cluster_model = Word2Vec(sentences=cluster_invoices, vector_size=50, window=5, min_count=1, workers=4, sg=1)
    try:
        cluster_similar_products = cluster_model.wv.most_similar(selected_product, topn=5)
        st.write(f"### Recommended Products for Cluster {cluster_to_recommend}")
        for product, similarity in cluster_similar_products:
            st.write(f"- **{product}** (Similarity: {similarity:.2f})")
    except KeyError:
        st.warning("The selected product is not in the vocabulary for this cluster.")
# PCA to visualize Word2Vec embeddings
st.subheader("Word2Vec Embedding Visualization")
vectors = model.wv[model.wv.index_to_key]  # Product vectors, in vocabulary order
pca = PCA(n_components=2)
pca_result = pca.fit_transform(vectors)

# Create DataFrame for visualization
embedding_df = pd.DataFrame(pca_result, columns=["PCA1", "PCA2"])
embedding_df["Product"] = model.wv.index_to_key
# Interactive Plot
fig_embed = px.scatter(
    embedding_df,
    x="PCA1",
    y="PCA2",
    hover_data=["Product"],
    title="Word2Vec Product Embeddings",
    template="plotly_dark",
)
st.plotly_chart(fig_embed)
# Export Data
st.header("Export Processed Data")
if st.button("Export RFM Data"):
    rfm.to_csv("rfm_data.csv", index=False)  # Written to the app's working directory on the server
    st.success("RFM data exported as `rfm_data.csv`!")
st.markdown("### Enjoy exploring your customer data! π") | |