import streamlit as st
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
from mlxtend.frequent_patterns import apriori, association_rules

# Set the page configuration
st.set_page_config(page_title="Customer Segmentation and Product Recommendation", layout="wide")

# Title and Description
st.title("πŸ›’ Customer Segmentation & Product Recommendation App")
st.markdown("""
This application performs **Customer Segmentation** using RFM analysis and clustering,
and provides **Product Recommendations** based on purchase patterns.
Upload your dataset, analyze customer behavior, and visualize results interactively.
""")

# Sidebar for uploading data
st.sidebar.header("Upload Dataset")
uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=["csv"])

if uploaded_file:
    # Load data (invoice numbers and customer IDs are identifiers, so read them as strings)
    df = pd.read_csv(uploaded_file, encoding="ISO-8859-1", dtype={"CustomerID": str, "InvoiceNo": str})
    st.sidebar.success("Dataset uploaded successfully!")
else:
    st.sidebar.warning("Please upload a CSV file to start!")
    st.stop()

# Data Cleaning and Preprocessing
st.header("🧹 Data Cleaning and Preprocessing")

# Create 'Amount' column (revenue per line item)
df["Amount"] = df["Quantity"] * df["UnitPrice"]
st.markdown("### Initial Data Preview")
st.write(df.head())

# Keep UK transactions with positive quantities and a known CustomerID
df = df[df["Country"] == "United Kingdom"]
df = df[df["Quantity"] > 0]
df = df.dropna(subset=["CustomerID"]).copy()
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df["date"] = df["InvoiceDate"].dt.date

# Cleaned data preview
st.markdown("### Cleaned Data Overview")
st.write(df.describe())

# Summary Statistics
st.subheader("πŸ“Š Summary Statistics")
metrics = {
    "Number of Invoices": df["InvoiceNo"].nunique(),
    "Number of Products Bought": df["StockCode"].nunique(),
    "Number of Customers": df["CustomerID"].nunique(),
    "Average Quantity per Customer": round(df.groupby("CustomerID").Quantity.sum().mean(), 0),
    "Average Revenue per Customer (Β£)": round(df.groupby("CustomerID").Amount.sum().mean(), 2),
}
st.write(pd.DataFrame(list(metrics.items()), columns=["Metric", "Value"]))

# Monthly Transactions Analysis
st.subheader("πŸ“… Monthly Transactions Analysis")
df["month"] = df["InvoiceDate"].dt.month
monthly_counts = df.groupby("month").size()

# Plot using Plotly
fig_monthly = px.bar(
    x=monthly_counts.index,
    y=monthly_counts.values,
    labels={"x": "Month", "y": "Transactions"},
    title="Transactions Per Month",
)
st.plotly_chart(fig_monthly)

# RFM Analysis
st.header("πŸ“ˆ RFM Analysis")

# Recency: days since each customer's last purchase, measured from a fixed snapshot date
# (2011-12-09 is the last invoice date in the UCI Online Retail dataset; for other data,
# df["InvoiceDate"].max() is a natural choice)
now = pd.Timestamp("2011-12-09")
recency_df = df.groupby("CustomerID")["date"].max().reset_index()
recency_df["Recency"] = (now - pd.to_datetime(recency_df["date"])).dt.days

# Frequency: number of distinct invoices per customer
frequency_df = df.groupby("CustomerID")["InvoiceNo"].nunique().reset_index()
frequency_df.rename(columns={"InvoiceNo": "Frequency"}, inplace=True)

# Monetary: total revenue per customer
monetary_df = df.groupby("CustomerID")["Amount"].sum().reset_index()
monetary_df.rename(columns={"Amount": "Monetary"}, inplace=True)

# Combine RFM
rfm = recency_df.merge(frequency_df, on="CustomerID").merge(monetary_df, on="CustomerID")
st.write("### RFM Data")
st.write(rfm.head())

# Visualize RFM Distributions
fig_rfm = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Monetary",
    size="Monetary",
    title="RFM Scatter Plot",
)
st.plotly_chart(fig_rfm)

# K-Means Clustering
st.header("πŸ“ K-Means Clustering")
st.sidebar.subheader("Clustering Parameters")
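# Optional refinement (sketch): Recency, Frequency, and Monetary sit on very different
# scales, and K-Means is distance-based, so standardizing the features before clustering
# (e.g. with sklearn's StandardScaler) usually gives more balanced clusters:
#
#     from sklearn.preprocessing import StandardScaler
#     rfm_scaled = StandardScaler().fit_transform(rfm[["Recency", "Frequency", "Monetary"]])
#     rfm["Cluster"] = KMeans(n_clusters=num_clusters, random_state=42).fit_predict(rfm_scaled)
#
# The clustering below uses the raw RFM values.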
num_clusters = st.sidebar.slider("Number of Clusters", 2, 10, value=4)
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm[["Recency", "Frequency", "Monetary"]])

# Cluster Visualization
fig_cluster = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    symbol="Cluster",
    size="Monetary",
    title=f"Customer Segmentation with {num_clusters} Clusters",
)
st.plotly_chart(fig_cluster)

# Product Recommendation
st.header("πŸ›οΈ Product Recommendation")
st.sidebar.subheader("Recommendation Parameters")
cluster_to_recommend = st.sidebar.selectbox("Select Cluster", rfm["Cluster"].unique())

# Filter data by cluster
customers_in_cluster = rfm[rfm["Cluster"] == cluster_to_recommend]["CustomerID"]
df_cluster = df[df["CustomerID"].isin(customers_in_cluster)]

# Association Rule Mining for Recommendations
# Build an invoice x product basket matrix: True if the item appears on the invoice
basket = (
    df_cluster.groupby(["InvoiceNo", "Description"])["Quantity"]
    .sum()
    .unstack()
    .fillna(0)
    .gt(0)
)

# Generate frequent itemsets
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)

# Generate association rules (empty frame if no frequent itemsets were found)
if not frequent_itemsets.empty:
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
else:
    rules = pd.DataFrame()

# Display top recommendations
st.write(f"### Recommendations for Cluster {cluster_to_recommend}")
if not rules.empty:
    top_recommendations = rules.sort_values(by="confidence", ascending=False).head(10)
    st.write(top_recommendations[["antecedents", "consequents", "support", "confidence", "lift"]])
else:
    st.write("No significant patterns found for this cluster.")

# Export Data
st.header("πŸ“€ Export Processed Data")
if st.button("Export RFM Data"):
    rfm.to_csv("rfm_data.csv", index=False)  # written to the app's working directory
    st.success("RFM data exported as `rfm_data.csv`!")

st.markdown("### Enjoy exploring your customer data! πŸš€")
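# Optional (sketch): instead of writing rfm_data.csv to the server's working directory,
# the file can be offered as a browser download with st.download_button, e.g.:
#
#     st.download_button(
#         label="Download RFM Data",
#         data=rfm.to_csv(index=False),
#         file_name="rfm_data.csv",
#         mime="text/csv",
#     )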