"""Streamlit app: RFM customer segmentation and Word2Vec product recommendations.

The script runs top to bottom on every Streamlit rerun:

1. Upload a transactions CSV (one row per invoice line).
2. Clean it: keep United Kingdom rows with a positive quantity and a
   customer id, and derive an ``Amount`` column.
3. Show summary statistics and a transactions-per-month bar chart.
4. Build a Recency / Frequency / Monetary (RFM) table per customer.
5. Cluster customers on the RFM values with K-Means and visualise the result.
6. Train a Word2Vec model on invoice baskets and recommend similar products,
   globally and per cluster.
7. Project the product embeddings to 2-D with PCA and offer a CSV export.

NOTE: several headings originally carried emoji that had been damaged by a
bad re-encoding. The ones that could be recovered from the surviving bytes
are restored; the unrecoverable fragments (a stray Greek "pi") are dropped.
"""

import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Columns the rest of the script reads from the uploaded CSV.
REQUIRED_COLUMNS = (
    "InvoiceNo",
    "StockCode",
    "Description",
    "Quantity",
    "InvoiceDate",
    "UnitPrice",
    "CustomerID",
    "Country",
)

# Features used for clustering and for the per-cluster heatmap.
RFM_FEATURES = ["Recency", "Frequency", "Monetary"]

# Set the page configuration (must be the first Streamlit call).
st.set_page_config(
    page_title="Customer Segmentation and Product Recommendation",
    layout="wide",
)

# Title and description
st.title("Customer Segmentation & Product Recommendation App")
st.markdown("""
This application performs **Customer Segmentation** using RFM analysis and clustering,
and provides **Product Recommendations** based on purchase patterns.
Upload your dataset, analyze customer behavior, and visualize results interactively.
""")

# ---------------------------------------------------------------------------
# Sidebar: dataset upload
# ---------------------------------------------------------------------------
st.sidebar.header("Upload Dataset")
uploaded_file = st.sidebar.file_uploader("Choose a CSV file", type=["csv"])

if uploaded_file is None:
    st.sidebar.warning("Please upload a CSV file to start!")
    st.stop()

# Identifiers are read as strings so they are never coerced to floats.
df = pd.read_csv(
    uploaded_file,
    encoding="ISO-8859-1",
    dtype={"CustomerID": str, "InvoiceNo": str},
)
st.sidebar.success("Dataset uploaded successfully!")

# Fail with a readable message instead of a KeyError traceback further down.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    st.error(
        "The uploaded file is missing required column(s): "
        + ", ".join(missing_columns)
    )
    st.stop()

# ---------------------------------------------------------------------------
# Data cleaning and preprocessing
# ---------------------------------------------------------------------------
st.header("🧹 Data Cleaning and Preprocessing")

# Create 'Amount' column (line total).
df["Amount"] = df["Quantity"] * df["UnitPrice"]

st.markdown("### Initial Data Preview")
st.write(df.head())

# Keep UK rows with a positive quantity and a known customer. A single mask
# plus an explicit copy avoids mutating a slice of the uploaded frame.
keep_rows = (
    (df["Country"] == "United Kingdom")
    & (df["Quantity"] > 0)
    & df["CustomerID"].notna()
)
df = df[keep_rows].copy()

if df.empty:
    st.error(
        "No rows left after cleaning (United Kingdom, positive quantity, "
        "known customer). Please upload a different dataset."
    )
    st.stop()

df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df["date"] = df["InvoiceDate"].dt.date

# Cleaned data preview
st.markdown("### Cleaned Data Overview")
st.write(df.describe())

# ---------------------------------------------------------------------------
# Summary statistics
# ---------------------------------------------------------------------------
st.subheader("Summary Statistics")
per_customer = df.groupby("CustomerID")
metrics = {
    "Number of Invoices": df["InvoiceNo"].nunique(),
    "Number of Products Bought": df["StockCode"].nunique(),
    "Number of Customers": df["CustomerID"].nunique(),
    "Average Quantity per Customer": round(per_customer["Quantity"].sum().mean(), 0),
    "Average Revenue per Customer (£)": round(per_customer["Amount"].sum().mean(), 2),
}
st.write(pd.DataFrame(list(metrics.items()), columns=["Metric", "Value"]))

# ---------------------------------------------------------------------------
# Monthly transactions analysis
# ---------------------------------------------------------------------------
st.subheader("Monthly Transactions Analysis")
# NOTE: grouping is by calendar month only, so the same month from different
# years is pooled into one bar.
df["month"] = df["InvoiceDate"].dt.month
monthly_counts = df.groupby("month").size().reset_index(name="Transactions")

# Explicit column names keep the axis labels independent of how Plotly names
# anonymous array arguments.
fig_monthly = px.bar(
    monthly_counts,
    x="month",
    y="Transactions",
    labels={"month": "Month"},
    title="Transactions Per Month",
)
st.plotly_chart(fig_monthly)

# ---------------------------------------------------------------------------
# RFM analysis
# ---------------------------------------------------------------------------
st.header("RFM Analysis")

# Recency: days between each customer's last purchase and the most recent
# invoice date in the data. Deriving the reference date from the data keeps
# recency non-negative for any upload.
reference_date = df["InvoiceDate"].max().normalize()
recency_df = df.groupby("CustomerID")["date"].max().reset_index()
recency_df["Recency"] = (reference_date - pd.to_datetime(recency_df["date"])).dt.days

# Frequency: number of distinct invoices per customer.
frequency_df = df.groupby("CustomerID")["InvoiceNo"].nunique().reset_index(name="Frequency")

# Monetary: total amount spent per customer.
monetary_df = df.groupby("CustomerID")["Amount"].sum().reset_index(name="Monetary")

# Combine RFM
rfm = recency_df.merge(frequency_df, on="CustomerID").merge(monetary_df, on="CustomerID")

st.write("### RFM Data")
st.write(rfm.head())

# Visualize RFM distributions
fig_rfm = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Monetary",
    size="Monetary",
    title="RFM Scatter Plot",
)
st.plotly_chart(fig_rfm)

# ---------------------------------------------------------------------------
# K-Means clustering
# ---------------------------------------------------------------------------
st.header("K-Means Clustering")
st.sidebar.subheader("Clustering Parameters")
num_clusters = st.sidebar.slider("Number of Clusters", 2, 10, value=4)

# K-Means cannot form more clusters than there are customers.
if len(rfm) < num_clusters:
    st.warning(
        f"Only {len(rfm)} customer(s) available; choose at most that many clusters."
    )
    st.stop()

# NOTE(review): the RFM features are clustered unscaled, so the feature with
# the largest numeric range dominates the distance metric. Consider scaling.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm[RFM_FEATURES])

# Cluster visualization
fig_cluster = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    title=f"Customer Segmentation with {num_clusters} Clusters",
    symbol="Cluster",
    size="Monetary",
)
st.plotly_chart(fig_cluster)

# ---------------------------------------------------------------------------
# Enhanced RFM analysis
# ---------------------------------------------------------------------------
st.header("Enhanced RFM Analysis")

# Heatmap of the mean R, F and M value of every cluster.
heatmap_data = rfm[RFM_FEATURES + ["Cluster"]].groupby("Cluster").mean()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap="coolwarm", cbar=True, ax=ax)
ax.set_title("Average RFM Values per Cluster", fontsize=16)
st.pyplot(fig)
# Release the figure; otherwise pyplot keeps one more alive on every rerun.
plt.close(fig)

# Animated RFM scatter: one animation frame per cluster.
st.subheader("Animated RFM Scatter Plot")
fig_rfm_animated = px.scatter_3d(
    rfm,
    x="Recency",
    y="Frequency",
    z="Monetary",
    color="Cluster",
    animation_frame="Cluster",
    title="RFM Clusters Over Time",
    size="Monetary",
)
st.plotly_chart(fig_rfm_animated)

# ---------------------------------------------------------------------------
# Product recommendation
# ---------------------------------------------------------------------------
st.header("🎯 Product Recommendations")

# Rows without a description cannot be used as Word2Vec tokens or offered in
# the product picker.
product_df = df.dropna(subset=["Description"])
if product_df.empty:
    st.warning("No product descriptions available for recommendations.")
    st.stop()

# Train Word2Vec model: each invoice is one "sentence" of product descriptions.
st.subheader("Train Word2Vec Model")
with st.spinner("Training Word2Vec model..."):
    invoices = product_df.groupby("InvoiceNo")["Description"].apply(list).tolist()
    model = Word2Vec(
        sentences=invoices,
        vector_size=50,
        window=5,
        min_count=1,
        workers=4,
        sg=1,
    )
st.success("Word2Vec model trained successfully!")

# Display similar products
st.subheader("Find Similar Products")
selected_product = st.selectbox(
    "Select a product to find recommendations:",
    product_df["Description"].unique(),
)

if st.button("Recommend Products for Customers"):
    try:
        similar_products = model.wv.most_similar(selected_product, topn=5)
    except KeyError:
        st.warning("The selected product is not in the vocabulary. Please choose another.")
    else:
        st.write("### Recommended Products")
        for product, similarity in similar_products:
            st.write(f"- **{product}** (Similarity: {similarity:.2f})")

# Recommendations for cluster-based segmentation
st.subheader("Recommendations by Cluster")
cluster_to_recommend = st.selectbox("Select a cluster:", rfm["Cluster"].unique())

if st.button("Recommend for Cluster"):
    # Restrict the baskets to invoices of customers in the chosen cluster.
    cluster_customers = rfm.loc[rfm["Cluster"] == cluster_to_recommend, "CustomerID"]
    cluster_df = product_df[product_df["CustomerID"].isin(cluster_customers)]
    cluster_invoices = cluster_df.groupby("InvoiceNo")["Description"].apply(list).tolist()

    with st.spinner("Training cluster-specific Word2Vec model..."):
        cluster_model = Word2Vec(
            sentences=cluster_invoices,
            vector_size=50,
            window=5,
            min_count=1,
            workers=4,
            sg=1,
        )

    try:
        cluster_similar_products = cluster_model.wv.most_similar(selected_product, topn=5)
    except KeyError:
        st.warning("The selected product is not in the vocabulary for this cluster.")
    else:
        st.write(f"### Recommended Products for Cluster {cluster_to_recommend}")
        for product, similarity in cluster_similar_products:
            st.write(f"- **{product}** (Similarity: {similarity:.2f})")

# ---------------------------------------------------------------------------
# PCA to visualize Word2Vec embeddings
# ---------------------------------------------------------------------------
st.subheader("Word2Vec Embedding Visualization")
# index_to_key is the ordered vocabulary list, so names and vectors line up.
product_names = list(model.wv.index_to_key)

if len(product_names) < 2:
    st.info("At least two products are needed to plot the embeddings.")
else:
    vectors = model.wv[product_names]
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(vectors)

    # Create DataFrame for visualization
    embedding_df = pd.DataFrame(pca_result, columns=["PCA1", "PCA2"])
    embedding_df["Product"] = product_names

    # Interactive plot
    fig_embed = px.scatter(
        embedding_df,
        x="PCA1",
        y="PCA2",
        hover_data=["Product"],
        title="Word2Vec Product Embeddings",
        template="plotly_dark",
    )
    st.plotly_chart(fig_embed)

# ---------------------------------------------------------------------------
# Export data
# ---------------------------------------------------------------------------
st.header("📤 Export Processed Data")
# NOTE: the file is written to the working directory of the Streamlit server
# process, not downloaded to the browser.
if st.button("Export RFM Data"):
    rfm.to_csv("rfm_data.csv", index=False)
    st.success("RFM data exported as `rfm_data.csv`!")

st.markdown("### Enjoy exploring your customer data!")