In [2]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv(r"D:\Customer Segmentation\retail_sales.csv")
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


Data Cleaning

In [5]:
def report(df):
    col = []
    d_type = []
    uniques = []
    n_uniques = []
    missing_values = []
    mean_of_missing = []
    
    for i in df.columns:
        col.append(i)
        d_type.append(df[i].dtypes)
        uniques.append(df[i].unique()[:5])
        n_uniques.append(df[i].nunique())
        missing_values.append(df[i].isna().sum())
        mean_of_missing.append(df[i].isna().sum()/len(df))
    
    return pd.DataFrame({'Column': col, 'dtype': d_type, 'unique sample': uniques, 'n uniques': n_uniques, 'num of missing': missing_values, 'mean of missing': mean_of_missing })


report(df)

Unnamed: 0,Column,dtype,unique sample,n uniques,num of missing,mean of missing
0,InvoiceNo,object,"[536365, 536366, 536367, 536368, 536369]",25900,0,0.0
1,StockCode,object,"[85123A, 71053, 84406B, 84029G, 84029E]",4070,0,0.0
2,Description,object,"[WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET...",4223,1454,0.002683
3,Quantity,int64,"[6, 8, 2, 32, 3]",722,0,0.0
4,InvoiceDate,object,"[2010-12-01 08:26:00, 2010-12-01 08:28:00, 201...",23260,0,0.0
5,UnitPrice,float64,"[2.55, 3.39, 2.75, 7.65, 4.25]",1630,0,0.0
6,CustomerID,float64,"[17850.0, 13047.0, 12583.0, 13748.0, 15100.0]",4372,135080,0.249267
7,Country,object,"[United Kingdom, France, Australia, Netherland...",38,0,0.0


In [6]:
df.dropna(inplace=True)

In [7]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,406829.0,406829.0,406829.0
mean,12.061303,3.460471,15287.69057
std,248.69337,69.315162,1713.600303
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13953.0
50%,5.0,1.95,15152.0
75%,12.0,3.75,16791.0
max,80995.0,38970.0,18287.0


Remove Negative Values

In [8]:
df = df[df['Quantity'] > 0]

In [9]:
df.shape

(397924, 8)

EDA

Top Products by Quantity Sold

In [11]:
TopProducts= df.pivot_table(
    index=['StockCode','Description'],
    values='Quantity',
    aggfunc='sum').sort_values(
    by='Quantity', ascending=False)

TopProducts.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
StockCode,Description,Unnamed: 2_level_1
23843,"PAPER CRAFT , LITTLE BIRDIE",80995
23166,MEDIUM CERAMIC TOP STORAGE JAR,77916
84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,54415
85099B,JUMBO BAG RED RETROSPOT,46181
85123A,WHITE HANGING HEART T-LIGHT HOLDER,36725
84879,ASSORTED COLOUR BIRD ORNAMENT,35362
21212,PACK OF 72 RETROSPOT CAKE CASES,33693
22197,POPCORN HOLDER,30931
23084,RABBIT NIGHT LIGHT,27202
22492,MINI PAINT SET VINTAGE,26076


In [12]:
TopProducts.reset_index(inplace=True)

px.bar(TopProducts.head(10), y='Description', x='Quantity',
    orientation='h',
    title='Top 10 Products by Quantity Sold')

The product with the highest quantity sold is "PAPER CRAFT, LITTLE BIRDIE," with approximately 80,000 units.

Let’s check out the number of unique customers:

In [13]:
customers = df["CustomerID"].unique().tolist()
len(customers)

4339

Top Products by Number of Customers

In [14]:
CustomersBoughts = df.pivot_table(index=['StockCode','Description'],
                                values='CustomerID',
                                aggfunc=lambda x: len(x.unique())).sort_values(by='CustomerID', ascending=False)

In [15]:
CustomersBoughts.head(10)


Unnamed: 0_level_0,Unnamed: 1_level_0,CustomerID
StockCode,Description,Unnamed: 2_level_1
22423,REGENCY CAKESTAND 3 TIER,881
85123A,WHITE HANGING HEART T-LIGHT HOLDER,856
47566,PARTY BUNTING,708
84879,ASSORTED COLOUR BIRD ORNAMENT,678
22720,SET OF 3 CAKE TINS PANTRY DESIGN,640
21212,PACK OF 72 RETROSPOT CAKE CASES,635
85099B,JUMBO BAG RED RETROSPOT,635
22086,PAPER CHAIN KIT 50'S CHRISTMAS,613
22457,NATURAL SLATE HEART CHALKBOARD,587
22138,BAKING SET 9 PIECE RETROSPOT,581


Top 10 products by number of customers

In [16]:
CustomersBoughts.reset_index(inplace=True)

px.bar(CustomersBoughts.head(10), y='Description', x='CustomerID',
    orientation='h',
    title='Top 10 Products by Number of Customers')

Prepare Data For Modelling

Splitting Data::::
    We will use 90% data of the customers as a training dataset to create word2vec embeddings.

In [17]:
random.shuffle(customers)

# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

# split data into train and validation set
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]

Creating Sequence of Purchases for training dataset::::

In [18]:
purchases_train = []

for i in tqdm(customers_train):
    temp = train_df[train_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_train.append(temp)

100%|██████████| 3905/3905 [00:01<00:00, 1954.11it/s]


In [19]:
purchases_val = []

for i in tqdm(validation_df['CustomerID'].unique()):
    temp = validation_df[validation_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_val.append(temp)

100%|██████████| 434/434 [00:00<00:00, 2451.86it/s]


Building a Recommendation System

Building word2vec Embeddings for products

In [20]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB 2.8 MB/s eta 0:00:09
    --------------------------------------- 0.4/24.0 MB 3.9 MB/s eta 0:00:07
    --------------------------------------- 0.6/24.0 MB 4.1 MB/s eta 0:00:06
   - -------------------------------------- 0.8/24.0 MB 4.2 MB/s eta 0:00:06
   - -------------------------------------- 1.0/24.0 MB 4.1 MB/s eta 0:00:06
   -- ------------------------------------- 1.3/24.0 MB 4.4 MB/s eta 0:00:06
   -- ------------------------------------- 1.5/24.0 MB 4.7 MB/s eta 0:00:05
   -- ------------------------------------- 1.7/24.0 MB 4.8 MB/s eta 0:00:05
   --- ------------------------------------ 1.9/24.0 MB 4.6 MB/s eta 0:00:05
   --- ------------------------------------ 2.1/24.0 MB 4.5 MB/s eta 0:00:05
   --- ---


[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from gensim.models import Word2Vec

The parameters i will use:/n/n

window = 15: Defines the maximum distance between the current and predicted word within a sentence./n
sg = 1: Means the model will use the Skip-gram approach/n
hs = 0: Indicates that hierarchical softmax is not used because there arn't large vocabularies./n
negative=10: Sets the number of negative samples to 10./n
alpha=0.03: Set learning rate for the process to 0.03./n
min_alpha=0.0007: Sets the minimum learning rate to 0.0007./n