{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tqdm in c:\\python3.11.1\\lib\\site-packages (4.66.2)\n",
"Requirement already satisfied: colorama in c:\\python3.11.1\\lib\\site-packages (from tqdm) (0.4.6)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 24.1.2 -> 24.3.1\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [
"# Explicit %pip magic: installs into the running kernel's environment and does not rely on automagic\n",
"%pip install tqdm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import random\n",
"from tqdm import tqdm\n",
"import plotly.express as px\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" InvoiceNo | \n",
" StockCode | \n",
" Description | \n",
" Quantity | \n",
" InvoiceDate | \n",
" UnitPrice | \n",
" CustomerID | \n",
" Country | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 536365 | \n",
" 85123A | \n",
" WHITE HANGING HEART T-LIGHT HOLDER | \n",
" 6 | \n",
" 2010-12-01 08:26:00 | \n",
" 2.55 | \n",
" 17850.0 | \n",
" United Kingdom | \n",
"
\n",
" \n",
" 1 | \n",
" 536365 | \n",
" 71053 | \n",
" WHITE METAL LANTERN | \n",
" 6 | \n",
" 2010-12-01 08:26:00 | \n",
" 3.39 | \n",
" 17850.0 | \n",
" United Kingdom | \n",
"
\n",
" \n",
" 2 | \n",
" 536365 | \n",
" 84406B | \n",
" CREAM CUPID HEARTS COAT HANGER | \n",
" 8 | \n",
" 2010-12-01 08:26:00 | \n",
" 2.75 | \n",
" 17850.0 | \n",
" United Kingdom | \n",
"
\n",
" \n",
" 3 | \n",
" 536365 | \n",
" 84029G | \n",
" KNITTED UNION FLAG HOT WATER BOTTLE | \n",
" 6 | \n",
" 2010-12-01 08:26:00 | \n",
" 3.39 | \n",
" 17850.0 | \n",
" United Kingdom | \n",
"
\n",
" \n",
" 4 | \n",
" 536365 | \n",
" 84029E | \n",
" RED WOOLLY HOTTIE WHITE HEART. | \n",
" 6 | \n",
" 2010-12-01 08:26:00 | \n",
" 3.39 | \n",
" 17850.0 | \n",
" United Kingdom | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" InvoiceNo StockCode Description Quantity \\\n",
"0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n",
"1 536365 71053 WHITE METAL LANTERN 6 \n",
"2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n",
"3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n",
"4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n",
"\n",
" InvoiceDate UnitPrice CustomerID Country \n",
"0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n",
"1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n",
"2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n",
"3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n",
"4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Location of the raw transactions file; change this one constant to point at your own copy\n",
"DATA_PATH = r\"D:\\Customer Segmentation\\retail_sales.csv\"\n",
"\n",
"df = pd.read_csv(DATA_PATH)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Column | \n",
" dtype | \n",
" unique sample | \n",
" n uniques | \n",
" num of missing | \n",
" mean of missing | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" InvoiceNo | \n",
" object | \n",
" [536365, 536366, 536367, 536368, 536369] | \n",
" 25900 | \n",
" 0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 1 | \n",
" StockCode | \n",
" object | \n",
" [85123A, 71053, 84406B, 84029G, 84029E] | \n",
" 4070 | \n",
" 0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 2 | \n",
" Description | \n",
" object | \n",
" [WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET... | \n",
" 4223 | \n",
" 1454 | \n",
" 0.002683 | \n",
"
\n",
" \n",
" 3 | \n",
" Quantity | \n",
" int64 | \n",
" [6, 8, 2, 32, 3] | \n",
" 722 | \n",
" 0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" InvoiceDate | \n",
" object | \n",
" [2010-12-01 08:26:00, 2010-12-01 08:28:00, 201... | \n",
" 23260 | \n",
" 0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" UnitPrice | \n",
" float64 | \n",
" [2.55, 3.39, 2.75, 7.65, 4.25] | \n",
" 1630 | \n",
" 0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 6 | \n",
" CustomerID | \n",
" float64 | \n",
" [17850.0, 13047.0, 12583.0, 13748.0, 15100.0] | \n",
" 4372 | \n",
" 135080 | \n",
" 0.249267 | \n",
"
\n",
" \n",
" 7 | \n",
" Country | \n",
" object | \n",
" [United Kingdom, France, Australia, Netherland... | \n",
" 38 | \n",
" 0 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Column dtype unique sample \\\n",
"0 InvoiceNo object [536365, 536366, 536367, 536368, 536369] \n",
"1 StockCode object [85123A, 71053, 84406B, 84029G, 84029E] \n",
"2 Description object [WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET... \n",
"3 Quantity int64 [6, 8, 2, 32, 3] \n",
"4 InvoiceDate object [2010-12-01 08:26:00, 2010-12-01 08:28:00, 201... \n",
"5 UnitPrice float64 [2.55, 3.39, 2.75, 7.65, 4.25] \n",
"6 CustomerID float64 [17850.0, 13047.0, 12583.0, 13748.0, 15100.0] \n",
"7 Country object [United Kingdom, France, Australia, Netherland... \n",
"\n",
" n uniques num of missing mean of missing \n",
"0 25900 0 0.000000 \n",
"1 4070 0 0.000000 \n",
"2 4223 1454 0.002683 \n",
"3 722 0 0.000000 \n",
"4 23260 0 0.000000 \n",
"5 1630 0 0.000000 \n",
"6 4372 135080 0.249267 \n",
"7 38 0 0.000000 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def report(df):\n",
"    \"\"\"Build a one-row-per-column profile of ``df``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to summarise.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per column of ``df`` with the fields 'Column', 'dtype',\n",
"        'unique sample' (first five distinct values), 'n uniques'\n",
"        (result of ``nunique()``), 'num of missing' (NaN count) and\n",
"        'mean of missing' (fraction of rows that are NaN).\n",
"    \"\"\"\n",
"    fields = ['Column', 'dtype', 'unique sample', 'n uniques',\n",
"              'num of missing', 'mean of missing']\n",
"\n",
"    rows = []\n",
"    for name in df.columns:\n",
"        series = df[name]\n",
"        is_missing = series.isna()  # computed once, reused for the count and the fraction\n",
"        rows.append({\n",
"            'Column': name,\n",
"            'dtype': series.dtypes,\n",
"            'unique sample': series.unique()[:5],\n",
"            'n uniques': series.nunique(),\n",
"            'num of missing': is_missing.sum(),\n",
"            # mean of a boolean mask == sum / len, but stays quiet (NaN) on an empty frame\n",
"            'mean of missing': is_missing.mean(),\n",
"        })\n",
"\n",
"    # Column-wise construction keeps the output layout fixed even when df has no columns\n",
"    return pd.DataFrame({field: [row[field] for row in rows] for field in fields})\n",
"\n",
"\n",
"report(df)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Drop every row that has a missing value in any column (rebinding instead of mutating in place)\n",
"df = df.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Quantity | \n",
" UnitPrice | \n",
" CustomerID | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 406829.000000 | \n",
" 406829.000000 | \n",
" 406829.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 12.061303 | \n",
" 3.460471 | \n",
" 15287.690570 | \n",
"
\n",
" \n",
" std | \n",
" 248.693370 | \n",
" 69.315162 | \n",
" 1713.600303 | \n",
"
\n",
" \n",
" min | \n",
" -80995.000000 | \n",
" 0.000000 | \n",
" 12346.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 2.000000 | \n",
" 1.250000 | \n",
" 13953.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 5.000000 | \n",
" 1.950000 | \n",
" 15152.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 12.000000 | \n",
" 3.750000 | \n",
" 16791.000000 | \n",
"
\n",
" \n",
" max | \n",
" 80995.000000 | \n",
" 38970.000000 | \n",
" 18287.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Quantity UnitPrice CustomerID\n",
"count 406829.000000 406829.000000 406829.000000\n",
"mean 12.061303 3.460471 15287.690570\n",
"std 248.693370 69.315162 1713.600303\n",
"min -80995.000000 0.000000 12346.000000\n",
"25% 2.000000 1.250000 13953.000000\n",
"50% 5.000000 1.950000 15152.000000\n",
"75% 12.000000 3.750000 16791.000000\n",
"max 80995.000000 38970.000000 18287.000000"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the numeric columns\n",
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove rows with non-positive Quantity (the filter below keeps only Quantity > 0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows whose Quantity is strictly positive\n",
"has_positive_qty = df['Quantity'].gt(0)\n",
"df = df.loc[has_positive_qty]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(397924, 8)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) remaining after the filters above\n",
"df.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"EDA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Top Products by Quantity Sold"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" Quantity | \n",
"
\n",
" \n",
" StockCode | \n",
" Description | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 23843 | \n",
" PAPER CRAFT , LITTLE BIRDIE | \n",
" 80995 | \n",
"
\n",
" \n",
" 23166 | \n",
" MEDIUM CERAMIC TOP STORAGE JAR | \n",
" 77916 | \n",
"
\n",
" \n",
" 84077 | \n",
" WORLD WAR 2 GLIDERS ASSTD DESIGNS | \n",
" 54415 | \n",
"
\n",
" \n",
" 85099B | \n",
" JUMBO BAG RED RETROSPOT | \n",
" 46181 | \n",
"
\n",
" \n",
" 85123A | \n",
" WHITE HANGING HEART T-LIGHT HOLDER | \n",
" 36725 | \n",
"
\n",
" \n",
" 84879 | \n",
" ASSORTED COLOUR BIRD ORNAMENT | \n",
" 35362 | \n",
"
\n",
" \n",
" 21212 | \n",
" PACK OF 72 RETROSPOT CAKE CASES | \n",
" 33693 | \n",
"
\n",
" \n",
" 22197 | \n",
" POPCORN HOLDER | \n",
" 30931 | \n",
"
\n",
" \n",
" 23084 | \n",
" RABBIT NIGHT LIGHT | \n",
" 27202 | \n",
"
\n",
" \n",
" 22492 | \n",
" MINI PAINT SET VINTAGE | \n",
" 26076 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Quantity\n",
"StockCode Description \n",
"23843 PAPER CRAFT , LITTLE BIRDIE 80995\n",
"23166 MEDIUM CERAMIC TOP STORAGE JAR 77916\n",
"84077 WORLD WAR 2 GLIDERS ASSTD DESIGNS 54415\n",
"85099B JUMBO BAG RED RETROSPOT 46181\n",
"85123A WHITE HANGING HEART T-LIGHT HOLDER 36725\n",
"84879 ASSORTED COLOUR BIRD ORNAMENT 35362\n",
"21212 PACK OF 72 RETROSPOT CAKE CASES 33693\n",
"22197 POPCORN HOLDER 30931\n",
"23084 RABBIT NIGHT LIGHT 27202\n",
"22492 MINI PAINT SET VINTAGE 26076"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total units sold per (StockCode, Description) pair, largest first\n",
"product_keys = ['StockCode', 'Description']\n",
"TopProducts = (\n",
"    df.groupby(product_keys)[['Quantity']]\n",
"    .sum()\n",
"    .sort_values(by='Quantity', ascending=False)\n",
")\n",
"\n",
"TopProducts.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"alignmentgroup": "True",
"hovertemplate": "Quantity=%{x}
Description=%{y}",
"legendgroup": "",
"marker": {
"color": "#636efa",
"pattern": {
"shape": ""
}
},
"name": "",
"offsetgroup": "",
"orientation": "h",
"showlegend": false,
"textposition": "auto",
"type": "bar",
"x": [
80995,
77916,
54415,
46181,
36725,
35362,
33693,
30931,
27202,
26076
],
"xaxis": "x",
"y": [
"PAPER CRAFT , LITTLE BIRDIE",
"MEDIUM CERAMIC TOP STORAGE JAR",
"WORLD WAR 2 GLIDERS ASSTD DESIGNS",
"JUMBO BAG RED RETROSPOT",
"WHITE HANGING HEART T-LIGHT HOLDER",
"ASSORTED COLOUR BIRD ORNAMENT",
"PACK OF 72 RETROSPOT CAKE CASES",
"POPCORN HOLDER",
"RABBIT NIGHT LIGHT",
"MINI PAINT SET VINTAGE "
],
"yaxis": "y"
}
],
"layout": {
"barmode": "relative",
"legend": {
"tracegroupgap": 0
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 10 Products by Quantity Sold"
},
"xaxis": {
"anchor": "y",
"domain": [
0,
1
],
"title": {
"text": "Quantity"
}
},
"yaxis": {
"anchor": "x",
"domain": [
0,
1
],
"title": {
"text": "Description"
}
}
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Flatten the (StockCode, Description) index into ordinary columns for plotting.\n",
"# Guarded so that re-running this cell does not keep adding 'index'/'level_0' columns.\n",
"if 'Description' not in TopProducts.columns:\n",
"    TopProducts = TopProducts.reset_index()\n",
"\n",
"px.bar(TopProducts.head(10), y='Description', x='Quantity',\n",
"       orientation='h',\n",
"       title='Top 10 Products by Quantity Sold')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The product with the highest quantity sold is \"PAPER CRAFT , LITTLE BIRDIE\", with 80,995 units."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let’s check out the number of unique customers:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4339"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct customer IDs, in order of first appearance in df\n",
"customers = df[\"CustomerID\"].drop_duplicates().tolist()\n",
"len(customers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Top Products by Number of Customers"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Number of distinct customers who bought each product, largest first.\n",
"# The built-in 'nunique' aggregation replaces the per-group Python lambda; it gives the same\n",
"# counts as len(x.unique()) as long as CustomerID holds no NaNs (rows with NaNs were dropped earlier).\n",
"CustomersBoughts = df.pivot_table(\n",
"    index=['StockCode', 'Description'],\n",
"    values='CustomerID',\n",
"    aggfunc='nunique',\n",
").sort_values(by='CustomerID', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" CustomerID | \n",
"
\n",
" \n",
" StockCode | \n",
" Description | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 22423 | \n",
" REGENCY CAKESTAND 3 TIER | \n",
" 881 | \n",
"
\n",
" \n",
" 85123A | \n",
" WHITE HANGING HEART T-LIGHT HOLDER | \n",
" 856 | \n",
"
\n",
" \n",
" 47566 | \n",
" PARTY BUNTING | \n",
" 708 | \n",
"
\n",
" \n",
" 84879 | \n",
" ASSORTED COLOUR BIRD ORNAMENT | \n",
" 678 | \n",
"
\n",
" \n",
" 22720 | \n",
" SET OF 3 CAKE TINS PANTRY DESIGN | \n",
" 640 | \n",
"
\n",
" \n",
" 21212 | \n",
" PACK OF 72 RETROSPOT CAKE CASES | \n",
" 635 | \n",
"
\n",
" \n",
" 85099B | \n",
" JUMBO BAG RED RETROSPOT | \n",
" 635 | \n",
"
\n",
" \n",
" 22086 | \n",
" PAPER CHAIN KIT 50'S CHRISTMAS | \n",
" 613 | \n",
"
\n",
" \n",
" 22457 | \n",
" NATURAL SLATE HEART CHALKBOARD | \n",
" 587 | \n",
"
\n",
" \n",
" 22138 | \n",
" BAKING SET 9 PIECE RETROSPOT | \n",
" 581 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" CustomerID\n",
"StockCode Description \n",
"22423 REGENCY CAKESTAND 3 TIER 881\n",
"85123A WHITE HANGING HEART T-LIGHT HOLDER 856\n",
"47566 PARTY BUNTING 708\n",
"84879 ASSORTED COLOUR BIRD ORNAMENT 678\n",
"22720 SET OF 3 CAKE TINS PANTRY DESIGN 640\n",
"21212 PACK OF 72 RETROSPOT CAKE CASES 635\n",
"85099B JUMBO BAG RED RETROSPOT 635\n",
"22086 PAPER CHAIN KIT 50'S CHRISTMAS 613\n",
"22457 NATURAL SLATE HEART CHALKBOARD 587\n",
"22138 BAKING SET 9 PIECE RETROSPOT 581"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten products bought by the largest number of distinct customers\n",
"CustomersBoughts.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Top 10 products by number of customers"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"alignmentgroup": "True",
"hovertemplate": "CustomerID=%{x}
Description=%{y}",
"legendgroup": "",
"marker": {
"color": "#636efa",
"pattern": {
"shape": ""
}
},
"name": "",
"offsetgroup": "",
"orientation": "h",
"showlegend": false,
"textposition": "auto",
"type": "bar",
"x": [
881,
856,
708,
678,
640,
635,
635,
613,
587,
581
],
"xaxis": "x",
"y": [
"REGENCY CAKESTAND 3 TIER",
"WHITE HANGING HEART T-LIGHT HOLDER",
"PARTY BUNTING",
"ASSORTED COLOUR BIRD ORNAMENT",
"SET OF 3 CAKE TINS PANTRY DESIGN ",
"PACK OF 72 RETROSPOT CAKE CASES",
"JUMBO BAG RED RETROSPOT",
"PAPER CHAIN KIT 50'S CHRISTMAS ",
"NATURAL SLATE HEART CHALKBOARD ",
"BAKING SET 9 PIECE RETROSPOT "
],
"yaxis": "y"
}
],
"layout": {
"barmode": "relative",
"legend": {
"tracegroupgap": 0
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 10 Products by Number of Customers"
},
"xaxis": {
"anchor": "y",
"domain": [
0,
1
],
"title": {
"text": "CustomerID"
}
},
"yaxis": {
"anchor": "x",
"domain": [
0,
1
],
"title": {
"text": "Description"
}
}
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Flatten the (StockCode, Description) index into ordinary columns for plotting.\n",
"# Guarded so that re-running this cell does not keep adding 'index'/'level_0' columns.\n",
"if 'Description' not in CustomersBoughts.columns:\n",
"    CustomersBoughts = CustomersBoughts.reset_index()\n",
"\n",
"px.bar(CustomersBoughts.head(10), y='Description', x='CustomerID',\n",
"       orientation='h',\n",
"       # the 'CustomerID' column holds a count of distinct customers here, so label the axis accordingly\n",
"       labels={'CustomerID': 'Number of unique customers'},\n",
"       title='Top 10 Products by Number of Customers')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Prepare Data For Modelling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Splitting the data:\n",
"We will use 90% of the customers as the training set for creating the word2vec embeddings; the remaining 10% are held out for validation."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Seed the RNG so that a fresh Restart & Run All reproduces the same split.\n",
"random.seed(42)\n",
"random.shuffle(customers)\n",
"\n",
"# the first 90% of the shuffled customer IDs form the training set\n",
"n_train = round(0.9 * len(customers))\n",
"customers_train = list(customers[:n_train])\n",
"\n",
"# split data into train and validation set (membership mask computed once)\n",
"is_train = df['CustomerID'].isin(customers_train)\n",
"train_df = df[is_train]\n",
"validation_df = df[~is_train]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Creating the sequence of purchases for each customer in the training and validation datasets:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 3905/3905 [00:01<00:00, 1954.11it/s]\n"
]
}
],
"source": [
"# One list of purchased StockCodes per training customer,\n",
"# in the same order as customers_train.\n",
"purchases_train = [\n",
"    train_df.loc[train_df[\"CustomerID\"] == customer_id, \"StockCode\"].tolist()\n",
"    for customer_id in tqdm(customers_train)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 434/434 [00:00<00:00, 2451.86it/s]\n"
]
}
],
"source": [
"# One list of purchased StockCodes per validation customer,\n",
"# in order of first appearance in validation_df.\n",
"purchases_val = []\n",
"\n",
"for customer_id in tqdm(validation_df['CustomerID'].unique()):\n",
"    customer_rows = validation_df[\"CustomerID\"] == customer_id\n",
"    purchases_val.append(validation_df.loc[customer_rows, \"StockCode\"].tolist())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Building a Recommendation System"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Building word2vec Embeddings for products"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting gensim\n",
" Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)\n",
"Requirement already satisfied: numpy<2.0,>=1.18.5 in c:\\python3.11.1\\lib\\site-packages (from gensim) (1.26.4)\n",
"Requirement already satisfied: scipy<1.14.0,>=1.7.0 in c:\\python3.11.1\\lib\\site-packages (from gensim) (1.12.0)\n",
"Requirement already satisfied: smart-open>=1.8.1 in c:\\python3.11.1\\lib\\site-packages (from gensim) (7.0.4)\n",
"Requirement already satisfied: wrapt in c:\\python3.11.1\\lib\\site-packages (from smart-open>=1.8.1->gensim) (1.16.0)\n",
"Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)\n",
" ---------------------------------------- 0.0/24.0 MB ? eta -:--:--\n",
" ---------------------------------------- 0.1/24.0 MB 2.8 MB/s eta 0:00:09\n",
" --------------------------------------- 0.4/24.0 MB 3.9 MB/s eta 0:00:07\n",
" --------------------------------------- 0.6/24.0 MB 4.1 MB/s eta 0:00:06\n",
" - -------------------------------------- 0.8/24.0 MB 4.2 MB/s eta 0:00:06\n",
" - -------------------------------------- 1.0/24.0 MB 4.1 MB/s eta 0:00:06\n",
" -- ------------------------------------- 1.3/24.0 MB 4.4 MB/s eta 0:00:06\n",
" -- ------------------------------------- 1.5/24.0 MB 4.7 MB/s eta 0:00:05\n",
" -- ------------------------------------- 1.7/24.0 MB 4.8 MB/s eta 0:00:05\n",
" --- ------------------------------------ 1.9/24.0 MB 4.6 MB/s eta 0:00:05\n",
" --- ------------------------------------ 2.1/24.0 MB 4.5 MB/s eta 0:00:05\n",
" --- ------------------------------------ 2.3/24.0 MB 4.6 MB/s eta 0:00:05\n",
" ---- ----------------------------------- 2.5/24.0 MB 4.5 MB/s eta 0:00:05\n",
" ---- ----------------------------------- 2.8/24.0 MB 4.6 MB/s eta 0:00:05\n",
" ----- ---------------------------------- 3.1/24.0 MB 4.7 MB/s eta 0:00:05\n",
" ----- ---------------------------------- 3.3/24.0 MB 4.7 MB/s eta 0:00:05\n",
" ----- ---------------------------------- 3.6/24.0 MB 4.7 MB/s eta 0:00:05\n",
" ------ --------------------------------- 3.8/24.0 MB 4.8 MB/s eta 0:00:05\n",
" ------ --------------------------------- 4.0/24.0 MB 4.8 MB/s eta 0:00:05\n",
" ------- -------------------------------- 4.2/24.0 MB 4.8 MB/s eta 0:00:05\n",
" ------- -------------------------------- 4.4/24.0 MB 4.7 MB/s eta 0:00:05\n",
" ------- -------------------------------- 4.7/24.0 MB 4.7 MB/s eta 0:00:05\n",
" -------- ------------------------------- 4.9/24.0 MB 4.8 MB/s eta 0:00:05\n",
" -------- ------------------------------- 5.1/24.0 MB 4.8 MB/s eta 0:00:04\n",
" -------- ------------------------------- 5.4/24.0 MB 4.8 MB/s eta 0:00:04\n",
" --------- ------------------------------ 5.6/24.0 MB 4.9 MB/s eta 0:00:04\n",
" --------- ------------------------------ 5.9/24.0 MB 4.9 MB/s eta 0:00:04\n",
" ---------- ----------------------------- 6.1/24.0 MB 4.9 MB/s eta 0:00:04\n",
" ---------- ----------------------------- 6.5/24.0 MB 5.0 MB/s eta 0:00:04\n",
" ----------- ---------------------------- 6.6/24.0 MB 4.9 MB/s eta 0:00:04\n",
" ----------- ---------------------------- 6.9/24.0 MB 5.0 MB/s eta 0:00:04\n",
" ----------- ---------------------------- 7.1/24.0 MB 4.9 MB/s eta 0:00:04\n",
" ------------ --------------------------- 7.4/24.0 MB 5.0 MB/s eta 0:00:04\n",
" ------------ --------------------------- 7.6/24.0 MB 5.0 MB/s eta 0:00:04\n",
" ------------- -------------------------- 7.8/24.0 MB 5.0 MB/s eta 0:00:04\n",
" ------------- -------------------------- 8.1/24.0 MB 5.0 MB/s eta 0:00:04\n",
" ------------- -------------------------- 8.3/24.0 MB 5.0 MB/s eta 0:00:04\n",
" -------------- ------------------------- 8.6/24.0 MB 5.0 MB/s eta 0:00:04\n",
" -------------- ------------------------- 8.8/24.0 MB 5.0 MB/s eta 0:00:04\n",
" --------------- ------------------------ 9.1/24.0 MB 5.0 MB/s eta 0:00:03\n",
" --------------- ------------------------ 9.3/24.0 MB 5.0 MB/s eta 0:00:03\n",
" --------------- ------------------------ 9.6/24.0 MB 5.1 MB/s eta 0:00:03\n",
" ---------------- ----------------------- 9.8/24.0 MB 5.1 MB/s eta 0:00:03\n",
" ---------------- ----------------------- 10.1/24.0 MB 5.1 MB/s eta 0:00:03\n",
" ----------------- ---------------------- 10.3/24.0 MB 5.1 MB/s eta 0:00:03\n",
" ----------------- ---------------------- 10.5/24.0 MB 5.1 MB/s eta 0:00:03\n",
" ----------------- ---------------------- 10.6/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ----------------- ---------------------- 10.7/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ------------------ --------------------- 10.9/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ------------------ --------------------- 11.2/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ------------------ --------------------- 11.4/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ------------------- -------------------- 11.6/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ------------------- -------------------- 11.9/24.0 MB 5.0 MB/s eta 0:00:03\n",
" -------------------- ------------------- 12.1/24.0 MB 5.0 MB/s eta 0:00:03\n",
" -------------------- ------------------- 12.3/24.0 MB 5.0 MB/s eta 0:00:03\n",
" -------------------- ------------------- 12.6/24.0 MB 5.0 MB/s eta 0:00:03\n",
" --------------------- ------------------ 12.7/24.0 MB 5.1 MB/s eta 0:00:03\n",
" --------------------- ------------------ 13.0/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ---------------------- ----------------- 13.2/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ---------------------- ----------------- 13.5/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ---------------------- ----------------- 13.7/24.0 MB 5.0 MB/s eta 0:00:03\n",
" ----------------------- ---------------- 14.0/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ----------------------- ---------------- 14.3/24.0 MB 5.1 MB/s eta 0:00:02\n",
" ------------------------ --------------- 14.5/24.0 MB 5.1 MB/s eta 0:00:02\n",
" ------------------------ --------------- 14.7/24.0 MB 5.1 MB/s eta 0:00:02\n",
" ------------------------ --------------- 15.0/24.0 MB 5.1 MB/s eta 0:00:02\n",
" ------------------------- -------------- 15.2/24.0 MB 5.1 MB/s eta 0:00:02\n",
" ------------------------- -------------- 15.5/24.0 MB 5.1 MB/s eta 0:00:02\n",
" -------------------------- ------------- 15.7/24.0 MB 5.1 MB/s eta 0:00:02\n",
" -------------------------- ------------- 15.9/24.0 MB 5.1 MB/s eta 0:00:02\n",
" -------------------------- ------------- 16.2/24.0 MB 5.1 MB/s eta 0:00:02\n",
" --------------------------- ------------ 16.4/24.0 MB 5.1 MB/s eta 0:00:02\n",
" --------------------------- ------------ 16.7/24.0 MB 5.0 MB/s eta 0:00:02\n",
" --------------------------- ------------ 16.8/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ---------------------------- ----------- 17.0/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ---------------------------- ----------- 17.3/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ----------------------------- ---------- 17.5/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ----------------------------- ---------- 17.8/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ------------------------------ --------- 18.0/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ------------------------------ --------- 18.3/24.0 MB 5.1 MB/s eta 0:00:02\n",
" ------------------------------ --------- 18.5/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ------------------------------- -------- 18.9/24.0 MB 5.0 MB/s eta 0:00:02\n",
" ------------------------------- -------- 19.0/24.0 MB 5.1 MB/s eta 0:00:01\n",
" -------------------------------- ------- 19.3/24.0 MB 5.0 MB/s eta 0:00:01\n",
" -------------------------------- ------- 19.5/24.0 MB 5.0 MB/s eta 0:00:01\n",
" -------------------------------- ------- 19.8/24.0 MB 5.0 MB/s eta 0:00:01\n",
" --------------------------------- ------ 20.0/24.0 MB 5.0 MB/s eta 0:00:01\n",
" --------------------------------- ------ 20.2/24.0 MB 5.0 MB/s eta 0:00:01\n",
" ---------------------------------- ----- 20.5/24.0 MB 5.0 MB/s eta 0:00:01\n",
" ---------------------------------- ----- 20.7/24.0 MB 5.1 MB/s eta 0:00:01\n",
" ----------------------------------- ---- 21.0/24.0 MB 5.2 MB/s eta 0:00:01\n",
" ----------------------------------- ---- 21.3/24.0 MB 5.2 MB/s eta 0:00:01\n",
" ----------------------------------- ---- 21.6/24.0 MB 5.3 MB/s eta 0:00:01\n",
" ------------------------------------ --- 21.8/24.0 MB 5.3 MB/s eta 0:00:01\n",
" ------------------------------------ --- 22.1/24.0 MB 5.3 MB/s eta 0:00:01\n",
" ------------------------------------- -- 22.4/24.0 MB 5.3 MB/s eta 0:00:01\n",
" ------------------------------------- -- 22.4/24.0 MB 5.2 MB/s eta 0:00:01\n",
" ------------------------------------- -- 22.8/24.0 MB 5.2 MB/s eta 0:00:01\n",
" -------------------------------------- - 23.0/24.0 MB 5.3 MB/s eta 0:00:01\n",
" -------------------------------------- - 23.3/24.0 MB 5.3 MB/s eta 0:00:01\n",
" --------------------------------------- 23.6/24.0 MB 5.3 MB/s eta 0:00:01\n",
" --------------------------------------- 23.9/24.0 MB 5.4 MB/s eta 0:00:01\n",
" --------------------------------------- 24.0/24.0 MB 5.4 MB/s eta 0:00:01\n",
" --------------------------------------- 24.0/24.0 MB 5.4 MB/s eta 0:00:01\n",
" --------------------------------------- 24.0/24.0 MB 5.4 MB/s eta 0:00:01\n",
" --------------------------------------- 24.0/24.0 MB 5.4 MB/s eta 0:00:01\n",
" --------------------------------------- 24.0/24.0 MB 5.4 MB/s eta 0:00:01\n",
" ---------------------------------------- 24.0/24.0 MB 4.8 MB/s eta 0:00:00\n",
"Installing collected packages: gensim\n",
"Successfully installed gensim-4.3.3\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 24.1.2 -> 24.3.1\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [
"# Explicit %pip magic installs into the running kernel's environment and does\n",
"# not depend on IPython automagic being enabled.\n",
"# The recorded output of this cell shows gensim 4.3.3 being installed.\n",
"%pip install gensim"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"from gensim.models import Word2Vec"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The parameters I will use:\n",
"\n",
"- `window = 15`: the maximum distance between the current and predicted word within a sentence.\n",
"- `sg = 1`: the model uses the skip-gram approach.\n",
"- `hs = 0`: hierarchical softmax is not used because the vocabulary is not large.\n",
"- `negative = 10`: the number of negative samples is set to 10.\n",
"- `alpha = 0.03`: the initial learning rate is set to 0.03.\n",
"- `min_alpha = 0.0007`: the minimum learning rate is set to 0.0007."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}