{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TU257 - Lab9 - Demo 1 - Association Rules\n", "\n", "#### You'll need to import a new library\n", "#### mlxtend" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
products
0MILK,BREAD,BISCUIT
1BREAD,MILK,BISCUIT,CORNFLAKES
2BREAD,TEA,BOURNVITA
3JAM,MAGGI,BREAD,MILK
4MAGGI,TEA,BISCUIT
\n", "
" ], "text/plain": [ " products\n", "0 MILK,BREAD,BISCUIT\n", "1 BREAD,MILK,BISCUIT,CORNFLAKES\n", "2 BREAD,TEA,BOURNVITA\n", "3 JAM,MAGGI,BREAD,MILK\n", "4 MAGGI,TEA,BISCUIT" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "from mlxtend.frequent_patterns import apriori\n", "from mlxtend.frequent_patterns import association_rules\n", "\n", "df = pd.read_csv(\"/Users/brendan.tierney/Dropbox/4-Datasets/GroceryStoreDataSet.csv\", names = ['products'], sep = ',')\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(20, 1)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape\n", "#20 records\n", "#1 attribute/feature" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['MILK', 'BREAD', 'BISCUIT'],\n", " ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],\n", " ['BREAD', 'TEA', 'BOURNVITA'],\n", " ['JAM', 'MAGGI', 'BREAD', 'MILK'],\n", " ['MAGGI', 'TEA', 'BISCUIT'],\n", " ['BREAD', 'TEA', 'BOURNVITA'],\n", " ['MAGGI', 'TEA', 'CORNFLAKES'],\n", " ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],\n", " ['JAM', 'MAGGI', 'BREAD', 'TEA'],\n", " ['BREAD', 'MILK'],\n", " ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],\n", " ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],\n", " ['COFFEE', 'SUGER', 'BOURNVITA'],\n", " ['BREAD', 'COFFEE', 'COCK'],\n", " ['BREAD', 'SUGER', 'BISCUIT'],\n", " ['COFFEE', 'SUGER', 'CORNFLAKES'],\n", " ['BREAD', 'SUGER', 'BOURNVITA'],\n", " ['BREAD', 'COFFEE', 'SUGER'],\n", " ['BREAD', 'COFFEE', 'SUGER'],\n", " ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Let’s split the products and create a list called by ‘data’\n", "data = list(df[\"products\"].apply(lambda x:x.split(\",\") ))\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "The Apriori implementation in mlxtend expects its input as True/False or 1/0 values.\n", "Using TransactionEncoder, we convert the list to a One-Hot Encoded Boolean list.\n", "Products that customers bought or did not buy during shopping will now be represented by values 1 and 0" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BISCUITBOURNVITABREADCOCKCOFFEECORNFLAKESJAMMAGGIMILKSUGERTEA
010100000100
110100100100
201100000001
300100011100
410000001001
501100000001
600000101001
710100001001
800100011001
900100000100
1010011100000
1110011100000
1201001000010
1300111000000
1410100000010
1500001100010
1601100000010
1700101000010
1800101000010
1900001100101
\n", "
" ], "text/plain": [ " BISCUIT BOURNVITA BREAD COCK COFFEE CORNFLAKES JAM MAGGI MILK \\\n", "0 1 0 1 0 0 0 0 0 1 \n", "1 1 0 1 0 0 1 0 0 1 \n", "2 0 1 1 0 0 0 0 0 0 \n", "3 0 0 1 0 0 0 1 1 1 \n", "4 1 0 0 0 0 0 0 1 0 \n", "5 0 1 1 0 0 0 0 0 0 \n", "6 0 0 0 0 0 1 0 1 0 \n", "7 1 0 1 0 0 0 0 1 0 \n", "8 0 0 1 0 0 0 1 1 0 \n", "9 0 0 1 0 0 0 0 0 1 \n", "10 1 0 0 1 1 1 0 0 0 \n", "11 1 0 0 1 1 1 0 0 0 \n", "12 0 1 0 0 1 0 0 0 0 \n", "13 0 0 1 1 1 0 0 0 0 \n", "14 1 0 1 0 0 0 0 0 0 \n", "15 0 0 0 0 1 1 0 0 0 \n", "16 0 1 1 0 0 0 0 0 0 \n", "17 0 0 1 0 1 0 0 0 0 \n", "18 0 0 1 0 1 0 0 0 0 \n", "19 0 0 0 0 1 1 0 0 1 \n", "\n", " SUGER TEA \n", "0 0 0 \n", "1 0 0 \n", "2 0 1 \n", "3 0 0 \n", "4 0 1 \n", "5 0 1 \n", "6 0 1 \n", "7 0 1 \n", "8 0 1 \n", "9 0 0 \n", "10 0 0 \n", "11 0 0 \n", "12 1 0 \n", "13 0 0 \n", "14 1 0 \n", "15 1 0 \n", "16 1 0 \n", "17 1 0 \n", "18 1 0 \n", "19 0 1 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Let's transform the list, with one-hot encoding\n", "from mlxtend.preprocessing import TransactionEncoder\n", "\n", "a = TransactionEncoder()\n", "a_data = a.fit(data).transform(data)\n", "\n", "df = pd.DataFrame(a_data,columns=a.columns_)\n", "df = df.replace(False,0)\n", "df = df.replace(True,1)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Applying Apriori and Resulting\n", "\n", "The next step is to create the Apriori Model. We can change all the parameters in the Apriori Model in the mlxtend package.\n", "I will try to use minimum support parameters for this modeling.\n", "For this, I set a min_support value with a threshold value of 20% and printed them on the screen as well." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", "Processing 72 combinations | Sampling itemset size 2\r", "Processing 42 combinations | Sampling itemset size 3\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
supportitemsets
20.65(BREAD)
30.40(COFFEE)
00.35(BISCUIT)
80.35(TEA)
40.30(CORNFLAKES)
70.30(SUGER)
50.25(MAGGI)
60.25(MILK)
10.20(BOURNVITA)
90.20(BREAD, BISCUIT)
100.20(BREAD, MILK)
110.20(BREAD, SUGER)
120.20(BREAD, TEA)
130.20(CORNFLAKES, COFFEE)
140.20(SUGER, COFFEE)
150.20(MAGGI, TEA)
\n", "
" ], "text/plain": [ " support itemsets\n", "2 0.65 (BREAD)\n", "3 0.40 (COFFEE)\n", "0 0.35 (BISCUIT)\n", "8 0.35 (TEA)\n", "4 0.30 (CORNFLAKES)\n", "7 0.30 (SUGER)\n", "5 0.25 (MAGGI)\n", "6 0.25 (MILK)\n", "1 0.20 (BOURNVITA)\n", "9 0.20 (BREAD, BISCUIT)\n", "10 0.20 (BREAD, MILK)\n", "11 0.20 (BREAD, SUGER)\n", "12 0.20 (BREAD, TEA)\n", "13 0.20 (CORNFLAKES, COFFEE)\n", "14 0.20 (SUGER, COFFEE)\n", "15 0.20 (MAGGI, TEA)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "support = apriori(df, min_support = 0.2, use_colnames = True, verbose = 1)\n", "\n", "support.sort_values(by = 'support', ascending = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, I set a minimum confidence value of 60%. In other words, we keep only the rules where, when product X is purchased, the probability that product Y is also purchased is 60% or more." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconvictionzhangs_metric
0(MILK)(BREAD)0.250.650.20.8000001.2307690.03751.750.250000
4(MAGGI)(TEA)0.250.350.20.8000002.2857140.11253.250.750000
1(SUGER)(BREAD)0.300.650.20.6666671.0256410.00501.050.035714
2(CORNFLAKES)(COFFEE)0.300.400.20.6666671.6666670.08001.800.571429
3(SUGER)(COFFEE)0.300.400.20.6666671.6666670.08001.800.571429
\n", "
" ], "text/plain": [ " antecedents consequents antecedent support consequent support support \\\n", "0 (MILK) (BREAD) 0.25 0.65 0.2 \n", "4 (MAGGI) (TEA) 0.25 0.35 0.2 \n", "1 (SUGER) (BREAD) 0.30 0.65 0.2 \n", "2 (CORNFLAKES) (COFFEE) 0.30 0.40 0.2 \n", "3 (SUGER) (COFFEE) 0.30 0.40 0.2 \n", "\n", " confidence lift leverage conviction zhangs_metric \n", "0 0.800000 1.230769 0.0375 1.75 0.250000 \n", "4 0.800000 2.285714 0.1125 3.25 0.750000 \n", "1 0.666667 1.025641 0.0050 1.05 0.035714 \n", "2 0.666667 1.666667 0.0800 1.80 0.571429 \n", "3 0.666667 1.666667 0.0800 1.80 0.571429 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Let's view our interpretation values using the association_rules function.\n", "confidence = association_rules(support, metric = \"confidence\", min_threshold = 0.6)\n", "confidence.sort_values(by = 'confidence', ascending = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For example, if we examine the rule at index 1 (SUGER -> BREAD):\n", "\n", " Sugar appears in 30% of the transactions (antecedent support).\n", " Bread appears in 65% of the transactions (consequent support).\n", " Both appear together in 20% of the transactions (support).\n", " 67% of those who buy sugar buy bread as well (confidence).\n", " " ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "#Question: Change the confidence level to 30%\n", "#What impact does this have?\n", "\n", "#Save the results to a new DF called confidence2" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
antecedentsconsequentsantecedent supportconsequent supportsupportconfidenceliftleverageconvictionzhangs_metric
3(MILK)(BREAD)0.250.650.20.8000001.2307690.03751.7500000.250000
12(MAGGI)(TEA)0.250.350.20.8000002.2857140.11253.2500000.750000
5(SUGER)(BREAD)0.300.650.20.6666671.0256410.00501.0500000.035714
8(CORNFLAKES)(COFFEE)0.300.400.20.6666671.6666670.08001.8000000.571429
10(SUGER)(COFFEE)0.300.400.20.6666671.6666670.08001.8000000.571429
1(BISCUIT)(BREAD)0.350.650.20.5714290.879121-0.02750.816667-0.174603
7(TEA)(BREAD)0.350.650.20.5714290.879121-0.02750.816667-0.174603
13(TEA)(MAGGI)0.350.250.20.5714292.2857140.11251.7500000.865385
9(COFFEE)(CORNFLAKES)0.400.300.20.5000001.6666670.08001.4000000.666667
11(COFFEE)(SUGER)0.400.300.20.5000001.6666670.08001.4000000.666667
0(BREAD)(BISCUIT)0.650.350.20.3076920.879121-0.02750.938889-0.282051
2(BREAD)(MILK)0.650.250.20.3076921.2307690.03751.0833330.535714
4(BREAD)(SUGER)0.650.300.20.3076921.0256410.00501.0111110.071429
6(BREAD)(TEA)0.650.350.20.3076920.879121-0.02750.938889-0.282051
\n", "
" ], "text/plain": [ " antecedents consequents antecedent support consequent support \\\n", "3 (MILK) (BREAD) 0.25 0.65 \n", "12 (MAGGI) (TEA) 0.25 0.35 \n", "5 (SUGER) (BREAD) 0.30 0.65 \n", "8 (CORNFLAKES) (COFFEE) 0.30 0.40 \n", "10 (SUGER) (COFFEE) 0.30 0.40 \n", "1 (BISCUIT) (BREAD) 0.35 0.65 \n", "7 (TEA) (BREAD) 0.35 0.65 \n", "13 (TEA) (MAGGI) 0.35 0.25 \n", "9 (COFFEE) (CORNFLAKES) 0.40 0.30 \n", "11 (COFFEE) (SUGER) 0.40 0.30 \n", "0 (BREAD) (BISCUIT) 0.65 0.35 \n", "2 (BREAD) (MILK) 0.65 0.25 \n", "4 (BREAD) (SUGER) 0.65 0.30 \n", "6 (BREAD) (TEA) 0.65 0.35 \n", "\n", " support confidence lift leverage conviction zhangs_metric \n", "3 0.2 0.800000 1.230769 0.0375 1.750000 0.250000 \n", "12 0.2 0.800000 2.285714 0.1125 3.250000 0.750000 \n", "5 0.2 0.666667 1.025641 0.0050 1.050000 0.035714 \n", "8 0.2 0.666667 1.666667 0.0800 1.800000 0.571429 \n", "10 0.2 0.666667 1.666667 0.0800 1.800000 0.571429 \n", "1 0.2 0.571429 0.879121 -0.0275 0.816667 -0.174603 \n", "7 0.2 0.571429 0.879121 -0.0275 0.816667 -0.174603 \n", "13 0.2 0.571429 2.285714 0.1125 1.750000 0.865385 \n", "9 0.2 0.500000 1.666667 0.0800 1.400000 0.666667 \n", "11 0.2 0.500000 1.666667 0.0800 1.400000 0.666667 \n", "0 0.2 0.307692 0.879121 -0.0275 0.938889 -0.282051 \n", "2 0.2 0.307692 1.230769 0.0375 1.083333 0.535714 \n", "4 0.2 0.307692 1.025641 0.0050 1.011111 0.071429 \n", "6 0.2 0.307692 0.879121 -0.0275 0.938889 -0.282051 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confidence2 = association_rules(support, metric = \"confidence\", min_threshold = 0.3)\n", "confidence2.sort_values(by = 'confidence', ascending = False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }