{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### In this demo and lab exercise we will look at scaling numerical data\n", "### The examples given below illustrate using a min-max scaler\n", "\n", "\n", "#### Let's begin by importing the data set\n", "#### Download this from the webpage" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np \n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#read in the data set\n", "#NB. you will need to edit this command to change it to the directory you are using\n", "data = pd.read_csv('/Users/brendan.tierney/Dropbox/4-Datasets/small_purchases.csv', )" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 10 entries, 0 to 9\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Country 10 non-null object \n", " 1 Age 9 non-null float64\n", " 2 Salary 9 non-null float64\n", " 3 Purchased 10 non-null object \n", "dtypes: float64(2), object(2)\n", "memory usage: 448.0+ bytes\n" ] } ], "source": [ "#Display basic information about the Pandas dataframe\n", "data.info()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10, 4)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#How many rows and columns does the dataframe have?\n", "data.shape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryAgeSalaryPurchased
0France44.027000.0No
1Spain27.048000.0Yes
2Germany30.054000.0No
3Spain38.061000.0No
4Germany40.0NaNYes
5France35.058000.0Yes
6SpainNaN52000.0No
7France48.079000.0Yes
8Germany50.083000.0No
9France37.067000.0Yes
\n", "
" ], "text/plain": [ " Country Age Salary Purchased\n", "0 France 44.0 27000.0 No\n", "1 Spain 27.0 48000.0 Yes\n", "2 Germany 30.0 54000.0 No\n", "3 Spain 38.0 61000.0 No\n", "4 Germany 40.0 NaN Yes\n", "5 France 35.0 58000.0 Yes\n", "6 Spain NaN 52000.0 No\n", "7 France 48.0 79000.0 Yes\n", "8 Germany 50.0 83000.0 No\n", "9 France 37.0 67000.0 Yes" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Display the first 10 rows\n", "#Question: How many rows does the dataframe contain\n", "#Question: Modify the code to display all the data\n", "\n", "data.head(10)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeSalary
count9.0000009.000000
mean38.77777858777.777778
std7.69379316820.952543
min27.00000027000.000000
25%35.00000052000.000000
50%38.00000058000.000000
75%44.00000067000.000000
max50.00000083000.000000
\n", "
" ], "text/plain": [ " Age Salary\n", "count 9.000000 9.000000\n", "mean 38.777778 58777.777778\n", "std 7.693793 16820.952543\n", "min 27.000000 27000.000000\n", "25% 35.000000 52000.000000\n", "50% 38.000000 58000.000000\n", "75% 44.000000 67000.000000\n", "max 50.000000 83000.000000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Crate summary statisics about the data in the dataframe\n", "#This only provides summary statistics for Numerical data\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
Age9.038.7777787.69379327.035.038.044.050.0
Salary9.058777.77777816820.95254327000.052000.058000.067000.083000.0
\n", "
" ], "text/plain": [ " count mean std min 25% 50% 75% \\\n", "Age 9.0 38.777778 7.693793 27.0 35.0 38.0 44.0 \n", "Salary 9.0 58777.777778 16820.952543 27000.0 52000.0 58000.0 67000.0 \n", "\n", " max \n", "Age 50.0 \n", "Salary 83000.0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#An alternative was of viewing the summary statistics\n", "data.describe().transpose()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryAgeSalaryPurchased
0France44.027000.0No
1Spain27.048000.0Yes
2Germany30.054000.0No
3Spain38.061000.0No
4Germany40.0NaNYes
5France35.058000.0Yes
6SpainNaN52000.0No
7France48.079000.0Yes
8Germany50.083000.0No
9France37.067000.0Yes
\n", "
" ], "text/plain": [ " Country Age Salary Purchased\n", "0 France 44.0 27000.0 No\n", "1 Spain 27.0 48000.0 Yes\n", "2 Germany 30.0 54000.0 No\n", "3 Spain 38.0 61000.0 No\n", "4 Germany 40.0 NaN Yes\n", "5 France 35.0 58000.0 Yes\n", "6 Spain NaN 52000.0 No\n", "7 France 48.0 79000.0 Yes\n", "8 Germany 50.0 83000.0 No\n", "9 France 37.0 67000.0 Yes" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Display the data from the dataframe\n", "#NB: notices we have some values to NO values -> See question later in the notebook\n", "data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CountryAgeSalaryPurchased
0France0.7391300.000000No
1Spain0.0000000.375000Yes
2Germany0.1304350.482143No
3Spain0.4782610.607143No
4Germany0.565217NaNYes
5France0.3478260.553571Yes
6SpainNaN0.446429No
7France0.9130430.928571Yes
8Germany1.0000001.000000No
9France0.4347830.714286Yes
\n", "
" ], "text/plain": [ " Country Age Salary Purchased\n", "0 France 0.739130 0.000000 No\n", "1 Spain 0.000000 0.375000 Yes\n", "2 Germany 0.130435 0.482143 No\n", "3 Spain 0.478261 0.607143 No\n", "4 Germany 0.565217 NaN Yes\n", "5 France 0.347826 0.553571 Yes\n", "6 Spain NaN 0.446429 No\n", "7 France 0.913043 0.928571 Yes\n", "8 Germany 1.000000 1.000000 No\n", "9 France 0.434783 0.714286 Yes" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Setup the MinMaxScaler\n", "#This will only work on Numerical attributes/features\n", "scaler = MinMaxScaler()\n", "\n", "#Apply the scaler to the numerical data\n", "# and save the data back to the dataframe\n", "# overwriting the original values\n", "data[['Age', 'Salary']] = scaler.fit_transform(data[['Age', 'Salary']])\n", "\n", "#Display the dataframe\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Copy and modify the above code to repace the NaN" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Modify the data set to replace the empty data (NaN) with an appropriate values\n", "#### Rerun the scaler with this updated/modified dataframe\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Another Example and Exercise - Complete all the steps\n", "\n", "#### Data 
set = Pima Indian diabetes dataset \n", "#### https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediageclass
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
\n", "
" ], "text/plain": [ " preg plas pres skin test mass pedi age class\n", "0 6 148 72 35 0 33.6 0.627 50 1\n", "1 1 85 66 29 0 26.6 0.351 31 0\n", "2 8 183 64 0 0 23.3 0.672 32 1\n", "3 1 89 66 23 94 28.1 0.167 21 0\n", "4 0 137 40 35 168 43.1 2.288 33 1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Import the data set\n", "\n", "import pandas as pd\n", "columns = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n", "data = pd.read_csv('/Users/brendan.tierney/Dropbox/4-Datasets/pima-indians-diabetes.csv', names=columns)\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediageclass
count768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000
mean3.845052120.89453169.10546920.53645879.79947931.9925780.47187633.2408850.348958
std3.36957831.97261819.35580715.952218115.2440027.8841600.33132911.7602320.476951
min0.0000000.0000000.0000000.0000000.0000000.0000000.07800021.0000000.000000
25%1.00000099.00000062.0000000.0000000.00000027.3000000.24375024.0000000.000000
50%3.000000117.00000072.00000023.00000030.50000032.0000000.37250029.0000000.000000
75%6.000000140.25000080.00000032.000000127.25000036.6000000.62625041.0000001.000000
max17.000000199.000000122.00000099.000000846.00000067.1000002.42000081.0000001.000000
\n", "
" ], "text/plain": [ " preg plas pres skin test mass \\\n", "count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 \n", "mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 \n", "std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 \n", "50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 \n", "75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 \n", "max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 \n", "\n", " pedi age class \n", "count 768.000000 768.000000 768.000000 \n", "mean 0.471876 33.240885 0.348958 \n", "std 0.331329 11.760232 0.476951 \n", "min 0.078000 21.000000 0.000000 \n", "25% 0.243750 24.000000 0.000000 \n", "50% 0.372500 29.000000 0.000000 \n", "75% 0.626250 41.000000 1.000000 \n", "max 2.420000 81.000000 1.000000 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Create the summary statistics\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
preg768.03.8450523.3695780.0001.000003.00006.0000017.00
plas768.0120.89453131.9726180.00099.00000117.0000140.25000199.00
pres768.069.10546919.3558070.00062.0000072.000080.00000122.00
skin768.020.53645815.9522180.0000.0000023.000032.0000099.00
test768.079.799479115.2440020.0000.0000030.5000127.25000846.00
mass768.031.9925787.8841600.00027.3000032.000036.6000067.10
pedi768.00.4718760.3313290.0780.243750.37250.626252.42
age768.033.24088511.76023221.00024.0000029.000041.0000081.00
class768.00.3489580.4769510.0000.000000.00001.000001.00
\n", "
" ], "text/plain": [ " count mean std min 25% 50% 75% \\\n", "preg 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 \n", "plas 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 \n", "pres 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 \n", "skin 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 \n", "test 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 \n", "mass 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 \n", "pedi 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 \n", "age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 \n", "class 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 \n", "\n", " max \n", "preg 17.00 \n", "plas 199.00 \n", "pres 122.00 \n", "skin 99.00 \n", "test 846.00 \n", "mass 67.10 \n", "pedi 2.42 \n", "age 81.00 \n", "class 1.00 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.describe().transpose()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "data[columns].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediage
061487235033.60.62750
11856629026.60.35131
28183640023.30.67232
318966239428.10.16721
40137403516843.12.28833
\n", "
" ], "text/plain": [ " preg plas pres skin test mass pedi age\n", "0 6 148 72 35 0 33.6 0.627 50\n", "1 1 85 66 29 0 26.6 0.351 31\n", "2 8 183 64 0 0 23.3 0.672 32\n", "3 1 89 66 23 94 28.1 0.167 21\n", "4 0 137 40 35 168 43.1 2.288 33" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#The data set contains a Class attribute. \n", "#This is an indicator variable that is non-descriptive and only indicates if the \n", "# descriptive data indicates a particular event\n", "\n", "#Let's separate the data to into 2 dataframes.\n", "# - The first will contain the descriptive attributes\n", "# - The second will contain the indication attribute\n", "\n", "#Create a new dataframe (X) to contain the descriptive attributes, droping the indicitor attribute\n", "X = data.drop('class', axis=1)\n", "\n", "#Create a new dataframe (Y) to only contain the indicator attribute\n", "Y = data['class']\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediage
00.3529410.7437190.5901640.3535350.0000000.5007450.2344150.483333
10.0588240.4271360.5409840.2929290.0000000.3964230.1165670.166667
20.4705880.9195980.5245900.0000000.0000000.3472430.2536290.183333
30.0588240.4472360.5409840.2323230.1111110.4187780.0380020.000000
40.0000000.6884420.3278690.3535350.1985820.6423250.9436380.200000
\n", "
" ], "text/plain": [ " preg plas pres skin test mass pedi \\\n", "0 0.352941 0.743719 0.590164 0.353535 0.000000 0.500745 0.234415 \n", "1 0.058824 0.427136 0.540984 0.292929 0.000000 0.396423 0.116567 \n", "2 0.470588 0.919598 0.524590 0.000000 0.000000 0.347243 0.253629 \n", "3 0.058824 0.447236 0.540984 0.232323 0.111111 0.418778 0.038002 \n", "4 0.000000 0.688442 0.327869 0.353535 0.198582 0.642325 0.943638 \n", "\n", " age \n", "0 0.483333 \n", "1 0.166667 \n", "2 0.183333 \n", "3 0.000000 \n", "4 0.200000 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "X_copy = X.copy() #We create a copy so we can still refer to the original dataframe later\n", "scaler = MinMaxScaler()\n", "#Create list of Columns to transform/scale\n", "X_columns = X.columns\n", "#Create a new dataframe\n", "X_scaled = pd.DataFrame(scaler.fit_transform(X_copy), columns=X_columns)\n", "X_scaled.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediageclass
count768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000
mean3.845052120.89453169.10546920.53645879.79947931.9925780.47187633.2408850.348958
std3.36957831.97261819.35580715.952218115.2440027.8841600.33132911.7602320.476951
min0.0000000.0000000.0000000.0000000.0000000.0000000.07800021.0000000.000000
25%1.00000099.00000062.0000000.0000000.00000027.3000000.24375024.0000000.000000
50%3.000000117.00000072.00000023.00000030.50000032.0000000.37250029.0000000.000000
75%6.000000140.25000080.00000032.000000127.25000036.6000000.62625041.0000001.000000
max17.000000199.000000122.00000099.000000846.00000067.1000002.42000081.0000001.000000
\n", "
" ], "text/plain": [ " preg plas pres skin test mass \\\n", "count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 \n", "mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 \n", "std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 \n", "50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 \n", "75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 \n", "max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 \n", "\n", " pedi age class \n", "count 768.000000 768.000000 768.000000 \n", "mean 0.471876 33.240885 0.348958 \n", "std 0.331329 11.760232 0.476951 \n", "min 0.078000 21.000000 0.000000 \n", "25% 0.243750 24.000000 0.000000 \n", "50% 0.372500 29.000000 0.000000 \n", "75% 0.626250 41.000000 1.000000 \n", "max 2.420000 81.000000 1.000000 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.describe()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediage
count768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000
mean0.2261800.6075100.5664380.2074390.0943260.4767900.1681790.204015
std0.1982100.1606660.1586540.1611340.1362220.1174990.1414730.196004
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.0588240.4974870.5081970.0000000.0000000.4068550.0707730.050000
50%0.1764710.5879400.5901640.2323230.0360520.4769000.1257470.133333
75%0.3529410.7047740.6557380.3232320.1504140.5454550.2340950.333333
max1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
\n", "
" ], "text/plain": [ " preg plas pres skin test mass \\\n", "count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 \n", "mean 0.226180 0.607510 0.566438 0.207439 0.094326 0.476790 \n", "std 0.198210 0.160666 0.158654 0.161134 0.136222 0.117499 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.058824 0.497487 0.508197 0.000000 0.000000 0.406855 \n", "50% 0.176471 0.587940 0.590164 0.232323 0.036052 0.476900 \n", "75% 0.352941 0.704774 0.655738 0.323232 0.150414 0.545455 \n", "max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 \n", "\n", " pedi age \n", "count 768.000000 768.000000 \n", "mean 0.168179 0.204015 \n", "std 0.141473 0.196004 \n", "min 0.000000 0.000000 \n", "25% 0.070773 0.050000 \n", "50% 0.125747 0.133333 \n", "75% 0.234095 0.333333 \n", "max 1.000000 1.000000 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_scaled.describe()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "#Question: Add code (below) to create a new dataframe, where only the 'preg' and 'plas' attributes are transformed" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediage
00.3529410.7437197235033.60.62750
10.0588240.4271366629026.60.35131
20.4705880.919598640023.30.67232
30.0588240.44723666239428.10.16721
40.0000000.688442403516843.12.28833
\n", "
" ], "text/plain": [ " preg plas pres skin test mass pedi age\n", "0 0.352941 0.743719 72 35 0 33.6 0.627 50\n", "1 0.058824 0.427136 66 29 0 26.6 0.351 31\n", "2 0.470588 0.919598 64 0 0 23.3 0.672 32\n", "3 0.058824 0.447236 66 23 94 28.1 0.167 21\n", "4 0.000000 0.688442 40 35 168 43.1 2.288 33" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import MinMaxScaler\n", "X_copy = X.copy()\n", "scaler = MinMaxScaler()\n", "X_copy[['preg', 'plas']] = scaler.fit_transform(X_copy[['preg', 'plas']])\n", "X_copy.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 4 }