{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pregplaspresskintestmasspediageclass
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
\n", "
" ], "text/plain": [ " preg plas pres skin test mass pedi age class\n", "0 6 148 72 35 0 33.6 0.627 50 1\n", "1 1 85 66 29 0 26.6 0.351 31 0\n", "2 8 183 64 0 0 23.3 0.672 32 1\n", "3 1 89 66 23 94 28.1 0.167 21 0\n", "4 0 137 40 35 168 43.1 2.288 33 1" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Import the data set\n", "\n", "import pandas as pd\n", "columns = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n", "data = pd.read_csv('/Users/brendan.tierney/Dropbox/4-Datasets/pima-indians-diabetes.csv', names=columns)\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(768, 9)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 500\n", "1 268\n", "Name: class, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['class'].value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQN0lEQVR4nO3df6zddX3H8edrVECFUaB3FVuwbNQ5zAKaihB/xEk2BefKH8pQp5WwNFkg0TB/dGoUjS64ZAPNnFkzDFX8AUORTpgOUaJG+VEUUESlY7C2Aq3QVpT5A3nvj/MpnNZ7e2/be++hnz4fycn5fD+fz/d836e9ffV7P+d7zklVIUnqy++MugBJ0vQz3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4SxNIMpbkB0mePOpaxpPkgFbf2Khr0ROP4a6RSvLaJGuS/CzJvUn+M8kLZ+G4leSYSaatAC6uqv9r+1yX5K9nuraJ7Hj8qvol8DEGdUrbMdw1MknOBS4E/h6YDxwF/AuwdIRlAYOzYmAZcMk0Puac6XqsIZ8ClrV6pccY7hqJJIcA7wPOrqrPVdXPq+rXVfUfVfXWNueAJBcm+XG7XbgtxJK8Mck3dnjMx87Gk1yc5CNJrkryUJIbkvxBG/ta2+XW9hvDX45T4vOBLVW1vu3zAeBFwD+3ff659X8oybokP01yc5IXDdVzXpLLk1yS5KfAG5McneRrraYvtxovGdrnxCTfTLIlya1JXrKz47f6NgMn7v7fhnpkuGtUTgIOBK7YyZx3Mgit44HjgBOAd+3CMc4A3gscCqwFPgBQVS9u48dV1UFVdek4+/4x8MNtG1X1TuDrwDltn3Pa0E2tvsMYnEX/e5IDhx5nKXA5MBf4ZJtzI3A4cB7w+m0TkywArgLe3x7vLcBnk4zt5PgAdzD485EeY7hrVA4HflJVj+xkzuuA91XVxqraxCCoX7+T+Tu6oqpubMf4JIMQnqq5wEOTTaqqS6rqgap6pKr+ETgA+MOhKd+qqs9X1aPAGPA84N1V9auq+gawemjuXwFXV9XVVfVoVV0DrAFOnaSMh1q90mMMd43KA8C8Sdahnw7cM7R9T+ubqvuG2g8DB+3CvpuBgyeblOQtSe5IsjXJFuAQYN7QlHVD7acDD1bVwxOMPwN4dVuS2dIe74XAEZOUcTCwZbJatW8x3DUq3wJ+CZy2kzk/ZhB42xzV+gB+Djxl20CSp01zfbcBz9yhb7uPUG3r628DTgcOraq5wFYgE+xzL3BYkqcM9R051F4HfKKq5g7dnlpV5493/CF/BNw6heekfYjhrpGoqq3Au4GPJDktyVOSPCnJKUn+oU37NPCudr35vDZ/24uPtwLPTnJ8W+M+bxdLuB/4/Z2M3wjMbevgE+1zMPAIsAmYk+TdwO9O9IBVdQ+DZZbzkuyf5CTglUNTLgFemeRlSfZLcmCSlyRZOFHNrb7DgOt38ly0DzLcNTJtjfpcBi+SbmJw5noO8Pk25f0MwvA24LvAt1sfVfUjBlfbfBm4E9juypkpOA9Y1ZY/Th+ntl8BFzNYB9/mQ8CrkmxO8mHgS8AXgR8xWDL6Bdsvs4zndQxeTH6gPZdLGfwGQ1WtY/AC7Dt4/M/jrTz+73TH4wO8FljVrnmXHhO/rEMaX3vn59eB52x7I9MMHONS4AdV9Z7d2PcABr/BvLiqNk57cdqrGe7SLEryPOBB4H+AP2PwW8pJVfWdUdal/szEO+YkTexpwOcYXAq6Hvgbg10zwTN3SeqQL6hKUocMd0nq0BNizX3evHm1aNGiUZchSXuVm2+++SdVNe7n+T8hwn3RokWsWbNm1GVI0l4lyT0TjbksI0kdMtwlqUOGuyR1yHCXpA4Z7pLUoSmFe5K7k3w3yS1J1rS+w5Jck+TOdn9o60+SDydZm+S2JM+dyScgSfptu3Lm/idVdXxVLWnbK4Brq2oxcG3bBjgFWNxuy4GPTlexkqSp2ZNlmaXAqtZexePfqLMU+HgNXM/gCw8m+5owSdI0muqbmAr4ryQF/GtVrQTmV9W9bfw+YH5rL2D7LyxY3/ruHeojyXIGZ/YcddRRu1f9LFu04qpRl9CVu89/xahLkLo11XB/YVVtSPJ7wDVJfjA8WFXVgn/K2n8QKwGWLFniR1NK0jSa0rJMVW1o9xuBK4ATgPu3Lbe0+23fBLOB7b/0d2HrkyTNkknDPclTkxy8rc3g22O+B6wGlrVpy4ArW3s18IZ21cyJwNah5RtJ0iyYyrLMfOCKJNvmf6qqvpjkJuCyJGcx+HLgbV8yfDVwKrAWeBg4c9qrliTt1KThXlV3AceN0/8AcPI4/QWcPS3VSZJ2i+9QlaQOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHVoyuGeZL8k30nyhbZ9dJIbkqxNcmmS/Vv/AW17bRtfNEO1S5ImsCtn7m8C7hja/iBwQVUdA2wGzmr9ZwGbW/8FbZ4kaRZNKdyTLAReAfxb2w7wUuDyNmUVcFprL23btPGT23xJ0iyZ6pn7hcDbgEfb9uHAlqp6pG2vBxa09gJgHUAb39rmS5JmyaThnuTPgY1VdfN0HjjJ8iRrkqzZtGnTdD60JO3zpnLm/gLgL5LcDXyGwXLMh4C5Sea0OQuBDa29ATgSoI0fAjyw44NW1cqqWlJVS8bGxvboSUiStjdpuFfV31XVwqpaBJwBfKWqXgd8FXhVm7YMuLK1V7dt2vhXqqqmtWpJ0k7tyXXubwfOTbKWwZr6Ra3/IuDw1n8usGLPSpQk7ao5k095XFVdB1zX2ncBJ4wz5xfAq6ehNknSbvIdqpLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOTRruSQ5McmOSW5PcnuS9rf/oJDckWZvk0iT7t/4D2vbaNr5ohp+DJGkHUzlz/yXw0qo6DjgeeHmSE4EPAhdU1THAZuCsNv8sYHPrv6DNkyTNoknDvQZ+1jaf1G4FvBS4vPWvAk5r7aVtmzZ+cpJMV8GSpMlNac09yX5JbgE2AtcA/w1sqapH2pT1wILWXgCsA2jjW4HDp7FmSdIkphTuVfWbqjoeWAicADxrTw+cZHmSNUnWbNq0aU8fTpI0ZJeulqmqLcBXgZOAuUnmtKGFwIbW3gAcCdDGDwEeGOexVlbVkqpaMjY2tnvVS5LGNZWrZcaSzG3tJwN/CtzBIORf1aYtA65s7dVtmzb+laqqaaxZkjSJOZNP4QhgVZL9GPxncFlVfSHJ94HPJHk/8B3gojb/IuATSdYCDwJnzEDdkqSdmDTcq+o24Dnj9N/FYP19x/5fAK+eluokSbvFd6hKUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOjSVd6hKeoJbtOKqUZfQlbvPf8WoS9hjnrlLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6NGm4JzkyyVeTfD/J7Une1PoPS3JNkjvb/aGtP0k+nGRtktuSPHemn4QkaXtTOXN/BPjbqjoWOBE4O8mxwArg2qpaDFzbtgFOARa323Lgo9NetSRppyYN96q6t6q+3doPAXcAC4ClwKo2bRVwWmsvBT5eA9cDc5McMd2FS5Imtktr7kkWAc8BbgDmV9W9beg+YH5rLwDWDe22vvVJkmbJlMM9yUHAZ4E3V9VPh8eqqoDalQMnWZ5kTZI1mzZt2pVdJUmTmFK4J3kSg2D/ZFV9rnXfv225pd1vbP0bgCOHdl/Y+rZTVSuraklVLRkbG9vd+iVJ45jK1TIBLgLuqKp/GhpaDSxr7WXAlUP9b2hXzZwIbB1avpEkzYI5U5jzAuD1wHeT3NL63gGcD1yW5CzgHuD0NnY1cCqwFngYOHM6C5YkTW7ScK+qbwCZYPjkceYXcPYe1iVJ2gO+Q1WSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUoUnDPcnHkmxM8r2hvsOSXJPkznZ/aOtPkg8nWZvktiTPncniJUnjm8qZ+8XAy3foWwFcW1WLgWvbNsApwOJ2Ww58dHrKlCTtiknDvaq+Bjy4Q/dSYFVrrwJOG+r/eA1cD8xNcsQ01SpJmqLdXXOfX1X3tvZ9wPzWXgCsG5q3vvVJkmbRHr+gWlUF1K7ul2R5kjVJ1mzatGlPy5AkDdndcL9/23JLu9/Y+jcARw7NW9j6fktVrayqJVW1ZGxsbDfLkCSNZ3fDfTWwrLWXAVcO9b+hXTVzIrB1aPlGkjRL5kw2IcmngZcA85KsB94DnA9cluQs4B7g9Db9auBUYC3wMHDmDNQsSZrEpOFeVa+ZYOjkceYWcPaeFiVJ2jO+Q1WSOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjo0I+Ge5OVJfphkbZIVM3EMSdLEpj3ck+wHfAQ4BTgWeE2SY6f7OJKkic3EmfsJwNqququqfgV8Blg6A8eRJE1gzgw85gJg3dD2euD5O05KshxY3jZ/luSHM1DLvmoe8JNRFzGZfHDUFWgE/NmcXs+YaGAmwn1KqmolsHJUx+9ZkjVVtWTUdUg78mdz9szEsswG4Mih7YWtT5I0S2Yi3G8CFic5Osn+wBnA6hk4jiRpAtO+LFNVjyQ5B/gSsB/wsaq6fbqPo51yuUtPVP5szpJU1ahrkCRNM9+hKkkdMtwlqUOGuyR1aGTXuWt6JHkWg3cAL2hdG4DVVXXH6KqSNGqeue/Fkrydwcc7BLix3QJ82g9s0xNZkjNHXUPvvFpmL5bkR8Czq+rXO/TvD9xeVYtHU5m0c0n+t6qOGnUdPXNZZu/2KPB04J4d+o9oY9LIJLltoiFg/mzWsi8y3PdubwauTXInj39Y21HAMcA5oypKauYDLwM279Af4JuzX86+xXDfi1XVF5M8k8HHLA+/oHpTVf1mdJVJAHwBOKiqbtlxIMl1s17NPsY1d0nqkFfLSFKHDHdJ6pDhLkkdMtwlqUOGuyR16P8BtIgOjfRADRgAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "#print bar chart\n", "data['class'].value_counts().plot(kind='bar', title='Count (target)');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Down Sampling - Majority Class - Using Random Sampling" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Class = 0 500\n", "Class = 1 268\n" ] } ], "source": [ "count_class_0, count_class_1 = data['class'].value_counts()\n", "\n", "# Divide by class\n", "df_class_0 = data[data['class'] == 0] #majority class\n", "df_class_1 = data[data['class'] == 1] #minority class\n", "\n", "print('Class = 0 ', df_class_0.shape[0])\n", "print('Class = 1 ', df_class_1.shape[0])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(268, 9)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sample Majority class (y=0, to have same number of records as minority calls (y=1)\n", "df_class_0_under = df_class_0.sample(count_class_1)\n", "\n", "df_class_0_under.shape" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random under-sampling:\n", "0 268\n", "1 268\n", "Name: class, dtype: int64\n", "Num records = 536\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAPyklEQVR4nO3df6zddX3H8edrVHEKs2DvaimtZVq3QRarqYjxR1hMRFhMMdkY6LAal5qFJhp/bPgj2hlZ2DJ/RiWpkVAFEaag3WQ6bDRI/AGFQQUq0ihdWwu98lvZ0MJ7f5xv4fRyb+/ve+inz0dyc8/5fL/f831fuDx7+r3nXFJVSJLa8nuDHkCSNPOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLY0gylOSnSX5/0LOMJsnh3XxDg55FTz3GXQOV5I1JNif5dZLdSf4zySvn4LyV5AXj7HYucFFV/W93zPeS/O1szzaWkeevqkeAC+nNKe3HuGtgkrwL+CTwT8BCYCnwOWDVAMcCes+KgdXAxTP4mPNm6rH6fBlY3c0rPc64ayCSPBv4CHBOVV1RVb+pqt9V1b9X1Xu7fQ5P8skkv+w+PrkvYknekuTaEY/5+LPxJBcl+WySbyZ5KMmPkzy/23ZNd8jN3d8Y/nqUEV8G3F9VO7tjzgNeBXymO+Yz3fqnkuxI8mCSG5K8qm+edUm+muTiJA8Cb0lyXJJrupm+0814cd8xJyX5QZL7k9yc5OQDnb+b7z7gpKn/21CLjLsG5eXAM4ArD7DPB+hFawXwIuBE4IOTOMeZwD8CRwHbgPMAqurV3fYXVdURVXXZKMf+GXD7vjtV9QHg+8Da7pi13abru/mOpvcs+t+SPKPvcVYBXwXmA5d0+1wHPAdYB5y9b8cki4FvAh/tHu89wNeSDB3g/ABb6f3zkR5n3DUozwF+VVV7D7DPm4CPVNWeqhqmF+qzD7D/SFdW1XXdOS6hF+GJmg88NN5OVXVxVd1TVXur6mPA4cAf9+3yw6r6elU9BgwBLwU+VFW/raprgY19+/4NcFVVXVVVj1XV1cBm4LRxxniom1d6nHHXoNwDLBjnOvQxwPa++9u7tYm6q+/2w8ARkzj2PuDI8XZK8p4kW5M8kOR+4NnAgr5ddvTdPga4t6oeHmP784C/6i7J3N893iuBReOMcSRw/3iz6tBi3DUoPwQeAU4/wD6/pBe8fZZ2awC/AZ65b0OS587wfFuAF45Y2+9XqHbX1/8eOAM4qqrmAw8AGeOY3cDRSZ7Zt7ak7/YO4EtVNb/v41lVdf5o5+/zp8DNE/iadAgx7hqIqnoA+BDw2SSnJ3lmkqclOTXJv3S7XQp8sHu9+YJu/30/fLwZOCHJiu4a97pJjnA38EcH2H4dML+7Dj7WMUcCe4FhYF6SDwF/MNYDVtV2epdZ1iV5epKXA6/v2+Vi4PVJTklyWJJnJDk5ybFjzdzNdzTwowN8LToEGXcNTHeN+l30fkg6TO+Z61rg690uH6UXwy3AT4AbuzWq6mf0Xm3zHeAOYL9XzkzAOmBDd/njjFFm+y1wEb3r4Pt8CvjLJPcl+TTwbeBbwM/oXTL6P/a/zDKaN9H7YfI93ddyGb2/wVBVO+j9APb9PPHP47088d/pyPMDvBHY0L3mXXpc/J91SKPr3vn5feDF+97INAvnuAz4aVV9eArHHk7vbzCvrqo9Mz6cDmrGXZpDSV4K3Av8Angtvb+lvLyq/nuQc6k9s/GOOUljey5wBb2Xgu4E/s6wazb4zF2SGuQPVCWpQcZdkhr0lLjmvmDBglq2bNmgx5Ckg8oNN9zwq6oa9ff5PyXivmzZMjZv3jzoMSTpoJJk+1jbvCwjSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUoKfEm5gOFsvO/eagR2jKnef/xaBHaIbfmzOrhe9Nn7lLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoPGjXuSJUm+m+S2JLcmeUe3vi7JriQ3dR+n9R3zviTbktye5JTZ/AIkSU82kV8/sBd4d1XdmORI4IYkV3fbPlFV/9q/c5LjgTOBE4BjgO8keWFVPTqTg0uSxjbuM/eq2l1VN3a3HwK2AosPcMgq4CtV9UhV/QLYBpw4E8NKkiZmUtfckywDXgz8uFtam2RLkguTHNWtLQZ29B22kwP/YSBJmmETjnuSI4CvAe+sqgeBC4DnAyuA3cDHJnPiJGuSbE6yeXh4eDKHSpLGMaG4J3kavbBfUlVXAFTV3VX1aFU9BnyeJy697AKW9B1+bLe2n6paX1Urq2rl0NDQdL4GSdIIE3m1TIAvAFur6uN964v6dnsDcEt3eyNwZpLDkxwHLAeum7mRJUnjmcirZV4BnA38JMlN3dr7gbOSrAAKuBN4O0BV3ZrkcuA2eq+0OcdXykjS3Bo37lV1LZBRNl11gGPOA86bxlySpGnwHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNGjfuSZYk+W6S25LcmuQd3frRSa5Ockf3+ahuPUk+nWRbki1JXjLbX4QkaX8Teea+F3h3VR0PnASck+R44FxgU1UtBzZ19wFOBZZ3H2uAC2Z8aknSAY0b96raXVU3drcfArYCi4FVwIZutw3A6d3tVcAXq+dHwPwki2Z6cEnS2CZ1zT3JMuDFwI+BhVW1u9t0F7Cwu70Y2NF32M5uTZI0RyYc9yRHAF8D3llVD/Zvq6oCajInTrImyeYkm4eHhydzqCRpHBOKe5Kn0Qv7JVV1Rbd8977LLd3nPd36LmBJ3+HHdmv7qar1VbWyqlYODQ1NdX5J0igm8mqZAF8AtlbVx/s2bQRWd7dXA9/oW39z96qZk4AH+i7fSJLmwLwJ7PMK4GzgJ0lu6tbeD5wPXJ7kbcB24Ixu21XAacA24GHgrTM5sCRpfOPGvaquBTLG5teMsn8B50xzLknSNPgOVUlqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAaNG/ckFybZk+SWvrV1SXYluan7OK1v2/uSbEtye5JTZmtwSdLYJvLM/SLgdaOsf6KqVnQfVwEkOR44EzihO+ZzSQ6bqWElSRMzbtyr6hrg3gk+3irgK1X1SFX9AtgGnDiN+SRJUzCda+5rk2zpLtsc1a0tBnb07bOzW5MkzaGpxv0C4PnACmA38LHJPkCSNUk2J9k8PDw8xTEkSaOZUtyr6u6qerSqHgM+zxOXXnYBS/p2PbZbG+0x1lfVyqpaOTQ0NJUxJEljmFLckyzqu/sGYN8raTYCZyY5PMlxwHLguumNKEmarHnj7ZDkUuBkYEGSncCHgZOTrAAKuBN4O0BV3ZrkcuA2YC9wTlU9OiuTS5LGNG7cq+qsUZa/cID9zwPOm85QkqTp8R2qktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktSgceOe5MIke5Lc0rd2dJKrk9zRfT6qW0+STyfZlmRLkpfM5vCSpNFN5Jn7RcDrRqydC2yqquXApu4+wKnA8u5jDXDBzIwpSZqMceNeVdcA945YXgVs6G5vAE7vW/9i9fwImJ9k0QzNKkmaoKlec19YVbu723cBC7vbi4Edffvt7NYkSXNo2j9QraoCarLHJVmTZHOSzcPDw9MdQ5LUZ6pxv3vf5Zbu855ufRewpG+/Y7u1J6mq9VW1sqpWDg0NTXEMSdJophr3jcDq7vZq4Bt962/uXjVzEvBA3+UbSdIcmTfeDkkuBU4GFiTZCXwYOB+4PMnbgO3AGd3uVwGnAduAh4G3zsLMkqRxjBv3qjprjE2vGWXfAs6Z7lCSpOnxHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNmjedg5PcCTwEPArsraqVSY4GLgOWAXcCZ1TVfdMbU5I0GTPxzP3Pq2pFVa3s7p8LbKqq5cCm7r4kaQ7NxmWZVcCG7vYG4PRZOIck6QCmG/cC/ivJDUnWdGsLq2p3d/suYOE0zyFJmqRpXXMHXllVu5L8IXB1kp/2b6yqSlKjHdj9YbAGYOnSpdMcQ5LUb1rP3KtqV/d5D3AlcCJwd5JFAN3nPWMcu76qVlbVyqGhoemMIUkaYcpxT/KsJEfuuw28FrgF2Ais7nZbDXxjukNKkiZnOpdlFgJXJtn3OF+uqm8luR64PMnbgO3AGdMfU5I0GVOOe1X9HHjRKOv3AK+ZzlCSpOnxHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNmrW4J3ldktuTbEty7mydR5L0ZLMS9ySHAZ8FTgWOB85KcvxsnEuS9GSz9cz9RGBbVf28qn4LfAVYNUvnkiSNMG+WHncxsKPv/k7gZf07JFkDrOnu/jrJ7bM0y6FoAfCrQQ8xnvzzoCfQAPi9ObOeN9aG2Yr7uKpqPbB+UOdvWZLNVbVy0HNII/m9OXdm67LMLmBJ3/1juzVJ0hyYrbhfDyxPclySpwNnAhtn6VySpBFm5bJMVe1Nshb4NnAYcGFV3Tob59KovNylpyq/N+dIqmrQM0iSZpjvUJWkBhl3SWqQcZekBg3sde6aOUn+hN47gBd3S7uAjVW1dXBTSRokn7kf5JL8A71f7xDguu4jwKX+wjY9VSV566BnaJ2vljnIJfkZcEJV/W7E+tOBW6tq+WAmk8aW5H+qaumg52iZl2UOfo8BxwDbR6wv6rZJA5Fky1ibgIVzOcuhyLgf/N4JbEpyB0/8sralwAuAtYMaSqIX8FOA+0asB/jB3I9zaDHuB7mq+laSF9L7Ncv9P1C9vqoeHdxkEv8BHFFVN43ckOR7cz7NIcZr7pLUIF8tI0kNMu6S1CDjLkkNMu6S1CDjLkkN+n9FlP1ETWJfHAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# join the dataframes containing y=1 and y=0\n", "df_test_under = pd.concat([df_class_0_under, df_class_1])\n", "\n", "print('Random under-sampling:')\n", "print(df_test_under['class'].value_counts())\n", "print(\"Num records = \", df_test_under.shape[0])\n", "\n", "df_test_under['class'].value_counts().plot(kind='bar', title='Count (target)');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Down Sampling - Majority Class - Using imblearn " ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "imblearn over-sampling:\n", "0 268\n", "1 268\n", "Name: class, dtype: int64\n", "Num records = 536\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAPyklEQVR4nO3df6zddX3H8edrVHEKs2DvaimtZVq3QRarqYjxR1hMRFhMMdkY6LAal5qFJhp/bPgj2hlZ2DJ/RiWpkVAFEaag3WQ6bDRI/AGFQQUq0ihdWwu98lvZ0MJ7f5xv4fRyb+/ve+inz0dyc8/5fL/f831fuDx7+r3nXFJVSJLa8nuDHkCSNPOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLY0gylOSnSX5/0LOMJsnh3XxDg55FTz3GXQOV5I1JNif5dZLdSf4zySvn4LyV5AXj7HYucFFV/W93zPeS/O1szzaWkeevqkeAC+nNKe3HuGtgkrwL+CTwT8BCYCnwOWDVAMcCes+KgdXAxTP4mPNm6rH6fBlY3c0rPc64ayCSPBv4CHBOVV1RVb+pqt9V1b9X1Xu7fQ5P8skkv+w+PrkvYknekuTaEY/5+LPxJBcl+WySbyZ5KMmPkzy/23ZNd8jN3d8Y/nqUEV8G3F9VO7tjzgNeBXymO+Yz3fqnkuxI8mCSG5K8qm+edUm+muTiJA8Cb0lyXJJrupm+0814cd8xJyX5QZL7k9yc5OQDnb+b7z7gpKn/21CLjLsG5eXAM4ArD7DPB+hFawXwIuBE4IOTOMeZwD8CRwHbgPMAqurV3fYXVdURVXXZKMf+GXD7vjtV9QHg+8Da7pi13abru/mOpvcs+t+SPKPvcVYBXwXmA5d0+1wHPAdYB5y9b8cki4FvAh/tHu89wNeSDB3g/ABb6f3zkR5n3DUozwF+VVV7D7DPm4CPVNWeqhqmF+qzD7D/SFdW1XXdOS6hF+GJmg88NN5OVXVxVd1TVXur6mPA4cAf9+3yw6r6elU9BgwBLwU+VFW/raprgY19+/4NcFVVXVVVj1XV1cBm4LRxxniom1d6nHHXoNwDLBjnOvQxwPa++9u7tYm6q+/2w8ARkzj2PuDI8XZK8p4kW5M8kOR+4NnAgr5ddvTdPga4t6oeHmP784C/6i7J3N893iuBReOMcSRw/3iz6tBi3DUoPwQeAU4/wD6/pBe8fZZ2awC/AZ65b0OS587wfFuAF45Y2+9XqHbX1/8eOAM4qqrmAw8AGeOY3cDRSZ7Zt7ak7/YO4EtVNb/v41lVdf5o5+/zp8DNE/iadAgx7hqIqnoA+BDw2SSnJ3lmkqclOTXJv3S7XQp8sHu9+YJu/30/fLwZOCHJiu4a97pJjnA38EcH2H4dML+7Dj7WMUcCe4FhYF6SDwF/MNYDVtV2epdZ1iV5epKXA6/v2+Vi4PVJTklyWJJnJDk5ybFjzdzNdzTwowN8LToEGXcNTHeN+l30fkg6TO+Z61rg690uH6UXwy3AT4AbuzWq6mf0Xm3zHeAOYL9XzkzAOmBDd/njjFFm+y1wEb3r4Pt8CvjLJPcl+TTwbeBbwM/oXTL6P/a/zDKaN9H7YfI93ddyGb2/wVBVO+j9APb9PPHP47088d/pyPMDvBHY0L3mXXpc/J91SKPr3vn5feDF+97INAvnuAz4aVV9eArHHk7vbzCvrqo9Mz6cDmrGXZpDSV4K3Av8Angtvb+lvLyq/nuQc6k9s/GOOUljey5wBb2Xgu4E/s6wazb4zF2SGuQPVCWpQcZdkhr0lLjmvmDBglq2bNmgx5Ckg8oNN9zwq6oa9ff5PyXivmzZMjZv3jzoMSTpoJJk+1jbvCwjSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUoKfEm5gOFsvO/eagR2jKnef/xaBHaIbfmzOrhe9Nn7lLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoPGjXuSJUm+m+S2JLcmeUe3vi7JriQ3dR+n9R3zviTbktye5JTZ/AIkSU82kV8/sBd4d1XdmORI4IYkV3fbPlFV/9q/c5LjgTOBE4BjgO8keWFVPTqTg0uSxjbuM/eq2l1VN3a3HwK2AosPcMgq4CtV9UhV/QLYBpw4E8NKkiZmUtfckywDXgz8uFtam2RLkguTHNWtLQZ29B22kwP/YSBJmmETjnuSI4CvAe+sqgeBC4DnAyuA3cDHJnPiJGuSbE6yeXh4eDKHSpLGMaG4J3kavbBfUlVXAFTV3VX1aFU9BnyeJy697AKW9B1+bLe2n6paX1Urq2rl0NDQdL4GSdIIE3m1TIAvAFur6uN964v6dnsDcEt3eyNwZpLDkxwHLAeum7mRJUnjmcirZV4BnA38JMlN3dr7gbOSrAAKuBN4O0BV3ZrkcuA2eq+0OcdXykjS3Bo37lV1LZBRNl11gGPOA86bxlySpGnwHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNGjfuSZYk+W6S25LcmuQd3frRSa5Ockf3+ahuPUk+nWRbki1JXjLbX4QkaX8Teea+F3h3VR0PnASck+R44FxgU1UtBzZ19wFOBZZ3H2uAC2Z8aknSAY0b96raXVU3drcfArYCi4FVwIZutw3A6d3tVcAXq+dHwPwki2Z6cEnS2CZ1zT3JMuDFwI+BhVW1u9t0F7Cwu70Y2NF32M5uTZI0RyYc9yRHAF8D3llVD/Zvq6oCajInTrImyeYkm4eHhydzqCRpHBOKe5Kn0Qv7JVV1Rbd8977LLd3nPd36LmBJ3+HHdmv7qar1VbWyqlYODQ1NdX5J0igm8mqZAF8AtlbVx/s2bQRWd7dXA9/oW39z96qZk4AH+i7fSJLmwLwJ7PMK4GzgJ0lu6tbeD5wPXJ7kbcB24Ixu21XAacA24GHgrTM5sCRpfOPGvaquBTLG5teMsn8B50xzLknSNPgOVUlqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAaNG/ckFybZk+SWvrV1SXYluan7OK1v2/uSbEtye5JTZmtwSdLYJvLM/SLgdaOsf6KqVnQfVwEkOR44EzihO+ZzSQ6bqWElSRMzbtyr6hrg3gk+3irgK1X1SFX9AtgGnDiN+SRJUzCda+5rk2zpLtsc1a0tBnb07bOzW5MkzaGpxv0C4PnACmA38LHJPkCSNUk2J9k8PDw8xTEkSaOZUtyr6u6qerSqHgM+zxOXXnYBS/p2PbZbG+0x1lfVyqpaOTQ0NJUxJEljmFLckyzqu/sGYN8raTYCZyY5PMlxwHLguumNKEmarHnj7ZDkUuBkYEGSncCHgZOTrAAKuBN4O0BV3ZrkcuA2YC9wTlU9OiuTS5LGNG7cq+qsUZa/cID9zwPOm85QkqTp8R2qktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktSgceOe5MIke5Lc0rd2dJKrk9zRfT6qW0+STyfZlmRLkpfM5vCSpNFN5Jn7RcDrRqydC2yqquXApu4+wKnA8u5jDXDBzIwpSZqMceNeVdcA945YXgVs6G5vAE7vW/9i9fwImJ9k0QzNKkmaoKlec19YVbu723cBC7vbi4Edffvt7NYkSXNo2j9QraoCarLHJVmTZHOSzcPDw9MdQ5LUZ6pxv3vf5Zbu855ufRewpG+/Y7u1J6mq9VW1sqpWDg0NTXEMSdJophr3jcDq7vZq4Bt962/uXjVzEvBA3+UbSdIcmTfeDkkuBU4GFiTZCXwYOB+4PMnbgO3AGd3uVwGnAduAh4G3zsLMkqRxjBv3qjprjE2vGWXfAs6Z7lCSpOnxHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNmjedg5PcCTwEPArsraqVSY4GLgOWAXcCZ1TVfdMbU5I0GTPxzP3Pq2pFVa3s7p8LbKqq5cCm7r4kaQ7NxmWZVcCG7vYG4PRZOIck6QCmG/cC/ivJDUnWdGsLq2p3d/suYOE0zyFJmqRpXXMHXllVu5L8IXB1kp/2b6yqSlKjHdj9YbAGYOnSpdMcQ5LUb1rP3KtqV/d5D3AlcCJwd5JFAN3nPWMcu76qVlbVyqGhoemMIUkaYcpxT/KsJEfuuw28FrgF2Ais7nZbDXxjukNKkiZnOpdlFgJXJtn3OF+uqm8luR64PMnbgO3AGdMfU5I0GVOOe1X9HHjRKOv3AK+ZzlCSpOnxHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNmrW4J3ldktuTbEty7mydR5L0ZLMS9ySHAZ8FTgWOB85KcvxsnEuS9GSz9cz9RGBbVf28qn4LfAVYNUvnkiSNMG+WHncxsKPv/k7gZf07JFkDrOnu/jrJ7bM0y6FoAfCrQQ8xnvzzoCfQAPi9ObOeN9aG2Yr7uKpqPbB+UOdvWZLNVbVy0HNII/m9OXdm67LMLmBJ3/1juzVJ0hyYrbhfDyxPclySpwNnAhtn6VySpBFm5bJMVe1Nshb4NnAYcGFV3Tob59KovNylpyq/N+dIqmrQM0iSZpjvUJWkBhl3SWqQcZekBg3sde6aOUn+hN47gBd3S7uAjVW1dXBTSRokn7kf5JL8A71f7xDguu4jwKX+wjY9VSV566BnaJ2vljnIJfkZcEJV/W7E+tOBW6tq+WAmk8aW5H+qaumg52iZl2UOfo8BxwDbR6wv6rZJA5Fky1ibgIVzOcuhyLgf/N4JbEpyB0/8sralwAuAtYMaSqIX8FOA+0asB/jB3I9zaDHuB7mq+laSF9L7Ncv9P1C9vqoeHdxkEv8BHFFVN43ckOR7cz7NIcZr7pLUIF8tI0kNMu6S1CDjLkkNMu6S1CDjLkkN+n9FlP1ETWJfHAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from imblearn.under_sampling import RandomUnderSampler\n", "\n", "#separate the data in descriptive and target attributes\n", "X = data.drop('class', axis=1)\n", "Y = data['class']\n", "\n", "rus = RandomUnderSampler(random_state=42, replacement=True)\n", "X_rus, Y_rus = rus.fit_resample(X, Y)\n", "\n", "df_rus = pd.concat([pd.DataFrame(X_rus), pd.DataFrame(Y_rus, columns=['class'])], axis=1)\n", "\n", "print('imblearn over-sampling:')\n", "print(df_rus['class'].value_counts())\n", "print(\"Num records = \", df_rus.shape[0])\n", "\n", "df_rus['class'].value_counts().plot(kind='bar', title='Count (target)');" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# we should have the same/similar results as previous. Although the selection of records could be different" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " #### Down/Under sampling the majority class y=1 using Sci-Kit Learn" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original Data distribution\n", "0 500\n", "1 268\n", "Name: class, dtype: int64\n", "Sci-Kit Learn : resample : Down Sampled data set\n", "0 268\n", "1 268\n", "Name: class, dtype: int64\n", "Num records = 536\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAPyklEQVR4nO3df6zddX3H8edrVHEKs2DvaimtZVq3QRarqYjxR1hMRFhMMdkY6LAal5qFJhp/bPgj2hlZ2DJ/RiWpkVAFEaag3WQ6bDRI/AGFQQUq0ihdWwu98lvZ0MJ7f5xv4fRyb+/ve+inz0dyc8/5fL/f831fuDx7+r3nXFJVSJLa8nuDHkCSNPOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLY0gylOSnSX5/0LOMJsnh3XxDg55FTz3GXQOV5I1JNif5dZLdSf4zySvn4LyV5AXj7HYucFFV/W93zPeS/O1szzaWkeevqkeAC+nNKe3HuGtgkrwL+CTwT8BCYCnwOWDVAMcCes+KgdXAxTP4mPNm6rH6fBlY3c0rPc64ayCSPBv4CHBOVV1RVb+pqt9V1b9X1Xu7fQ5P8skkv+w+PrkvYknekuTaEY/5+LPxJBcl+WySbyZ5KMmPkzy/23ZNd8jN3d8Y/nqUEV8G3F9VO7tjzgNeBXymO+Yz3fqnkuxI8mCSG5K8qm+edUm+muTiJA8Cb0lyXJJrupm+0814cd8xJyX5QZL7k9yc5OQDnb+b7z7gpKn/21CLjLsG5eXAM4ArD7DPB+hFawXwIuBE4IOTOMeZwD8CRwHbgPMAqurV3fYXVdURVXXZKMf+GXD7vjtV9QHg+8Da7pi13abru/mOpvcs+t+SPKPvcVYBXwXmA5d0+1wHPAdYB5y9b8cki4FvAh/tHu89wNeSDB3g/ABb6f3zkR5n3DUozwF+VVV7D7DPm4CPVNWeqhqmF+qzD7D/SFdW1XXdOS6hF+GJmg88NN5OVXVxVd1TVXur6mPA4cAf9+3yw6r6elU9BgwBLwU+VFW/raprgY19+/4NcFVVXVVVj1XV1cBm4LRxxniom1d6nHHXoNwDLBjnOvQxwPa++9u7tYm6q+/2w8ARkzj2PuDI8XZK8p4kW5M8kOR+4NnAgr5ddvTdPga4t6oeHmP784C/6i7J3N893iuBReOMcSRw/3iz6tBi3DUoPwQeAU4/wD6/pBe8fZZ2awC/AZ65b0OS587wfFuAF45Y2+9XqHbX1/8eOAM4qqrmAw8AGeOY3cDRSZ7Zt7ak7/YO4EtVNb/v41lVdf5o5+/zp8DNE/iadAgx7hqIqnoA+BDw2SSnJ3lmkqclOTXJv3S7XQp8sHu9+YJu/30/fLwZOCHJiu4a97pJjnA38EcH2H4dML+7Dj7WMUcCe4FhYF6SDwF/MNYDVtV2epdZ1iV5epKXA6/v2+Vi4PVJTklyWJJnJDk5ybFjzdzNdzTwowN8LToEGXcNTHeN+l30fkg6TO+Z61rg690uH6UXwy3AT4AbuzWq6mf0Xm3zHeAOYL9XzkzAOmBDd/njjFFm+y1wEb3r4Pt8CvjLJPcl+TTwbeBbwM/oXTL6P/a/zDKaN9H7YfI93ddyGb2/wVBVO+j9APb9PPHP47088d/pyPMDvBHY0L3mXXpc/J91SKPr3vn5feDF+97INAvnuAz4aVV9eArHHk7vbzCvrqo9Mz6cDmrGXZpDSV4K3Av8Angtvb+lvLyq/nuQc6k9s/GOOUljey5wBb2Xgu4E/s6wazb4zF2SGuQPVCWpQcZdkhr0lLjmvmDBglq2bNmgx5Ckg8oNN9zwq6oa9ff5PyXivmzZMjZv3jzoMSTpoJJk+1jbvCwjSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUIOMuSQ0y7pLUoKfEm5gOFsvO/eagR2jKnef/xaBHaIbfmzOrhe9Nn7lLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoOMuyQ1yLhLUoPGjXuSJUm+m+S2JLcmeUe3vi7JriQ3dR+n9R3zviTbktye5JTZ/AIkSU82kV8/sBd4d1XdmORI4IYkV3fbPlFV/9q/c5LjgTOBE4BjgO8keWFVPTqTg0uSxjbuM/eq2l1VN3a3HwK2AosPcMgq4CtV9UhV/QLYBpw4E8NKkiZmUtfckywDXgz8uFtam2RLkguTHNWtLQZ29B22kwP/YSBJmmETjnuSI4CvAe+sqgeBC4DnAyuA3cDHJnPiJGuSbE6yeXh4eDKHSpLGMaG4J3kavbBfUlVXAFTV3VX1aFU9BnyeJy697AKW9B1+bLe2n6paX1Urq2rl0NDQdL4GSdIIE3m1TIAvAFur6uN964v6dnsDcEt3eyNwZpLDkxwHLAeum7mRJUnjmcirZV4BnA38JMlN3dr7gbOSrAAKuBN4O0BV3ZrkcuA2eq+0OcdXykjS3Bo37lV1LZBRNl11gGPOA86bxlySpGnwHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNGjfuSZYk+W6S25LcmuQd3frRSa5Ockf3+ahuPUk+nWRbki1JXjLbX4QkaX8Teea+F3h3VR0PnASck+R44FxgU1UtBzZ19wFOBZZ3H2uAC2Z8aknSAY0b96raXVU3drcfArYCi4FVwIZutw3A6d3tVcAXq+dHwPwki2Z6cEnS2CZ1zT3JMuDFwI+BhVW1u9t0F7Cwu70Y2NF32M5uTZI0RyYc9yRHAF8D3llVD/Zvq6oCajInTrImyeYkm4eHhydzqCRpHBOKe5Kn0Qv7JVV1Rbd8977LLd3nPd36LmBJ3+HHdmv7qar1VbWyqlYODQ1NdX5J0igm8mqZAF8AtlbVx/s2bQRWd7dXA9/oW39z96qZk4AH+i7fSJLmwLwJ7PMK4GzgJ0lu6tbeD5wPXJ7kbcB24Ixu21XAacA24GHgrTM5sCRpfOPGvaquBTLG5teMsn8B50xzLknSNPgOVUlqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAYZd0lqkHGXpAaNG/ckFybZk+SWvrV1SXYluan7OK1v2/uSbEtye5JTZmtwSdLYJvLM/SLgdaOsf6KqVnQfVwEkOR44EzihO+ZzSQ6bqWElSRMzbtyr6hrg3gk+3irgK1X1SFX9AtgGnDiN+SRJUzCda+5rk2zpLtsc1a0tBnb07bOzW5MkzaGpxv0C4PnACmA38LHJPkCSNUk2J9k8PDw8xTEkSaOZUtyr6u6qerSqHgM+zxOXXnYBS/p2PbZbG+0x1lfVyqpaOTQ0NJUxJEljmFLckyzqu/sGYN8raTYCZyY5PMlxwHLguumNKEmarHnj7ZDkUuBkYEGSncCHgZOTrAAKuBN4O0BV3ZrkcuA2YC9wTlU9OiuTS5LGNG7cq+qsUZa/cID9zwPOm85QkqTp8R2qktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktQg4y5JDTLuktSgceOe5MIke5Lc0rd2dJKrk9zRfT6qW0+STyfZlmRLkpfM5vCSpNFN5Jn7RcDrRqydC2yqquXApu4+wKnA8u5jDXDBzIwpSZqMceNeVdcA945YXgVs6G5vAE7vW/9i9fwImJ9k0QzNKkmaoKlec19YVbu723cBC7vbi4Edffvt7NYkSXNo2j9QraoCarLHJVmTZHOSzcPDw9MdQ5LUZ6pxv3vf5Zbu855ufRewpG+/Y7u1J6mq9VW1sqpWDg0NTXEMSdJophr3jcDq7vZq4Bt962/uXjVzEvBA3+UbSdIcmTfeDkkuBU4GFiTZCXwYOB+4PMnbgO3AGd3uVwGnAduAh4G3zsLMkqRxjBv3qjprjE2vGWXfAs6Z7lCSpOnxHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNmjedg5PcCTwEPArsraqVSY4GLgOWAXcCZ1TVfdMbU5I0GTPxzP3Pq2pFVa3s7p8LbKqq5cCm7r4kaQ7NxmWZVcCG7vYG4PRZOIck6QCmG/cC/ivJDUnWdGsLq2p3d/suYOE0zyFJmqRpXXMHXllVu5L8IXB1kp/2b6yqSlKjHdj9YbAGYOnSpdMcQ5LUb1rP3KtqV/d5D3AlcCJwd5JFAN3nPWMcu76qVlbVyqGhoemMIUkaYcpxT/KsJEfuuw28FrgF2Ais7nZbDXxjukNKkiZnOpdlFgJXJtn3OF+uqm8luR64PMnbgO3AGdMfU5I0GVOOe1X9HHjRKOv3AK+ZzlCSpOnxHaqS1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNMu6S1CDjLkkNmrW4J3ldktuTbEty7mydR5L0ZLMS9ySHAZ8FTgWOB85KcvxsnEuS9GSz9cz9RGBbVf28qn4LfAVYNUvnkiSNMG+WHncxsKPv/k7gZf07JFkDrOnu/jrJ7bM0y6FoAfCrQQ8xnvzzoCfQAPi9ObOeN9aG2Yr7uKpqPbB+UOdvWZLNVbVy0HNII/m9OXdm67LMLmBJ3/1juzVJ0hyYrbhfDyxPclySpwNnAhtn6VySpBFm5bJMVe1Nshb4NnAYcGFV3Tob59KovNylpyq/N+dIqmrQM0iSZpjvUJWkBhl3SWqQcZekBg3sde6aOUn+hN47gBd3S7uAjVW1dXBTSRokn7kf5JL8A71f7xDguu4jwKX+wjY9VSV566BnaJ2vljnIJfkZcEJV/W7E+tOBW6tq+WAmk8aW5H+qaumg52iZl2UOfo8BxwDbR6wv6rZJA5Fky1ibgIVzOcuhyLgf/N4JbEpyB0/8sralwAuAtYMaSqIX8FOA+0asB/jB3I9zaDHuB7mq+laSF9L7Ncv9P1C9vqoeHdxkEv8BHFFVN43ckOR7cz7NIcZr7pLUIF8tI0kNMu6S1CDjLkkNMu6S1CDjLkkN+n9FlP1ETWJfHAAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from sklearn.utils import resample\n", "\n", "print(\"Original Data distribution\")\n", "print(data['class'].value_counts())\n", "\n", "# Down Sample Majority class\n", "down_sample = resample(data[data['class']==0],\n", " replace = True, # sample with replacement\n", " n_samples = data[data['class']==1].shape[0], # to match minority class\n", " random_state=42) # reproducible results\n", "\n", "# Combine majority class with upsampled minority class\n", "train_downsample = pd.concat([data[data['class']==1], down_sample])\n", "\n", "# Display new class counts\n", "print('Sci-Kit Learn : resample : Down Sampled data set')\n", "print(train_downsample['class'].value_counts())\n", "print(\"Num records = \", train_downsample.shape[0])\n", "train_downsample['class'].value_counts().plot(kind='bar', title='Count (target)');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Over sampling the minority call y=0 (using random sampling)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random over-sampling:\n", "0 500\n", "1 500\n", "Name: class, dtype: int64\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQKklEQVR4nO3df6zddX3H8edrVECFUaB3FVuwbNQ5zAKaihB/xEk2BefKH8pQp5WwNFkg0TB/MDWKRhdcsoFmzqwZhir+gKFIp0yHKFGj/CgKKKLSMVlbgVZoK8r8gbz3x/kUTq/39t62995DP30+kpPz+X4+n+/5vk97++r3fs73nJOqQpLUl98ZdQGSpJlnuEtShwx3SeqQ4S5JHTLcJalDhrskdchwlyaRZCzJ95M8cdS1TCTJAa2+sVHXoscfw10jleTVSdYm+VmSe5L8Z5Lnz8FxK8kxU0w7D7ikqv6v7XNdkr+e7domM/74VfVL4CMM6pR2YLhrZJKcC1wE/D2wEDgK+Bdg+QjLAgZnxcAK4NIZfMx5M/VYQz4BrGj1So8y3DUSSQ4B3gOcXVWfqaqfV9Wvq+o/qurNbc4BSS5K8uN2u2h7iCV5fZKvj3vMR8/Gk1yS5ENJPp/kwSQ3JPmDNvbVtsut7TeGv5ygxOcCW6tqQ9vnfcALgH9u+/xz6/9AkvVJfprk5iQvGKrn/CRXJLk0yU+B1yc5OslXW01fajVeOrTPiUm+kWRrkluTvGhnx2/1bQFO3P2/DfXIcNeonAQcCFy5kzlvZxBaxwPHAScA79iFY5wBvBs4FFgHvA+gql7Yxo+rqoOq6rIJ9v1j4AfbN6rq7cDXgHPaPue0oZtafYcxOIv+9yQHDj3OcuAKYD7w8TbnRuBw4HzgtdsnJlkEfB54b3u8NwGfTjK2k+MD3MHgz0d6lOGuUTkc+ElVPbyTOa8B3lNVm6pqM4Ogfu1O5o93ZVXd2I7xcQYhPF3zgQenmlRVl1bV/VX1cFX9I3AA8IdDU75ZVZ+tqkeAMeA5wDur6ldV9XVgzdDcvwKurqqrq+qRqroGWAucOkUZD7Z6pUcZ7hqV+4EFU6xDPxW4e2j77tY3XfcOtR8CDtqFfbcAB081KcmbktyRZFuSrcAhwIKhKeuH2k8FHqiqhyYZfxrwyrYks7U93vOBI6Yo42Bg61S1at9iuGtUvgn8EjhtJ3N+zCDwtjuq9QH8HHjS9oEkT5nh+m4Dnj6ub4ePUG3r628BTgcOrar5wDYgk+xzD3BYkicN9R051F4PfKyq5g/dnlxVF0x0/CF/BNw6jeekfYjhrpGoqm3AO4EPJTktyZOSPCHJKUn+oU37JPCOdr35gjZ/+4uPtwLPTHJ8W+M+fxdLuA/4/Z2M3wjMb+vgk+1zMPAwsBmYl+SdwO9O9oBVdTeDZZbzk+yf5CTg5UNTLgVenuQlSfZLcmCSFyVZPFnNrb7DgOt38ly0DzLcNTJtjfpcBi+SbmZw5noO8Nk25b0MwvA24DvAt1ofVfVDBlfbfAm4E9jhyplpOB9Y3ZY/Tp+gtl8BlzBYB9/uA8ArkmxJ8kHgi8AXgB8yWDL6BTsus0zkNQxeTL6/PZfLGPwGQ1WtZ/AC7Nt47M/jzTz273T88QFeDaxu17xLj4pf1iFNrL3z82vAs7a/kWkWjnEZ8P2qetdu7HsAg99gXlhVm2a8OO3VDHdpDiV5DvAA8D/AnzH4LeWkqvr2KOtSf2bjHXOSJvcU4DMMLgXdAPyNwa7Z4Jm7JHXIF1QlqUOGuyR16HGx5r5gwYJasmTJqMuQpL3KzTff/JOqmvDz/B8X4b5kyRLWrl076jIkaa+S5O7JxlyWkaQOGe6S1CHDXZI6ZLhLUocMd0nq0LTCPcmPknwnyS1J1ra+w5Jck+TOdn9o60+SDyZZl+S2JM+ezScgSfptu3Lm/idVdXxVLWvb5wHXVtVS4Nq2DXAKsLTdVgIfnqliJUnTsyfLMsuB1a29mse+UWc58NEauJ7BFx5M9TVhkqQZNN03MRXwX0kK+NeqWgUsrKp72vi9wMLWXsSOX1iwofXdM9RHkpUMzuw56qijdq/6ObbkvM+PuoSu/OiCl426hG74szmzevjZnG64P7+qNib5PeCaJN8fHqyqasE/be0/iFUAy5Yt86MpJWkGTWtZpqo2tvtNwJXACcB925db2v32b4LZyI5f+ru49UmS5siU4Z7kyUkO3t5m8O0x3wXWACvatBXAVa29Bnhdu2rmRGDb0PKNJGkOTGdZZiFwZZLt8z9RVV9IchNweZKzGHw58PYvGb4aOBVYBzwEnDnjVUuSdmrKcK+qu4DjJui/Hzh5gv4Czp6R6iRJu8V3qEpShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6NO1wT7Jfkm8n+VzbPjrJDUnWJbksyf6t/4C2va6NL5ml2iVJk9iVM/c3AHcMbb8fuLCqjgG2AGe1/rOALa3/wjZPkjSHphXuSRYDLwP+rW0HeDFwRZuyGjittZe3bdr4yW2+JGmOTPfM/SLgLcAjbftwYGtVPdy2NwCLWnsRsB6gjW9r8yVJc2TKcE/y58Cmqrp5Jg+cZGWStUnWbt68eSYfWpL2edM5c38e8BdJfgR8isFyzAeA+UnmtTmLgY2tvRE4EqCNHwLcP/5Bq2pVVS2rqmVjY2N79CQkSTuaMtyr6u+qanFVLQHOAL5cVa8BvgK8ok1bAVzV2mvaNm38y1VVM1q1JGmn9uQ697cC5yZZx2BN/eLWfzFweOs/Fzhvz0qUJO2qeVNPeUxVXQdc19p3ASdMMOcXwCtnoDZJ0m7yHaqS1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDk0Z7kkOTHJjkluT3J7k3a3/6CQ3JFmX5LIk+7f+A9r2uja+ZJafgyRpnOmcuf8SeHFVHQccD7w0yYnA+4ELq+oYYAtwVpt/FrCl9V/Y5kmS5tCU4V4DP2ubT2i3Al4MXNH6VwOntfbytk0bPzlJZqpgSdLUprXmnmS/JLcAm4BrgP8GtlbVw23KBmBRay8C1gO08W3A4TNYsyRpCtMK96r6TVUdDywGTgCesacHTrIyydokazdv3rynDydJGrJLV8tU1VbgK8BJwPwk89rQYmBja28EjgRo44cA90/wWKuqallVLRsbG9u96iVJE5rO1TJjSea39hOBPwXuYBDyr2jTVgBXtfaatk0b/3JV1QzWLEmawrypp3AEsDrJfgz+M7i8qj6X5HvAp5K8F/g2cHGbfzHwsSTrgAeAM2ahbknSTkwZ7lV1G/CsCfrvYrD+Pr7/F8ArZ6Q6SdJu8R2qktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUoSnDPcmRSb6S5HtJbk/yhtZ/WJJrktzZ7g9t/UnywSTrktyW5Nmz/SQkSTuazpn7w8DfVtWxwInA2UmOBc4Drq2qpcC1bRvgFGBpu60EPjzjVUuSdmrKcK+qe6rqW639IHAHsAhYDqxu01YDp7X2cuCjNXA9MD/JETNduCRpcru05p5kCfAs4AZgYVXd04buBRa29iJg/dBuG1qfJGmOTDvckxwEfBp4Y1X9dHisqgqoXTlwkpVJ1iZZu3nz5l3ZVZI0hWmFe5InMAj2j1fVZ1r3fduXW9r9pta/EThyaPfFrW8HVbWqqpZV1bKxsbHdrV+SNIHpXC0T4GLgjqr6p6GhNcCK1l4BXDXU/7p21cyJwLah5RtJ0hyYN405zwNeC3wnyS2t723ABcDlSc4C7gZOb2NXA6cC64CHgDNnsmBJ0tSmDPeq+jqQSYZPnmB+AWfvYV2SpD3gO1QlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHZoy3JN8JMmmJN8d6jssyTVJ7mz3h7b+JPlgknVJbkvy7NksXpI0semcuV8CvHRc33nAtVW1FLi2bQOcAixtt5XAh2emTEnSrpgy3Kvqq8AD47qXA6tbezVw2lD/R2vgemB+kiNmqFZJ0jTt7pr7wqq6p7XvBRa29iJg/dC8Da1PkjSH9vgF1aoqoHZ1vyQrk6xNsnbz5s17WoYkacjuhvt925db2v2m1r8ROHJo3uLW91uqalVVLauqZWNjY7tZhiRpIrsb7muAFa29ArhqqP917aqZE4FtQ8s3kqQ5Mm+qCUk+CbwIWJBkA/Au4ALg8iRnAXcDp7fpVwOnAuuAh4AzZ6FmSdIUpgz3qnrVJEMnTzC3gLP3tChJ0p7xHaqS1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktShWQn3JC9N8oMk65KcNxvHkCRNbsbDPcl+wIeAU4BjgVclOXamjyNJmtxsnLmfAKyrqruq6lfAp4Dls3AcSdIk5s3CYy4C1g9tbwCeO35SkpXAyrb5syQ/mIVa9lULgJ+Muoip5P2jrkAj4M/mzHraZAOzEe7TUlWrgFWjOn7PkqytqmWjrkMaz5/NuTMbyzIbgSOHthe3PknSHJmNcL8JWJrk6CT7A2cAa2bhOJKkScz4skxVPZzkHOCLwH7AR6rq9pk+jnbK5S49XvmzOUdSVaOuQZI0w3yHqiR1yHCXpA4Z7pLUoZFd566ZkeQZDN4BvKh1bQTWVNUdo6tK0qh55r4XS/JWBh/vEODGdgvwST+wTY9nSc4cdQ2982qZvViSHwLPrKpfj+vfH7i9qpaOpjJp55L8b1UdNeo6euayzN7tEeCpwN3j+o9oY9LIJLltsiFg4VzWsi8y3PdubwSuTXInj31Y21HAMcA5oypKahYCLwG2jOsP8I25L2ffYrjvxarqC0mezuBjlodfUL2pqn4zusokAD4HHFRVt4wfSHLdnFezj3HNXZI65NUyktQhw12SOmS4S1KHDHdJ6pDhLkkd+n+rPQ6LBFTagQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "df_class_1_over = df_class_1.sample(count_class_0, replace=True)\n", "\n", "df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)\n", "\n", "print('Random over-sampling:')\n", "print(df_test_over['class'].value_counts())\n", "\n", "df_test_over['class'].value_counts().plot(kind='bar', title='Count (target)');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Over sampling the minority call y=0 using SMOTE" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 500\n", "1 268\n", "Name: class, dtype: int64\n", "SMOTE over-sampling:\n", "0 500\n", "1 500\n", "Name: class, dtype: int64\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEFCAYAAAAYKqc0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAQKklEQVR4nO3df6zddX3H8edrVECFUaB3FVuwbNQ5zAKaihB/xEk2BefKH8pQp5WwNFkg0TB/MDWKRhdcsoFmzqwZhir+gKFIp0yHKFGj/CgKKKLSMVlbgVZoK8r8gbz3x/kUTq/39t62995DP30+kpPz+X4+n+/5vk97++r3fs73nJOqQpLUl98ZdQGSpJlnuEtShwx3SeqQ4S5JHTLcJalDhrskdchwlyaRZCzJ95M8cdS1TCTJAa2+sVHXoscfw10jleTVSdYm+VmSe5L8Z5Lnz8FxK8kxU0w7D7ikqv6v7XNdkr+e7domM/74VfVL4CMM6pR2YLhrZJKcC1wE/D2wEDgK+Bdg+QjLAgZnxcAK4NIZfMx5M/VYQz4BrGj1So8y3DUSSQ4B3gOcXVWfqaqfV9Wvq+o/qurNbc4BSS5K8uN2u2h7iCV5fZKvj3vMR8/Gk1yS5ENJPp/kwSQ3JPmDNvbVtsut7TeGv5ygxOcCW6tqQ9vnfcALgH9u+/xz6/9AkvVJfprk5iQvGKrn/CRXJLk0yU+B1yc5OslXW01fajVeOrTPiUm+kWRrkluTvGhnx2/1bQFO3P2/DfXIcNeonAQcCFy5kzlvZxBaxwPHAScA79iFY5wBvBs4FFgHvA+gql7Yxo+rqoOq6rIJ9v1j4AfbN6rq7cDXgHPaPue0oZtafYcxOIv+9yQHDj3OcuAKYD7w8TbnRuBw4HzgtdsnJlkEfB54b3u8NwGfTjK2k+MD3MHgz0d6lOGuUTkc+ElVPbyTOa8B3lNVm6pqM4Ogfu1O5o93ZVXd2I7xcQYhPF3zgQenmlRVl1bV/VX1cFX9I3AA8IdDU75ZVZ+tqkeAMeA5wDur6ldV9XVgzdDcvwKurqqrq+qRqroGWAucOkUZD7Z6pUcZ7hqV+4EFU6xDPxW4e2j77tY3XfcOtR8CDtqFfbcAB081KcmbktyRZFuSrcAhwIKhKeuH2k8FHqiqhyYZfxrwyrYks7U93vOBI6Yo42Bg61S1at9iuGtUvgn8EjhtJ3N+zCDwtjuq9QH8HHjS9oEkT5nh+m4Dnj6ub4ePUG3r628BTgcOrar5wDYgk+xzD3BYkicN9R051F4PfKyq5g/dnlxVF0x0/CF/BNw6jeekfYjhrpGoqm3AO4EPJTktyZOSPCHJKUn+oU37JPCOdr35gjZ/+4uPtwLPTHJ8W+M+fxdLuA/4/Z2M3wjMb+vgk+1zMPAwsBmYl+SdwO9O9oBVdTeDZZbzk+yf5CTg5UNTLgVenuQlSfZLcmCSFyVZPFnNrb7DgOt38ly0DzLcNTJtjfpcBi+SbmZw5noO8Nk25b0MwvA24DvAt1ofVfVDBlfbfAm4E9jhyplpOB9Y3ZY/Tp+gtl8BlzBYB9/uA8ArkmxJ8kHgi8AXgB8yWDL6BTsus0zkNQxeTL6/PZfLGPwGQ1WtZ/AC7Nt47M/jzTz273T88QFeDaxu17xLj4pf1iFNrL3z82vAs7a/kWkWjnEZ8P2qetdu7HsAg99gXlhVm2a8OO3VDHdpDiV5DvAA8D/AnzH4LeWkqvr2KOtSf2bjHXOSJvcU4DMMLgXdAPyNwa7Z4Jm7JHXIF1QlqUOGuyR16HGx5r5gwYJasmTJqMuQpL3KzTff/JOqmvDz/B8X4b5kyRLWrl076jIkaa+S5O7JxlyWkaQOGe6S1CHDXZI6ZLhLUocMd0nq0LTCPcmPknwnyS1J1ra+w5Jck+TOdn9o60+SDyZZl+S2JM+ezScgSfptu3Lm/idVdXxVLWvb5wHXVtVS4Nq2DXAKsLTdVgIfnqliJUnTsyfLMsuB1a29mse+UWc58NEauJ7BFx5M9TVhkqQZNN03MRXwX0kK+NeqWgUsrKp72vi9wMLWXsSOX1iwofXdM9RHkpUMzuw56qijdq/6ObbkvM+PuoSu/OiCl426hG74szmzevjZnG64P7+qNib5PeCaJN8fHqyqasE/be0/iFUAy5Yt86MpJWkGTWtZpqo2tvtNwJXACcB925db2v32b4LZyI5f+ru49UmS5siU4Z7kyUkO3t5m8O0x3wXWACvatBXAVa29Bnhdu2rmRGDb0PKNJGkOTGdZZiFwZZLt8z9RVV9IchNweZKzGHw58PYvGb4aOBVYBzwEnDnjVUuSdmrKcK+qu4DjJui/Hzh5gv4Czp6R6iRJu8V3qEpShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6NO1wT7Jfkm8n+VzbPjrJDUnWJbksyf6t/4C2va6NL5ml2iVJk9iVM/c3AHcMbb8fuLCqjgG2AGe1/rOALa3/wjZPkjSHphXuSRYDLwP+rW0HeDFwRZuyGjittZe3bdr4yW2+JGmOTPfM/SLgLcAjbftwYGtVPdy2NwCLWnsRsB6gjW9r8yVJc2TKcE/y58Cmqrp5Jg+cZGWStUnWbt68eSYfWpL2edM5c38e8BdJfgR8isFyzAeA+UnmtTmLgY2tvRE4EqCNHwLcP/5Bq2pVVS2rqmVjY2N79CQkSTuaMtyr6u+qanFVLQHOAL5cVa8BvgK8ok1bAVzV2mvaNm38y1VVM1q1JGmn9uQ697cC5yZZx2BN/eLWfzFweOs/Fzhvz0qUJO2qeVNPeUxVXQdc19p3ASdMMOcXwCtnoDZJ0m7yHaqS1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDk0Z7kkOTHJjkluT3J7k3a3/6CQ3JFmX5LIk+7f+A9r2uja+ZJafgyRpnOmcuf8SeHFVHQccD7w0yYnA+4ELq+oYYAtwVpt/FrCl9V/Y5kmS5tCU4V4DP2ubT2i3Al4MXNH6VwOntfbytk0bPzlJZqpgSdLUprXmnmS/JLcAm4BrgP8GtlbVw23KBmBRay8C1gO08W3A4TNYsyRpCtMK96r6TVUdDywGTgCesacHTrIyydokazdv3rynDydJGrJLV8tU1VbgK8BJwPwk89rQYmBja28EjgRo44cA90/wWKuqallVLRsbG9u96iVJE5rO1TJjSea39hOBPwXuYBDyr2jTVgBXtfaatk0b/3JV1QzWLEmawrypp3AEsDrJfgz+M7i8qj6X5HvAp5K8F/g2cHGbfzHwsSTrgAeAM2ahbknSTkwZ7lV1G/CsCfrvYrD+Pr7/F8ArZ6Q6SdJu8R2qktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUoSnDPcmRSb6S5HtJbk/yhtZ/WJJrktzZ7g9t/UnywSTrktyW5Nmz/SQkSTuazpn7w8DfVtWxwInA2UmOBc4Drq2qpcC1bRvgFGBpu60EPjzjVUuSdmrKcK+qe6rqW639IHAHsAhYDqxu01YDp7X2cuCjNXA9MD/JETNduCRpcru05p5kCfAs4AZgYVXd04buBRa29iJg/dBuG1qfJGmOTDvckxwEfBp4Y1X9dHisqgqoXTlwkpVJ1iZZu3nz5l3ZVZI0hWmFe5InMAj2j1fVZ1r3fduXW9r9pta/EThyaPfFrW8HVbWqqpZV1bKxsbHdrV+SNIHpXC0T4GLgjqr6p6GhNcCK1l4BXDXU/7p21cyJwLah5RtJ0hyYN405zwNeC3wnyS2t723ABcDlSc4C7gZOb2NXA6cC64CHgDNnsmBJ0tSmDPeq+jqQSYZPnmB+AWfvYV2SpD3gO1QlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHZoy3JN8JMmmJN8d6jssyTVJ7mz3h7b+JPlgknVJbkvy7NksXpI0semcuV8CvHRc33nAtVW1FLi2bQOcAixtt5XAh2emTEnSrpgy3Kvqq8AD47qXA6tbezVw2lD/R2vgemB+kiNmqFZJ0jTt7pr7wqq6p7XvBRa29iJg/dC8Da1PkjSH9vgF1aoqoHZ1vyQrk6xNsnbz5s17WoYkacjuhvt925db2v2m1r8ROHJo3uLW91uqalVVLauqZWNjY7tZhiRpIrsb7muAFa29ArhqqP917aqZE4FtQ8s3kqQ5Mm+qCUk+CbwIWJBkA/Au4ALg8iRnAXcDp7fpVwOnAuuAh4AzZ6FmSdIUpgz3qnrVJEMnTzC3gLP3tChJ0p7xHaqS1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktQhw12SOmS4S1KHDHdJ6pDhLkkdMtwlqUOGuyR1yHCXpA4Z7pLUIcNdkjpkuEtShwx3SeqQ4S5JHTLcJalDhrskdchwl6QOGe6S1CHDXZI6ZLhLUocMd0nqkOEuSR0y3CWpQ4a7JHXIcJekDhnuktShWQn3JC9N8oMk65KcNxvHkCRNbsbDPcl+wIeAU4BjgVclOXamjyNJmtxsnLmfAKyrqruq6lfAp4Dls3AcSdIk5s3CYy4C1g9tbwCeO35SkpXAyrb5syQ/mIVa9lULgJ+Muoip5P2jrkAj4M/mzHraZAOzEe7TUlWrgFWjOn7PkqytqmWjrkMaz5/NuTMbyzIbgSOHthe3PknSHJmNcL8JWJrk6CT7A2cAa2bhOJKkScz4skxVPZzkHOCLwH7AR6rq9pk+jnbK5S49XvmzOUdSVaOuQZI0w3yHqiR1yHCXpA4Z7pLUoZFd566ZkeQZDN4BvKh1bQTWVNUdo6tK0qh55r4XS/JWBh/vEODGdgvwST+wTY9nSc4cdQ2982qZvViSHwLPrKpfj+vfH7i9qpaOpjJp55L8b1UdNeo6euayzN7tEeCpwN3j+o9oY9LIJLltsiFg4VzWsi8y3PdubwSuTXInj31Y21HAMcA5oypKahYCLwG2jOsP8I25L2ffYrjvxarqC0mezuBjlodfUL2pqn4zusokAD4HHFRVt4wfSHLdnFezj3HNXZI65NUyktQhw12SOmS4S1KHDHdJ6pDhLkkd+n+rPQ6LBFTagQAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from imblearn.over_sampling import SMOTE\n", "\n", "print(data['class'].value_counts())\n", "X = data.drop('class', axis=1)\n", "Y = data['class']\n", "\n", "sm = SMOTE(random_state=42)\n", "X_res, Y_res = sm.fit_resample(X, Y)\n", "\n", "df_smote_over = pd.concat([pd.DataFrame(X_res), pd.DataFrame(Y_res, columns=['class'])], axis=1)\n", "\n", "print('SMOTE over-sampling:')\n", "print(df_smote_over['class'].value_counts())\n", "\n", "df_smote_over['class'].value_counts().plot(kind='bar', title='Count (target)');" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 4 }