{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "90d1208c-18ee-43b2-aafb-c79d0b862687",
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"stemmer = PorterStemmer()\n",
"lemmatizer = WordNetLemmatizer()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "583799b8-54f4-4faa-83a0-8d5da9ed6c1f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Label | \n",
" Message | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Label, Message]\n",
"Index: []"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"full_data = pd.DataFrame({'Label':[], 'Message':[]})\n",
"full_data"
]
},
{
"cell_type": "markdown",
"id": "c7a03b9c-ae0c-49d0-b65c-73aa0b12f773",
"metadata": {},
"source": [
"# Dataset 1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ab2c7f73-dce3-4c31-848b-741c3b68c418",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" v1 | \n",
" v2 | \n",
" Unnamed: 2 | \n",
" Unnamed: 3 | \n",
" Unnamed: 4 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ham | \n",
" Go until jurong point, crazy.. Available only ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" ham | \n",
" Ok lar... Joking wif u oni... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" spam | \n",
" Free entry in 2 a wkly comp to win FA Cup fina... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" ham | \n",
" U dun say so early hor... U c already then say... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" ham | \n",
" Nah I don't think he goes to usf, he lives aro... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" v1 v2 Unnamed: 2 \\\n",
"0 ham Go until jurong point, crazy.. Available only ... NaN \n",
"1 ham Ok lar... Joking wif u oni... NaN \n",
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n",
"3 ham U dun say so early hor... U c already then say... NaN \n",
"4 ham Nah I don't think he goes to usf, he lives aro... NaN \n",
"\n",
" Unnamed: 3 Unnamed: 4 \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv(\"spam_data/spam_data_1.csv\", encoding='Windows-1252')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "347db751-7bd6-4d6f-8cfd-4456a69ebc90",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_24436\\3848975045.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" data['v1'] = data['v1'].replace(to_replace=['ham', 'spam'], value=[1, 0]).astype(int)\n"
]
}
],
"source": [
"data['v1'] = data['v1'].replace(to_replace=['ham', 'spam'], value=[1, 0]).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b4e18778-0900-4114-b405-92433f686d85",
"metadata": {},
"outputs": [],
"source": [
"for i in range(len(data)):\n",
" review = re.sub('[^a-zA-Z]', ' ', data['v2'][i])\n",
" review = review.lower()\n",
" review = review.split()\n",
" review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]\n",
" review = ' '.join(review)\n",
" data.loc[i, 'v2'] = review "
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e7e5d709-fce0-459f-8df8-4daa7ec7f1e2",
"metadata": {},
"outputs": [],
"source": [
"data = data[['v1', 'v2']]\n",
"data = data.rename(columns={'v1':'Label', 'v2':'Message'})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d286a7a2-8bd7-4f3b-b6f1-71197c5dd234",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Label | \n",
" Message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" go jurong point crazy available bugis n great ... | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" ok lar joking wif u oni | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" free entry wkly comp win fa cup final tkts st ... | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" u dun say early hor u c already say | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" nah think go usf life around though | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 5567 | \n",
" 0 | \n",
" nd time tried contact u u pound prize claim ea... | \n",
"
\n",
" \n",
" 5568 | \n",
" 1 | \n",
" b going esplanade fr home | \n",
"
\n",
" \n",
" 5569 | \n",
" 1 | \n",
" pity mood suggestion | \n",
"
\n",
" \n",
" 5570 | \n",
" 1 | \n",
" guy bitching acted like interested buying some... | \n",
"
\n",
" \n",
" 5571 | \n",
" 1 | \n",
" rofl true name | \n",
"
\n",
" \n",
"
\n",
"
5572 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Label Message\n",
"0 1 go jurong point crazy available bugis n great ...\n",
"1 1 ok lar joking wif u oni\n",
"2 0 free entry wkly comp win fa cup final tkts st ...\n",
"3 1 u dun say early hor u c already say\n",
"4 1 nah think go usf life around though\n",
"... ... ...\n",
"5567 0 nd time tried contact u u pound prize claim ea...\n",
"5568 1 b going esplanade fr home\n",
"5569 1 pity mood suggestion\n",
"5570 1 guy bitching acted like interested buying some...\n",
"5571 1 rofl true name\n",
"\n",
"[5572 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d337a825-5af8-40b5-88b1-f6ec7ca5bca2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Label | \n",
" Message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" go jurong point crazy available bugis n great ... | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" ok lar joking wif u oni | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" free entry wkly comp win fa cup final tkts st ... | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" u dun say early hor u c already say | \n",
"
\n",
" \n",
" 4 | \n",
" 1.0 | \n",
" nah think go usf life around though | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 5567 | \n",
" 0.0 | \n",
" nd time tried contact u u pound prize claim ea... | \n",
"
\n",
" \n",
" 5568 | \n",
" 1.0 | \n",
" b going esplanade fr home | \n",
"
\n",
" \n",
" 5569 | \n",
" 1.0 | \n",
" pity mood suggestion | \n",
"
\n",
" \n",
" 5570 | \n",
" 1.0 | \n",
" guy bitching acted like interested buying some... | \n",
"
\n",
" \n",
" 5571 | \n",
" 1.0 | \n",
" rofl true name | \n",
"
\n",
" \n",
"
\n",
"
5572 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Label Message\n",
"0 1.0 go jurong point crazy available bugis n great ...\n",
"1 1.0 ok lar joking wif u oni\n",
"2 0.0 free entry wkly comp win fa cup final tkts st ...\n",
"3 1.0 u dun say early hor u c already say\n",
"4 1.0 nah think go usf life around though\n",
"... ... ...\n",
"5567 0.0 nd time tried contact u u pound prize claim ea...\n",
"5568 1.0 b going esplanade fr home\n",
"5569 1.0 pity mood suggestion\n",
"5570 1.0 guy bitching acted like interested buying some...\n",
"5571 1.0 rofl true name\n",
"\n",
"[5572 rows x 2 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"full_data = pd.concat([full_data, data], ignore_index=True)\n",
"full_data"
]
},
{
"cell_type": "markdown",
"id": "e903b4ef-f47c-4df6-9775-f920b9a91ad1",
"metadata": {},
"source": [
"# Dataset 2"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ea1cbd53-160c-481e-b911-7c03f672de9b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" email | \n",
" label | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" date wed NUMBER aug NUMBER NUMBER NUMBER NUMB... | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" martin a posted tassos papadopoulos the greek ... | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" man threatens explosion in moscow thursday aug... | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" klez the virus that won t die already the most... | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" in adding cream to spaghetti carbonara which ... | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2995 | \n",
" abc s good morning america ranks it the NUMBE... | \n",
" 1 | \n",
"
\n",
" \n",
" 2996 | \n",
" hyperlink hyperlink hyperlink let mortgage le... | \n",
" 1 | \n",
"
\n",
" \n",
" 2997 | \n",
" thank you for shopping with us gifts for all ... | \n",
" 1 | \n",
"
\n",
" \n",
" 2998 | \n",
" the famous ebay marketing e course learn to s... | \n",
" 1 | \n",
"
\n",
" \n",
" 2999 | \n",
" hello this is chinese traditional 子 件 NUMBER世... | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
3000 rows × 2 columns
\n",
"
"
],
"text/plain": [
" email label\n",
"0 date wed NUMBER aug NUMBER NUMBER NUMBER NUMB... 0\n",
"1 martin a posted tassos papadopoulos the greek ... 0\n",
"2 man threatens explosion in moscow thursday aug... 0\n",
"3 klez the virus that won t die already the most... 0\n",
"4 in adding cream to spaghetti carbonara which ... 0\n",
"... ... ...\n",
"2995 abc s good morning america ranks it the NUMBE... 1\n",
"2996 hyperlink hyperlink hyperlink let mortgage le... 1\n",
"2997 thank you for shopping with us gifts for all ... 1\n",
"2998 the famous ebay marketing e course learn to s... 1\n",
"2999 hello this is chinese traditional 子 件 NUMBER世... 1\n",
"\n",
"[3000 rows x 2 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('spam_data/spam_data_2.csv')\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "25352c21-ce85-4b17-90da-48fb4a959844",
"metadata": {},
"outputs": [],
"source": [
"data = data.dropna()\n",
"data = data.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7b07e8f2-be13-4d87-a5fd-eecaf2408f61",
"metadata": {},
"outputs": [],
"source": [
"data['label'] = data['label'].replace(to_replace=[0, 1], value=[1, 0]).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "aadb5b45-e08f-456c-940b-457936bf49f0",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"messages processed : 0\n",
"messages processed : 100\n",
"messages processed : 200\n",
"messages processed : 300\n",
"messages processed : 400\n",
"messages processed : 500\n",
"messages processed : 600\n",
"messages processed : 700\n",
"messages processed : 800\n",
"messages processed : 900\n",
"messages processed : 1000\n",
"messages processed : 1100\n",
"messages processed : 1200\n",
"messages processed : 1300\n",
"messages processed : 1400\n",
"messages processed : 1500\n",
"messages processed : 1600\n",
"messages processed : 1700\n",
"messages processed : 1800\n",
"messages processed : 1900\n",
"messages processed : 2000\n",
"messages processed : 2100\n",
"messages processed : 2200\n",
"messages processed : 2300\n",
"messages processed : 2400\n",
"messages processed : 2500\n",
"messages processed : 2600\n",
"messages processed : 2700\n",
"messages processed : 2800\n",
"messages processed : 2900\n"
]
}
],
"source": [
"for i in range(len(data)):\n",
" review = re.sub('[^a-zA-Z]', ' ', data['email'][i])\n",
" review = review.lower()\n",
" review = review.split()\n",
" review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]\n",
" review = ' '.join(review)\n",
" data.loc[i, 'email'] = review\n",
" if i%100==0:\n",
" print('messages processed :' ,i)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fbe8e35f-979c-4a93-abab-a2dd7be2eee5",
"metadata": {},
"outputs": [],
"source": [
"data = data[['label', 'email']]\n",
"data = data.rename(columns={'label':'Label', 'email':'Message'})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "14abf300-2450-43e8-8473-2d80ba810889",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Label | \n",
" Message | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" go jurong point crazy available bugis n great ... | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" ok lar joking wif u oni | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" free entry wkly comp win fa cup final tkts st ... | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" u dun say early hor u c already say | \n",
"
\n",
" \n",
" 4 | \n",
" 1.0 | \n",
" nah think go usf life around though | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 8566 | \n",
" 0.0 | \n",
" abc good morning america rank number christmas... | \n",
"
\n",
" \n",
" 8567 | \n",
" 0.0 | \n",
" hyperlink hyperlink hyperlink let mortgage len... | \n",
"
\n",
" \n",
" 8568 | \n",
" 0.0 | \n",
" thank shopping u gift occasion free gift numbe... | \n",
"
\n",
" \n",
" 8569 | \n",
" 0.0 | \n",
" famous ebay marketing e course learn sell comp... | \n",
"
\n",
" \n",
" 8570 | \n",
" 0.0 | \n",
" hello chinese traditional number number f r v ... | \n",
"
\n",
" \n",
"
\n",
"
8571 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Label Message\n",
"0 1.0 go jurong point crazy available bugis n great ...\n",
"1 1.0 ok lar joking wif u oni\n",
"2 0.0 free entry wkly comp win fa cup final tkts st ...\n",
"3 1.0 u dun say early hor u c already say\n",
"4 1.0 nah think go usf life around though\n",
"... ... ...\n",
"8566 0.0 abc good morning america rank number christmas...\n",
"8567 0.0 hyperlink hyperlink hyperlink let mortgage len...\n",
"8568 0.0 thank shopping u gift occasion free gift numbe...\n",
"8569 0.0 famous ebay marketing e course learn sell comp...\n",
"8570 0.0 hello chinese traditional number number f r v ...\n",
"\n",
"[8571 rows x 2 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"full_data = pd.concat([full_data, data], ignore_index=True)\n",
"full_data"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "bf09a641-c36f-434d-8d5a-3fd62ae1dac8",
"metadata": {},
"outputs": [],
"source": [
"full_data.to_csv('spam_data/full_data.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f84c43d-2915-4d99-b1c5-85cb65113c8c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}