{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "90d1208c-18ee-43b2-aafb-c79d0b862687", "metadata": {}, "outputs": [], "source": [ "import nltk\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import re\n", "from nltk.corpus import stopwords\n", "from nltk.stem import PorterStemmer\n", "from nltk.stem import WordNetLemmatizer\n", "\n", "stemmer = PorterStemmer()\n", "lemmatizer = WordNetLemmatizer()" ] }, { "cell_type": "code", "execution_count": 2, "id": "583799b8-54f4-4faa-83a0-8d5da9ed6c1f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LabelMessage
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Label, Message]\n", "Index: []" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_data = pd.DataFrame({'Label':[], 'Message':[]})\n", "full_data" ] }, { "cell_type": "markdown", "id": "c7a03b9c-ae0c-49d0-b65c-73aa0b12f773", "metadata": {}, "source": [ "# Dataset 1" ] }, { "cell_type": "code", "execution_count": 3, "id": "ab2c7f73-dce3-4c31-848b-741c3b68c418", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
v1v2Unnamed: 2Unnamed: 3Unnamed: 4
0hamGo until jurong point, crazy.. Available only ...NaNNaNNaN
1hamOk lar... Joking wif u oni...NaNNaNNaN
2spamFree entry in 2 a wkly comp to win FA Cup fina...NaNNaNNaN
3hamU dun say so early hor... U c already then say...NaNNaNNaN
4hamNah I don't think he goes to usf, he lives aro...NaNNaNNaN
\n", "
" ], "text/plain": [ " v1 v2 Unnamed: 2 \\\n", "0 ham Go until jurong point, crazy.. Available only ... NaN \n", "1 ham Ok lar... Joking wif u oni... NaN \n", "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n", "3 ham U dun say so early hor... U c already then say... NaN \n", "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n", "\n", " Unnamed: 3 Unnamed: 4 \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(\"spam_data/spam_data_1.csv\", encoding='Windows-1252')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "347db751-7bd6-4d6f-8cfd-4456a69ebc90", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_24436\\3848975045.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " data['v1'] = data['v1'].replace(to_replace=['ham', 'spam'], value=[1, 0]).astype(int)\n" ] } ], "source": [ "data['v1'] = data['v1'].replace(to_replace=['ham', 'spam'], value=[1, 0]).astype(int)" ] }, { "cell_type": "code", "execution_count": 5, "id": "b4e18778-0900-4114-b405-92433f686d85", "metadata": {}, "outputs": [], "source": [ "for i in range(len(data)):\n", " review = re.sub('[^a-zA-Z]', ' ', data['v2'][i])\n", " review = review.lower()\n", " review = review.split()\n", " review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]\n", " review = ' '.join(review)\n", " data.loc[i, 'v2'] = review " ] }, { "cell_type": "code", "execution_count": 6, "id": "e7e5d709-fce0-459f-8df8-4daa7ec7f1e2", "metadata": {}, "outputs": [], "source": [ "data = data[['v1', 'v2']]\n", "data = data.rename(columns={'v1':'Label', 'v2':'Message'})" ] }, { "cell_type": "code", "execution_count": 7, "id": "d286a7a2-8bd7-4f3b-b6f1-71197c5dd234", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LabelMessage
01go jurong point crazy available bugis n great ...
11ok lar joking wif u oni
20free entry wkly comp win fa cup final tkts st ...
31u dun say early hor u c already say
41nah think go usf life around though
.........
55670nd time tried contact u u pound prize claim ea...
55681b going esplanade fr home
55691pity mood suggestion
55701guy bitching acted like interested buying some...
55711rofl true name
\n", "

5572 rows × 2 columns

\n", "
" ], "text/plain": [ " Label Message\n", "0 1 go jurong point crazy available bugis n great ...\n", "1 1 ok lar joking wif u oni\n", "2 0 free entry wkly comp win fa cup final tkts st ...\n", "3 1 u dun say early hor u c already say\n", "4 1 nah think go usf life around though\n", "... ... ...\n", "5567 0 nd time tried contact u u pound prize claim ea...\n", "5568 1 b going esplanade fr home\n", "5569 1 pity mood suggestion\n", "5570 1 guy bitching acted like interested buying some...\n", "5571 1 rofl true name\n", "\n", "[5572 rows x 2 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 8, "id": "d337a825-5af8-40b5-88b1-f6ec7ca5bca2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LabelMessage
01.0go jurong point crazy available bugis n great ...
11.0ok lar joking wif u oni
20.0free entry wkly comp win fa cup final tkts st ...
31.0u dun say early hor u c already say
41.0nah think go usf life around though
.........
55670.0nd time tried contact u u pound prize claim ea...
55681.0b going esplanade fr home
55691.0pity mood suggestion
55701.0guy bitching acted like interested buying some...
55711.0rofl true name
\n", "

5572 rows × 2 columns

\n", "
" ], "text/plain": [ " Label Message\n", "0 1.0 go jurong point crazy available bugis n great ...\n", "1 1.0 ok lar joking wif u oni\n", "2 0.0 free entry wkly comp win fa cup final tkts st ...\n", "3 1.0 u dun say early hor u c already say\n", "4 1.0 nah think go usf life around though\n", "... ... ...\n", "5567 0.0 nd time tried contact u u pound prize claim ea...\n", "5568 1.0 b going esplanade fr home\n", "5569 1.0 pity mood suggestion\n", "5570 1.0 guy bitching acted like interested buying some...\n", "5571 1.0 rofl true name\n", "\n", "[5572 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_data = pd.concat([full_data, data], ignore_index=True)\n", "full_data" ] }, { "cell_type": "markdown", "id": "e903b4ef-f47c-4df6-9775-f920b9a91ad1", "metadata": {}, "source": [ "# Dataset 2" ] }, { "cell_type": "code", "execution_count": 9, "id": "ea1cbd53-160c-481e-b911-7c03f672de9b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
emaillabel
0date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...0
1martin a posted tassos papadopoulos the greek ...0
2man threatens explosion in moscow thursday aug...0
3klez the virus that won t die already the most...0
4in adding cream to spaghetti carbonara which ...0
.........
2995abc s good morning america ranks it the NUMBE...1
2996hyperlink hyperlink hyperlink let mortgage le...1
2997thank you for shopping with us gifts for all ...1
2998the famous ebay marketing e course learn to s...1
2999hello this is chinese traditional 子 件 NUMBER世...1
\n", "

3000 rows × 2 columns

\n", "
" ], "text/plain": [ " email label\n", "0 date wed NUMBER aug NUMBER NUMBER NUMBER NUMB... 0\n", "1 martin a posted tassos papadopoulos the greek ... 0\n", "2 man threatens explosion in moscow thursday aug... 0\n", "3 klez the virus that won t die already the most... 0\n", "4 in adding cream to spaghetti carbonara which ... 0\n", "... ... ...\n", "2995 abc s good morning america ranks it the NUMBE... 1\n", "2996 hyperlink hyperlink hyperlink let mortgage le... 1\n", "2997 thank you for shopping with us gifts for all ... 1\n", "2998 the famous ebay marketing e course learn to s... 1\n", "2999 hello this is chinese traditional 子 件 NUMBER世... 1\n", "\n", "[3000 rows x 2 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv('spam_data/spam_data_2.csv')\n", "data" ] }, { "cell_type": "code", "execution_count": 10, "id": "25352c21-ce85-4b17-90da-48fb4a959844", "metadata": {}, "outputs": [], "source": [ "data = data.dropna()\n", "data = data.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 11, "id": "7b07e8f2-be13-4d87-a5fd-eecaf2408f61", "metadata": {}, "outputs": [], "source": [ "data['label'] = data['label'].replace(to_replace=[0, 1], value=[1, 0]).astype(int)" ] }, { "cell_type": "code", "execution_count": 12, "id": "aadb5b45-e08f-456c-940b-457936bf49f0", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "messages processed : 0\n", "messages processed : 100\n", "messages processed : 200\n", "messages processed : 300\n", "messages processed : 400\n", "messages processed : 500\n", "messages processed : 600\n", "messages processed : 700\n", "messages processed : 800\n", "messages processed : 900\n", "messages processed : 1000\n", "messages processed : 1100\n", "messages processed : 1200\n", "messages processed : 1300\n", "messages processed : 1400\n", "messages processed : 1500\n", "messages processed : 1600\n", "messages processed : 1700\n", "messages processed : 1800\n", "messages processed : 1900\n", "messages processed : 2000\n", "messages processed : 2100\n", "messages processed : 2200\n", "messages processed : 2300\n", "messages processed : 2400\n", "messages processed : 2500\n", "messages processed : 2600\n", "messages processed : 2700\n", "messages processed : 2800\n", "messages processed : 2900\n" ] } ], "source": [ "for i in range(len(data)):\n", " review = re.sub('[^a-zA-Z]', ' ', data['email'][i])\n", " review = review.lower()\n", " review = review.split()\n", " review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]\n", " review = ' '.join(review)\n", " data.loc[i, 'email'] = review\n", " if i%100==0:\n", " print('messages processed :' ,i)" ] }, { "cell_type": "code", "execution_count": 13, "id": "fbe8e35f-979c-4a93-abab-a2dd7be2eee5", "metadata": {}, "outputs": [], "source": [ "data = data[['label', 'email']]\n", "data = data.rename(columns={'label':'Label', 'email':'Message'})" ] }, { "cell_type": "code", "execution_count": 14, "id": "14abf300-2450-43e8-8473-2d80ba810889", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LabelMessage
01.0go jurong point crazy available bugis n great ...
11.0ok lar joking wif u oni
20.0free entry wkly comp win fa cup final tkts st ...
31.0u dun say early hor u c already say
41.0nah think go usf life around though
.........
85660.0abc good morning america rank number christmas...
85670.0hyperlink hyperlink hyperlink let mortgage len...
85680.0thank shopping u gift occasion free gift numbe...
85690.0famous ebay marketing e course learn sell comp...
85700.0hello chinese traditional number number f r v ...
\n", "

8571 rows × 2 columns

\n", "
" ], "text/plain": [ " Label Message\n", "0 1.0 go jurong point crazy available bugis n great ...\n", "1 1.0 ok lar joking wif u oni\n", "2 0.0 free entry wkly comp win fa cup final tkts st ...\n", "3 1.0 u dun say early hor u c already say\n", "4 1.0 nah think go usf life around though\n", "... ... ...\n", "8566 0.0 abc good morning america rank number christmas...\n", "8567 0.0 hyperlink hyperlink hyperlink let mortgage len...\n", "8568 0.0 thank shopping u gift occasion free gift numbe...\n", "8569 0.0 famous ebay marketing e course learn sell comp...\n", "8570 0.0 hello chinese traditional number number f r v ...\n", "\n", "[8571 rows x 2 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "full_data = pd.concat([full_data, data], ignore_index=True)\n", "full_data" ] }, { "cell_type": "code", "execution_count": 15, "id": "bf09a641-c36f-434d-8d5a-3fd62ae1dac8", "metadata": {}, "outputs": [], "source": [ "full_data.to_csv('spam_data/full_data.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "5f84c43d-2915-4d99-b1c5-85cb65113c8c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }