{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "9e7940e4-1ba3-4192-a82c-5008dc5ed698", "metadata": {}, "outputs": [], "source": [ "import gradio as gr\n", "from transformers import pipeline\n", "from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline" ] }, { "cell_type": "code", "execution_count": 9, "id": "54c98c8e-a9d2-453c-933a-6a945143361f", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "import sklearn" ] }, { "cell_type": "code", "execution_count": 10, "id": "4be240ca-2fe4-4d02-9955-c54d414fc795", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd" ] }, { "cell_type": "markdown", "id": "0e93873a-e119-4fb5-aa38-b8814098f758", "metadata": {}, "source": [ "### first we need to import our libraries dataset and convert them into .csv" ] }, { "cell_type": "code", "execution_count": 6, "id": "ad1e3444-0197-4a5f-945d-bee10e977ffe", "metadata": {}, "outputs": [], "source": [ "spam_folder = \"enron2/spam\"\n", "ham_folder = \"enron2/ham\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "262cfd88-9c03-4131-8e62-94dc9e5cce7e", "metadata": {}, "outputs": [], "source": [ "def load_emails(folder, label):\n", " emails = []\n", " for filename in os.listdir(folder):\n", " if filename.endswith(\".txt\"):\n", " with open(os.path.join(folder, filename), \"r\", encoding=\"utf-8\", errors=\"ignore\") as file:\n", " text = file.read()\n", " emails.append((text, label))\n", " return emails" ] }, { "cell_type": "code", "execution_count": 8, "id": "f54a8a8a-9e31-46b4-82c1-6111e158594b", "metadata": {}, "outputs": [], "source": [ "spam_emails = load_emails(spam_folder, label=1) \n", "ham_emails = load_emails(ham_folder, label=0) " ] }, { "cell_type": "code", "execution_count": 13, "id": "8229102b-ec10-47f3-9cc8-dc42f7c4d6e5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CSV file created successfully!\n" ] } ], "source": [ "df = pd.DataFrame(all_emails, columns=[\"text\", \"label\"])\n", "df.to_csv(\"dataset/emails.csv\", index=False, escapechar='\\\\', quotechar='\"')\n", "print(\"CSV file created successfully!\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "a5484a4b-6a93-40ea-993a-36118c5a31b1", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"dataset/emails.csv\")" ] }, { "cell_type": "markdown", "id": "643bbc72-bfaa-4e82-85f8-3f216bc9e843", "metadata": {}, "source": [ "### now we need to clean our messy dataset" ] }, { "cell_type": "code", "execution_count": 5, "id": "d778066a-5526-428c-a118-4e1178b206c1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import nltk\n", "from bs4 import BeautifulSoup\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from nltk.tokenize import word_tokenize\n", 
"nltk.download('punkt')\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')" ] }, { "cell_type": "code", "execution_count": 8, "id": "24c1784f-8262-487a-9868-44762dc976ea", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt_tab to\n", "[nltk_data] C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Unzipping tokenizers\\punkt_tab.zip.\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download('punkt_tab')" ] }, { "cell_type": "code", "execution_count": 6, "id": "e043a0fb-f7e0-4727-8aee-b8ecce88b835", "metadata": {}, "outputs": [], "source": [ "stop_words = set(stopwords.words('english'))\n", "def clean_text(text):\n", " # Lowercase\n", " text = text.lower()\n", " \n", " # Remove HTML\n", " text = BeautifulSoup(text, \"html.parser\").get_text()\n", " \n", " # Replace URLs and emails\n", " text = re.sub(r'http\\S+', '', text)\n", " text = re.sub(r'\\S+@\\S+', '', text)\n", " \n", " # Remove special characters (retain $ for monetary values)\n", " text = re.sub(r'[^\\w\\s$]', '', text)\n", " \n", " # Replace numbers (except after $)\n", " text = re.sub(r'\\b\\d+\\b', '', text)\n", " \n", " # Normalize whitespace\n", " text = re.sub(r'\\s+', ' ', text).strip()\n", "\n", " tokens = word_tokenize(text)\n", " lemmatizer = WordNetLemmatizer()\n", " tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]\n", "\n", " return ' '.join(tokens)" ] }, { "cell_type": "markdown", "id": "f60b360e-c425-47f2-ad9c-6caf1d04f8ba", "metadata": {}, "source": [ "### now we apply our nltk function to dataset" ] }, { "cell_type": "code", "execution_count": 9, "id": "90b9a80e-0937-46c0-8430-226fa0f4d0cf", "metadata": { "scrolled": true }, "outputs": [], "source": [ "df = pd.read_csv('Dataset/emails.csv')\n", "df['cleaned_text'] = df['text'].apply(clean_text)" ] }, { "cell_type": "code", "execution_count": 10, "id": "a425c6e7-5628-4af2-9f0f-c2fe8b7d5eb2", "metadata": {}, "outputs": [], "source": [ "df.to_csv('cleaned_emails.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 28, "id": "54941e16-8791-426e-ba66-d207b90d7876", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabelcleaned_text
0Subject: fw : this is the solution i mentioned...1subject fw solution mentioned lsc oo thank ema...
1Subject: advs\\ngreetings ,\\ni am benedicta lin...1subject advs greeting benedicta lindiwe hendri...
2Subject: whats new in summer ? bawled\\ncarolyn...1subject whats new summer bawled carolyn regret...
3Subject: \\nh $ ello\\ndea 54 r home owner ,\\nwe...1subject h $ ello dea < NUM > r home owner beet...
4Subject: : ) ) you can not save the world by q...1subject save world quitting smoking save self ...
5Subject: need software ? click here .\\ntop qua...1subject need software click top quality softwa...
6Subject: spend too much on your phone bill ? 2...1subject spend much phone bill < NUM > crystal ...
7Subject: slotting order confirmation may 18 , ...1subject slotting order confirmation may < NUM ...
8Subject: we shiip to ur country for mircosoft ...1subject shiip ur country mircosoft adobe norto...
9Subject: urgent business proposal ,\\nmrs . reg...1subject urgent business proposal mr regina ros...
\n", "
" ], "text/plain": [ " text label \\\n", "0 Subject: fw : this is the solution i mentioned... 1 \n", "1 Subject: advs\\ngreetings ,\\ni am benedicta lin... 1 \n", "2 Subject: whats new in summer ? bawled\\ncarolyn... 1 \n", "3 Subject: \\nh $ ello\\ndea 54 r home owner ,\\nwe... 1 \n", "4 Subject: : ) ) you can not save the world by q... 1 \n", "5 Subject: need software ? click here .\\ntop qua... 1 \n", "6 Subject: spend too much on your phone bill ? 2... 1 \n", "7 Subject: slotting order confirmation may 18 , ... 1 \n", "8 Subject: we shiip to ur country for mircosoft ... 1 \n", "9 Subject: urgent business proposal ,\\nmrs . reg... 1 \n", "\n", " cleaned_text \n", "0 subject fw solution mentioned lsc oo thank ema... \n", "1 subject advs greeting benedicta lindiwe hendri... \n", "2 subject whats new summer bawled carolyn regret... \n", "3 subject h $ ello dea < NUM > r home owner beet... \n", "4 subject save world quitting smoking save self ... \n", "5 subject need software click top quality softwa... \n", "6 subject spend much phone bill < NUM > crystal ... \n", "7 subject slotting order confirmation may < NUM ... \n", "8 subject shiip ur country mircosoft adobe norto... \n", "9 subject urgent business proposal mr regina ros... " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "read = pd.read_csv(\"cleaned_emails.csv\")\n", "read.head(10)" ] }, { "cell_type": "code", "execution_count": 30, "id": "f4b111d3-d794-4aaa-ac45-ef86a398cddf", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('cleaned_emails.csv')\n", "\n", "# Drop the 'text' column\n", "df = df.drop(columns=['text'])\n", "\n", "# Rename 'cleaned_text' to 'text'\n", "df = df.rename(columns={'cleaned_text': 'text'})\n", "\n", "# Save the updated DataFrame to a new CSV file (or overwrite the existing one)\n", "df.to_csv('Cleaned_data.csv', index=False)" ] }, { "cell_type": "markdown", "id": "04a92aca-fa9c-468b-84dd-b6bc404b2851", "metadata": {}, "source": [ "### 2nd cleaning ---------------------------" ] }, { "cell_type": "code", "execution_count": 38, "id": "977a3274-d470-4bf1-9467-dd63a0774ee7", "metadata": {}, "outputs": [], "source": [ "from langdetect import detect, LangDetectException\n", "from nltk.tokenize import word_tokenize, sent_tokenize" ] }, { "cell_type": "code", "execution_count": 39, "id": "e690b5d1-aa74-4146-9bda-9437790ae571", "metadata": {}, "outputs": [], "source": [ "stop_words = set(stopwords.words('english'))\n", "lemmatizer = WordNetLemmatizer()\n", "\n", "def clean_text(text):\n", " try:\n", " # Detect and remove non-English text\n", " if detect(text) != 'en':\n", " return ''\n", " except LangDetectException:\n", " return ''\n", " \n", " # Lowercase\n", " text = text.lower()\n", " \n", " # Remove HTML\n", " text = BeautifulSoup(text, \"html.parser\").get_text()\n", " \n", " # Enhanced URL/email replacement\n", " text = re.sub(r'\\b(?:https?://|www\\.)\\S+', '', text)\n", " text = re.sub(r'\\b[\\w\\.-]+@[\\w\\.-]+\\.\\w{2,}\\b', '', text)\n", " \n", " # Date pattern replacement (MM/DD/YYYY, DD-MM-YYYY, Month names, etc.)\n", " text = re.sub(r'\\b(\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}\\b|(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\\s+\\d{1,2},?\\s+\\d{4})', '', text)\n", " \n", " # Time pattern replacement\n", " text = re.sub(r'\\b\\d{1,2}:\\d{2}\\s*(?:am|pm)?\\b', '