diff --git "a/TrainingModel(FutureProject).ipynb" "b/TrainingModel(FutureProject).ipynb" new file mode 100644--- /dev/null +++ "b/TrainingModel(FutureProject).ipynb" @@ -0,0 +1,2115 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "9e7940e4-1ba3-4192-a82c-5008dc5ed698", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "from transformers import pipeline\n", + "from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "54c98c8e-a9d2-453c-933a-6a945143361f", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4be240ca-2fe4-4d02-9955-c54d414fc795", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "0e93873a-e119-4fb5-aa38-b8814098f758", + "metadata": {}, + "source": [ + "### first we need to import our libraries dataset and convert them into .csv" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ad1e3444-0197-4a5f-945d-bee10e977ffe", + "metadata": {}, + "outputs": [], + "source": [ + "spam_folder = \"enron2/spam\"\n", + "ham_folder = \"enron2/ham\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "262cfd88-9c03-4131-8e62-94dc9e5cce7e", + "metadata": {}, + "outputs": [], + "source": [ + "def load_emails(folder, label):\n", + " emails = []\n", + " for filename in os.listdir(folder):\n", + " if filename.endswith(\".txt\"):\n", + " with open(os.path.join(folder, filename), \"r\", encoding=\"utf-8\", errors=\"ignore\") as file:\n", + " text = file.read()\n", + " emails.append((text, label))\n", + " return emails" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f54a8a8a-9e31-46b4-82c1-6111e158594b", + "metadata": {}, + "outputs": [], + "source": [ + "spam_emails = load_emails(spam_folder, label=1) \n", + "ham_emails = load_emails(ham_folder, label=0) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8229102b-ec10-47f3-9cc8-dc42f7c4d6e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CSV file created successfully!\n" + ] + } + ], + "source": [ + "df = pd.DataFrame(all_emails, columns=[\"text\", \"label\"])\n", + "df.to_csv(\"dataset/emails.csv\", index=False, escapechar='\\\\', quotechar='\"')\n", + "print(\"CSV file created successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a5484a4b-6a93-40ea-993a-36118c5a31b1", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"dataset/emails.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "643bbc72-bfaa-4e82-85f8-3f216bc9e843", + "metadata": {}, + "source": [ + "### now we need to clean our messy dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d778066a-5526-428c-a118-4e1178b206c1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package stopwords is 
+ {
+ "cell_type": "markdown",
+ "id": "643bbc72-bfaa-4e82-85f8-3f216bc9e843",
+ "metadata": {},
+ "source": [
+ "### Now we need to clean our messy dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "d778066a-5526-428c-a118-4e1178b206c1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data]     C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data]   Package punkt is already up-to-date!\n",
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data]     C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data]   Package stopwords is already up-to-date!\n",
+ "[nltk_data] Downloading package wordnet to\n",
+ "[nltk_data]     C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import re\n",
+ "import nltk\n",
+ "from bs4 import BeautifulSoup\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "nltk.download('punkt')\n",
+ "nltk.download('stopwords')\n",
+ "nltk.download('wordnet')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "24c1784f-8262-487a-9868-44762dc976ea",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt_tab to\n",
+ "[nltk_data]     C:\\Users\\raofb\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data]   Unzipping tokenizers\\punkt_tab.zip.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nltk.download('punkt_tab')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "e043a0fb-f7e0-4727-8aee-b8ecce88b835",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stop_words = set(stopwords.words('english'))\n",
+ "def clean_text(text):\n",
+ "    # Lowercase\n",
+ "    text = text.lower()\n",
+ "    \n",
+ "    # Remove HTML\n",
+ "    text = BeautifulSoup(text, \"html.parser\").get_text()\n",
+ "    \n",
+ "    # Replace URLs and emails\n",
+ "    text = re.sub(r'http\\S+', '', text)\n",
+ "    text = re.sub(r'\\S+@\\S+', '', text)\n",
+ "    \n",
+ "    # Remove special characters (retain $ for monetary values)\n",
+ "    text = re.sub(r'[^\\w\\s$]', '', text)\n",
+ "    \n",
+ "    # Replace numbers (except after $)\n",
+ "    text = re.sub(r'\\b\\d+\\b', '', text)\n",
+ "    \n",
+ "    # Normalize whitespace\n",
+ "    text = re.sub(r'\\s+', ' ', text).strip()\n",
+ "\n",
+ "    tokens = word_tokenize(text)\n",
+ "    lemmatizer = WordNetLemmatizer()\n",
+ "    # Use the precomputed stop_words set instead of re-reading the list for every token\n",
+ "    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]\n",
+ "\n",
+ "    return ' '.join(tokens)"
+ ]
+ },
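+ {
+ "cell_type": "markdown",
+ "id": "0a1b2c3d-0000-4000-8000-000000000003",
+ "metadata": {},
+ "source": [
+ "A quick spot check of `clean_text` (an illustrative sketch added for this writeup; the sample string is made up and was not in the original notebook)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a1b2c3d-0000-4000-8000-000000000004",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical example input: a URL, an email address, punctuation, and a number\n",
+ "sample = \"Subject: WIN $500 now!!! Visit http://example.com or reply to win@example.com\"\n",
+ "print(clean_text(sample))\n",
+ "# roughly: 'subject win $ visit reply' (URL, address, digits, and stopwords are stripped)"
+ ]
+ },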
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlabelcleaned_text
0Subject: fw : this is the solution i mentioned...1subject fw solution mentioned lsc oo thank ema...
1Subject: advs\\ngreetings ,\\ni am benedicta lin...1subject advs greeting benedicta lindiwe hendri...
2Subject: whats new in summer ? bawled\\ncarolyn...1subject whats new summer bawled carolyn regret...
3Subject: \\nh $ ello\\ndea 54 r home owner ,\\nwe...1subject h $ ello dea < NUM > r home owner beet...
4Subject: : ) ) you can not save the world by q...1subject save world quitting smoking save self ...
5Subject: need software ? click here .\\ntop qua...1subject need software click top quality softwa...
6Subject: spend too much on your phone bill ? 2...1subject spend much phone bill < NUM > crystal ...
7Subject: slotting order confirmation may 18 , ...1subject slotting order confirmation may < NUM ...
8Subject: we shiip to ur country for mircosoft ...1subject shiip ur country mircosoft adobe norto...
9Subject: urgent business proposal ,\\nmrs . reg...1subject urgent business proposal mr regina ros...
\n", + "
" + ], + "text/plain": [ + " text label \\\n", + "0 Subject: fw : this is the solution i mentioned... 1 \n", + "1 Subject: advs\\ngreetings ,\\ni am benedicta lin... 1 \n", + "2 Subject: whats new in summer ? bawled\\ncarolyn... 1 \n", + "3 Subject: \\nh $ ello\\ndea 54 r home owner ,\\nwe... 1 \n", + "4 Subject: : ) ) you can not save the world by q... 1 \n", + "5 Subject: need software ? click here .\\ntop qua... 1 \n", + "6 Subject: spend too much on your phone bill ? 2... 1 \n", + "7 Subject: slotting order confirmation may 18 , ... 1 \n", + "8 Subject: we shiip to ur country for mircosoft ... 1 \n", + "9 Subject: urgent business proposal ,\\nmrs . reg... 1 \n", + "\n", + " cleaned_text \n", + "0 subject fw solution mentioned lsc oo thank ema... \n", + "1 subject advs greeting benedicta lindiwe hendri... \n", + "2 subject whats new summer bawled carolyn regret... \n", + "3 subject h $ ello dea < NUM > r home owner beet... \n", + "4 subject save world quitting smoking save self ... \n", + "5 subject need software click top quality softwa... \n", + "6 subject spend much phone bill < NUM > crystal ... \n", + "7 subject slotting order confirmation may < NUM ... \n", + "8 subject shiip ur country mircosoft adobe norto... \n", + "9 subject urgent business proposal mr regina ros... " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "read = pd.read_csv(\"cleaned_emails.csv\")\n", + "read.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "f4b111d3-d794-4aaa-ac45-ef86a398cddf", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('cleaned_emails.csv')\n", + "\n", + "# Drop the 'text' column\n", + "df = df.drop(columns=['text'])\n", + "\n", + "# Rename 'cleaned_text' to 'text'\n", + "df = df.rename(columns={'cleaned_text': 'text'})\n", + "\n", + "# Save the updated DataFrame to a new CSV file (or overwrite the existing one)\n", + "df.to_csv('Cleaned_data.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "04a92aca-fa9c-468b-84dd-b6bc404b2851", + "metadata": {}, + "source": [ + "### 2nd cleaning ---------------------------" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "977a3274-d470-4bf1-9467-dd63a0774ee7", + "metadata": {}, + "outputs": [], + "source": [ + "from langdetect import detect, LangDetectException\n", + "from nltk.tokenize import word_tokenize, sent_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "e690b5d1-aa74-4146-9bda-9437790ae571", + "metadata": {}, + "outputs": [], + "source": [ + "stop_words = set(stopwords.words('english'))\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "def clean_text(text):\n", + " try:\n", + " # Detect and remove non-English text\n", + " if detect(text) != 'en':\n", + " return ''\n", + " except LangDetectException:\n", + " return ''\n", + " \n", + " # Lowercase\n", + " text = text.lower()\n", + " \n", + " # Remove HTML\n", + " text = BeautifulSoup(text, \"html.parser\").get_text()\n", + " \n", + " # Enhanced URL/email replacement\n", + " text = re.sub(r'\\b(?:https?://|www\\.)\\S+', '', text)\n", + " text = re.sub(r'\\b[\\w\\.-]+@[\\w\\.-]+\\.\\w{2,}\\b', '', text)\n", + " \n", + " # Date pattern replacement (MM/DD/YYYY, DD-MM-YYYY, Month names, etc.)\n", + " text = re.sub(r'\\b(\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}\\b|(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\\s+\\d{1,2},?\\s+\\d{4})', '', text)\n", + " \n", + " # Time pattern replacement\n", + " text = 
re.sub(r'\\b\\d{1,2}:\\d{2}\\s*(?:am|pm)?\\b', '
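+ "\n",
+ "    # NOTE: the diff is truncated at this point; the lines below are an assumed\n",
+ "    # completion that mirrors the first clean_text above, not the original code.\n",
+ "    # Remove special characters (retain $ for monetary values)\n",
+ "    text = re.sub(r'[^\\w\\s$]', '', text)\n",
+ "    # Replace standalone numbers\n",
+ "    text = re.sub(r'\\b\\d+\\b', '', text)\n",
+ "    # Normalize whitespace, then tokenize, drop stopwords, and lemmatize\n",
+ "    text = re.sub(r'\\s+', ' ', text).strip()\n",
+ "    tokens = word_tokenize(text)\n",
+ "    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]\n",
+ "    return ' '.join(tokens)"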