{ "cells": [ { "cell_type": "markdown", "id": "4ef3464673447a14", "metadata": {}, "source": [ "### Data Load" ] }, { "cell_type": "code", "execution_count": 1, "id": "initial_id", "metadata": { "ExecuteTime": { "end_time": "2025-01-18T21:42:43.965893Z", "start_time": "2025-01-18T21:42:43.144021Z" }, "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexity
02025-06-27 07:20:25ModernCRMnova.osapi_compute.wsgi.server [req-b9718cd8-f...HTTP Statusbert
11/14/2025 23:07ModernCRMEmail service experiencing issues with sendingCritical Errorbert
21/17/2025 1:29AnalyticsEngineUnauthorized access to data was attemptedSecurity Alertbert
32025-07-12 00:24:16ModernHRnova.osapi_compute.wsgi.server [req-4895c258-b...HTTP Statusbert
42025-06-02 18:25:23BillingSystemnova.osapi_compute.wsgi.server [req-ee8bc8ba-9...HTTP Statusbert
\n", "
" ], "text/plain": [ " timestamp source \\\n", "0 2025-06-27 07:20:25 ModernCRM \n", "1 1/14/2025 23:07 ModernCRM \n", "2 1/17/2025 1:29 AnalyticsEngine \n", "3 2025-07-12 00:24:16 ModernHR \n", "4 2025-06-02 18:25:23 BillingSystem \n", "\n", " log_message target_label \\\n", "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n", "1 Email service experiencing issues with sending Critical Error \n", "2 Unauthorized access to data was attempted Security Alert \n", "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n", "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n", "\n", " complexity \n", "0 bert \n", "1 bert \n", "2 bert \n", "3 bert \n", "4 bert " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"dataset/synthetic_logs.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 2, "id": "4495a73d8e933c6e", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:53.403158Z", "start_time": "2025-01-15T20:27:53.387783Z" } }, "outputs": [ { "data": { "text/plain": [ "array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',\n", " 'ThirdPartyAPI', 'LegacyCRM'], dtype=object)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.source.unique()" ] }, { "cell_type": "code", "execution_count": 3, "id": "92da3b13bac914a7", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:53.466975Z", "start_time": "2025-01-15T20:27:53.452028Z" } }, "outputs": [ { "data": { "text/plain": [ "array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',\n", " 'System Notification', 'Resource Usage', 'User Action',\n", " 'Workflow Error', 'Deprecation Warning'], dtype=object)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.target_label.unique()" ] }, { "cell_type": "code", "execution_count": 4, "id": "b350454e0d700e15", 
"metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:53.537931Z", "start_time": "2025-01-15T20:27:53.521598Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexity
168312/13/2025 5:35ModernCRMBackup completed successfully.System Notificationregex
22891/23/2025 9:59ThirdPartyAPISystem updated to version 2.3.5.System Notificationregex
10023/1/2025 7:52ModernCRMDisk cleanup completed successfully.System Notificationregex
13435/13/2025 15:57ModernCRMSystem updated to version 5.7.5.System Notificationregex
23955/2/2025 14:29ThirdPartyAPIBackup ended at 2025-05-06 11:23:16.System Notificationregex
145611/14/2025 17:55AnalyticsEngineDisk cleanup completed successfully.System Notificationregex
6431/12/2025 5:21ModernHRSystem updated to version 1.2.9.System Notificationregex
13214/25/2025 7:16ModernCRMSystem updated to version 5.4.0.System Notificationregex
84111/7/2025 19:23ModernHRFile data_7222.csv uploaded successfully by us...System Notificationregex
23338/28/2025 2:09ThirdPartyAPIDisk cleanup completed successfully.System Notificationregex
\n", "
" ], "text/plain": [ " timestamp source \\\n", "1683 12/13/2025 5:35 ModernCRM \n", "2289 1/23/2025 9:59 ThirdPartyAPI \n", "1002 3/1/2025 7:52 ModernCRM \n", "1343 5/13/2025 15:57 ModernCRM \n", "2395 5/2/2025 14:29 ThirdPartyAPI \n", "1456 11/14/2025 17:55 AnalyticsEngine \n", "643 1/12/2025 5:21 ModernHR \n", "1321 4/25/2025 7:16 ModernCRM \n", "841 11/7/2025 19:23 ModernHR \n", "2333 8/28/2025 2:09 ThirdPartyAPI \n", "\n", " log_message target_label \\\n", "1683 Backup completed successfully. System Notification \n", "2289 System updated to version 2.3.5. System Notification \n", "1002 Disk cleanup completed successfully. System Notification \n", "1343 System updated to version 5.7.5. System Notification \n", "2395 Backup ended at 2025-05-06 11:23:16. System Notification \n", "1456 Disk cleanup completed successfully. System Notification \n", "643 System updated to version 1.2.9. System Notification \n", "1321 System updated to version 5.4.0. System Notification \n", "841 File data_7222.csv uploaded successfully by us... System Notification \n", "2333 Disk cleanup completed successfully. System Notification \n", "\n", " complexity \n", "1683 regex \n", "2289 regex \n", "1002 regex \n", "1343 regex \n", "2395 regex \n", "1456 regex \n", "643 regex \n", "1321 regex \n", "841 regex \n", "2333 regex " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.target_label=='System Notification'].sample(10)" ] }, { "cell_type": "code", "execution_count": 5, "id": "dc5394d70050e10d", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:53.626880Z", "start_time": "2025-01-15T20:27:53.611502Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexity
3611/19/2025 13:14BillingSystemSystem reboot initiated by user User243.System Notificationregex
9212/4/2025 21:20BillingSystemSystem reboot initiated by user User471.System Notificationregex
1395/8/2025 16:34ModernHRSystem reboot initiated by user User216.System Notificationregex
1409/11/2025 8:49AnalyticsEngineSystem reboot initiated by user User639.System Notificationregex
1613/31/2025 19:40BillingSystemSystem reboot initiated by user User819.System Notificationregex
1636/6/2025 15:29BillingSystemSystem reboot initiated by user User938.System Notificationregex
3074/12/2025 0:41BillingSystemSystem reboot initiated by user User929.System Notificationregex
36510/20/2025 22:32ModernHRSystem reboot initiated by user User533.System Notificationregex
5084/15/2025 2:04ThirdPartyAPISystem reboot initiated by user User591.System Notificationregex
5529/22/2025 20:54ModernHRSystem reboot initiated by user User421.System Notificationregex
6689/5/2025 7:14ModernHRSystem reboot initiated by user User297.System Notificationregex
6937/6/2025 21:40BillingSystemSystem reboot initiated by user User159.System Notificationregex
6973/13/2025 7:09BillingSystemSystem reboot initiated by user User648.System Notificationregex
7149/25/2025 23:35ThirdPartyAPISystem reboot initiated by user User600.System Notificationregex
7305/24/2025 11:08AnalyticsEngineSystem reboot initiated by user User120.System Notificationregex
8008/15/2025 12:14BillingSystemSystem reboot initiated by user User901.System Notificationregex
8374/9/2025 8:28AnalyticsEngineSystem reboot initiated by user User876.System Notificationregex
8523/31/2025 5:20ModernCRMSystem reboot initiated by user User811.System Notificationregex
8652/25/2025 1:40AnalyticsEngineSystem reboot initiated by user User964.System Notificationregex
88911/30/2025 13:45ModernHRSystem reboot initiated by user User766.System Notificationregex
8967/28/2025 11:24BillingSystemSystem reboot initiated by user User765.System Notificationregex
9889/11/2025 22:23BillingSystemSystem reboot initiated by user User427.System Notificationregex
110612/28/2025 13:32ModernHRSystem reboot initiated by user User246.System Notificationregex
11598/5/2025 6:52BillingSystemSystem reboot initiated by user User329.System Notificationregex
11947/20/2025 9:10AnalyticsEngineSystem reboot initiated by user User747.System Notificationregex
12757/15/2025 23:37BillingSystemSystem reboot initiated by user User829.System Notificationregex
12997/15/2025 19:19BillingSystemSystem reboot initiated by user User966.System Notificationregex
13048/10/2025 6:18ThirdPartyAPISystem reboot initiated by user User758.System Notificationregex
152411/30/2025 2:39ThirdPartyAPISystem reboot initiated by user User278.System Notificationregex
15628/18/2025 4:17ThirdPartyAPISystem reboot initiated by user User648.System Notificationregex
162412/14/2025 5:14AnalyticsEngineSystem reboot initiated by user User268.System Notificationregex
166310/27/2025 22:04AnalyticsEngineSystem reboot initiated by user User315.System Notificationregex
17762/21/2025 11:56ModernHRSystem reboot initiated by user User155.System Notificationregex
18038/22/2025 6:30AnalyticsEngineSystem reboot initiated by user User204.System Notificationregex
18048/26/2025 21:06ModernHRSystem reboot initiated by user User899.System Notificationregex
18521/26/2025 12:34AnalyticsEngineSystem reboot initiated by user User223.System Notificationregex
18655/11/2025 10:58AnalyticsEngineSystem reboot initiated by user User932.System Notificationregex
19569/26/2025 19:32ThirdPartyAPISystem reboot initiated by user User264.System Notificationregex
20036/23/2025 17:54ModernCRMSystem reboot initiated by user User517.System Notificationregex
201412/25/2025 4:33AnalyticsEngineSystem reboot initiated by user User293.System Notificationregex
20439/12/2025 20:20ThirdPartyAPISystem reboot initiated by user User262.System Notificationregex
20749/13/2025 8:43ModernCRMSystem reboot initiated by user User937.System Notificationregex
22289/3/2025 11:24ModernHRSystem reboot initiated by user User179.System Notificationregex
22431/16/2025 7:22ModernHRSystem reboot initiated by user User770.System Notificationregex
22463/2/2025 22:56ModernHRSystem reboot initiated by user User488.System Notificationregex
225310/7/2025 2:20ModernHRSystem reboot initiated by user User644.System Notificationregex
23173/7/2025 5:44BillingSystemSystem reboot initiated by user User724.System Notificationregex
23605/1/2025 4:21ThirdPartyAPISystem reboot initiated by user User876.System Notificationregex
\n", "
" ], "text/plain": [ " timestamp source \\\n", "36 11/19/2025 13:14 BillingSystem \n", "92 12/4/2025 21:20 BillingSystem \n", "139 5/8/2025 16:34 ModernHR \n", "140 9/11/2025 8:49 AnalyticsEngine \n", "161 3/31/2025 19:40 BillingSystem \n", "163 6/6/2025 15:29 BillingSystem \n", "307 4/12/2025 0:41 BillingSystem \n", "365 10/20/2025 22:32 ModernHR \n", "508 4/15/2025 2:04 ThirdPartyAPI \n", "552 9/22/2025 20:54 ModernHR \n", "668 9/5/2025 7:14 ModernHR \n", "693 7/6/2025 21:40 BillingSystem \n", "697 3/13/2025 7:09 BillingSystem \n", "714 9/25/2025 23:35 ThirdPartyAPI \n", "730 5/24/2025 11:08 AnalyticsEngine \n", "800 8/15/2025 12:14 BillingSystem \n", "837 4/9/2025 8:28 AnalyticsEngine \n", "852 3/31/2025 5:20 ModernCRM \n", "865 2/25/2025 1:40 AnalyticsEngine \n", "889 11/30/2025 13:45 ModernHR \n", "896 7/28/2025 11:24 BillingSystem \n", "988 9/11/2025 22:23 BillingSystem \n", "1106 12/28/2025 13:32 ModernHR \n", "1159 8/5/2025 6:52 BillingSystem \n", "1194 7/20/2025 9:10 AnalyticsEngine \n", "1275 7/15/2025 23:37 BillingSystem \n", "1299 7/15/2025 19:19 BillingSystem \n", "1304 8/10/2025 6:18 ThirdPartyAPI \n", "1524 11/30/2025 2:39 ThirdPartyAPI \n", "1562 8/18/2025 4:17 ThirdPartyAPI \n", "1624 12/14/2025 5:14 AnalyticsEngine \n", "1663 10/27/2025 22:04 AnalyticsEngine \n", "1776 2/21/2025 11:56 ModernHR \n", "1803 8/22/2025 6:30 AnalyticsEngine \n", "1804 8/26/2025 21:06 ModernHR \n", "1852 1/26/2025 12:34 AnalyticsEngine \n", "1865 5/11/2025 10:58 AnalyticsEngine \n", "1956 9/26/2025 19:32 ThirdPartyAPI \n", "2003 6/23/2025 17:54 ModernCRM \n", "2014 12/25/2025 4:33 AnalyticsEngine \n", "2043 9/12/2025 20:20 ThirdPartyAPI \n", "2074 9/13/2025 8:43 ModernCRM \n", "2228 9/3/2025 11:24 ModernHR \n", "2243 1/16/2025 7:22 ModernHR \n", "2246 3/2/2025 22:56 ModernHR \n", "2253 10/7/2025 2:20 ModernHR \n", "2317 3/7/2025 5:44 BillingSystem \n", "2360 5/1/2025 4:21 ThirdPartyAPI \n", "\n", " log_message target_label complexity \n", "36 System reboot initiated by 
user User243. System Notification regex \n", "92 System reboot initiated by user User471. System Notification regex \n", "139 System reboot initiated by user User216. System Notification regex \n", "140 System reboot initiated by user User639. System Notification regex \n", "161 System reboot initiated by user User819. System Notification regex \n", "163 System reboot initiated by user User938. System Notification regex \n", "307 System reboot initiated by user User929. System Notification regex \n", "365 System reboot initiated by user User533. System Notification regex \n", "508 System reboot initiated by user User591. System Notification regex \n", "552 System reboot initiated by user User421. System Notification regex \n", "668 System reboot initiated by user User297. System Notification regex \n", "693 System reboot initiated by user User159. System Notification regex \n", "697 System reboot initiated by user User648. System Notification regex \n", "714 System reboot initiated by user User600. System Notification regex \n", "730 System reboot initiated by user User120. System Notification regex \n", "800 System reboot initiated by user User901. System Notification regex \n", "837 System reboot initiated by user User876. System Notification regex \n", "852 System reboot initiated by user User811. System Notification regex \n", "865 System reboot initiated by user User964. System Notification regex \n", "889 System reboot initiated by user User766. System Notification regex \n", "896 System reboot initiated by user User765. System Notification regex \n", "988 System reboot initiated by user User427. System Notification regex \n", "1106 System reboot initiated by user User246. System Notification regex \n", "1159 System reboot initiated by user User329. System Notification regex \n", "1194 System reboot initiated by user User747. System Notification regex \n", "1275 System reboot initiated by user User829. 
System Notification regex \n", "1299 System reboot initiated by user User966. System Notification regex \n", "1304 System reboot initiated by user User758. System Notification regex \n", "1524 System reboot initiated by user User278. System Notification regex \n", "1562 System reboot initiated by user User648. System Notification regex \n", "1624 System reboot initiated by user User268. System Notification regex \n", "1663 System reboot initiated by user User315. System Notification regex \n", "1776 System reboot initiated by user User155. System Notification regex \n", "1803 System reboot initiated by user User204. System Notification regex \n", "1804 System reboot initiated by user User899. System Notification regex \n", "1852 System reboot initiated by user User223. System Notification regex \n", "1865 System reboot initiated by user User932. System Notification regex \n", "1956 System reboot initiated by user User264. System Notification regex \n", "2003 System reboot initiated by user User517. System Notification regex \n", "2014 System reboot initiated by user User293. System Notification regex \n", "2043 System reboot initiated by user User262. System Notification regex \n", "2074 System reboot initiated by user User937. System Notification regex \n", "2228 System reboot initiated by user User179. System Notification regex \n", "2243 System reboot initiated by user User770. System Notification regex \n", "2246 System reboot initiated by user User488. System Notification regex \n", "2253 System reboot initiated by user User644. System Notification regex \n", "2317 System reboot initiated by user User724. System Notification regex \n", "2360 System reboot initiated by user User876. 
# Turn every log message into a dense vector so that similar messages can be
# grouped by the clustering step that follows.
model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight embedding model
embeddings = model.encode(
    df["log_message"].tolist()  # plain Python list of the raw message strings
)
# Group the message embeddings with DBSCAN, comparing vectors by cosine
# distance with a neighbourhood radius of 0.2.
# NOTE(review): with min_samples=1 presumably no message is labelled as noise
# (-1) and unique messages become single-member clusters -- confirm against the
# scikit-learn DBSCAN documentation.
clustering = DBSCAN(metric="cosine", min_samples=1, eps=0.2)
clustering.fit(embeddings)

# Store each row's cluster id next to its message (adds/overwrites the column).
df["cluster"] = clustering.labels_
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexitycluster
02025-06-27 07:20:25ModernCRMnova.osapi_compute.wsgi.server [req-b9718cd8-f...HTTP Statusbert0
11/14/2025 23:07ModernCRMEmail service experiencing issues with sendingCritical Errorbert1
21/17/2025 1:29AnalyticsEngineUnauthorized access to data was attemptedSecurity Alertbert2
32025-07-12 00:24:16ModernHRnova.osapi_compute.wsgi.server [req-4895c258-b...HTTP Statusbert0
42025-06-02 18:25:23BillingSystemnova.osapi_compute.wsgi.server [req-ee8bc8ba-9...HTTP Statusbert0
\n", "
# Group by cluster to inspect patterns: one list of messages per cluster id.
clusters = (
    df.groupby("cluster")["log_message"]
    .apply(list)
)

# Order the clusters by how many messages they hold, largest first.
sorted_clusters = clusters.sort_values(
    ascending=False,
    key=lambda message_lists: message_lists.map(len),
)
/v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" HTTP status code - 200 len: 211 time: 0.0968180\n", " nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" RCODE 200 len: 1874 time: 0.2280791\n", " nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" Return code: 200 len: 1874 time: 0.2131531\n", " nova.osapi_compute.wsgi.server [req-2bf7cfee-a236-42f3-8fb1-96fefab0b302 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" RCODE 200 len: 1874 time: 0.1794369\n", "Cluster 5:\n", " nova.compute.claims [req-a07ac654-8e81-416d-bfbb-189116b07969 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: bf8c824d-f099-4433-a41e-e3da7578262e] Total memory: 64172 MB, used: 512.00 MB\n", " nova.compute.claims [req-d6986b54-3735-4a42-9074-0ba7d9717de9 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: af5f7392-f7d4-4298-b647-c98924c64aa1] memory limit: 96258.00 MB, free: 95746.00 MB\n", " nova.compute.claims [req-72b4858f-049e-49e1-b31e-b562c5018eaf 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 63a0d960-70b6-44c6-b606-491478a5cadf] disk limit not specified, defaulting to unlimited\n", " nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a5-7861d247cafa 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: d96a117b-0193-4549-bdcc-63b917273d1d] vcpu limit not specified, defaulting to unlimited\n", " nova.compute.claims [req-d38f479d-9bb9-4276-9688-52607e8fd350 113d3a99c3da401fbd62cc2caa5b96d2 
54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 95960536-049b-41f6-9049-05fc479b6a7c] disk limit not specified, defaulting to unlimited\n", "Cluster 11:\n", " User User685 logged out.\n", " User User395 logged in.\n", " User User225 logged in.\n", " User User494 logged out.\n", " User User900 logged in.\n", "Cluster 13:\n", " Backup started at 2025-05-14 07:06:55.\n", " Backup started at 2025-02-15 20:00:19.\n", " Backup ended at 2025-08-08 13:06:23.\n", " Backup started at 2025-11-14 08:27:43.\n", " Backup started at 2025-12-09 10:19:11.\n", "Cluster 7:\n", " Multiple bad login attempts detected on user 8538 account\n", " Multiple login failures occurred on user 9052 account\n", " User 7153 made multiple incorrect login attempts\n", " User 8300 made multiple incorrect login attempts\n", " Multiple login failures were detected for user 6373\n", "Cluster 8:\n", " Backup completed successfully.\n", " Backup completed successfully.\n", " Backup completed successfully.\n", " Backup completed successfully.\n", " Backup completed successfully.\n", "Cluster 21:\n", " System updated to version 3.9.1.\n", " System updated to version 5.5.4.\n", " System updated to version 4.7.4.\n", " System updated to version 3.7.7.\n", " System updated to version 2.6.2.\n", "Cluster 3:\n", " Shard 6 replication task ended in failure\n", " Data replication task for shard 14 did not complete\n", " Data replication task failed for shard 17\n", " Replication of data to shard 14 failed\n", " Data replication task for shard 6 did not complete\n", "Cluster 4:\n", " File data_6169.csv uploaded successfully by user User953.\n", " File data_3847.csv uploaded successfully by user User175.\n", " File data_7366.csv uploaded successfully by user User282.\n", " File data_1206.csv uploaded successfully by user User359.\n", " File data_1503.csv uploaded successfully by user User151.\n", "Cluster 17:\n", " Denied access attempt on restricted account Account2682\n", " Unauthorized login attempt on 
protected account Account5030\n", " Account Account9437 blocked due to failed login\n", " Account Account7999 access denied due to login failure\n", " Invalid login attempt made on secured account Account7864\n", "Cluster 32:\n", " Disk cleanup completed successfully.\n", " Disk cleanup completed successfully.\n", " Disk cleanup completed successfully.\n", " Disk cleanup completed successfully.\n", " Disk cleanup completed successfully.\n", "Cluster 6:\n", " Critical system unit error: unit ID Component55\n", " System component malfunction: component ID Component79\n", " Critical system element is down: element ID Component96\n", " Essential system part malfunction: part ID Component6\n", " Failure occurred in critical system component: component ID Component92\n", "Cluster 16:\n", " System reboot initiated by user User243.\n", " System reboot initiated by user User471.\n", " System reboot initiated by user User216.\n", " System reboot initiated by user User639.\n", " System reboot initiated by user User819.\n", "Cluster 20:\n", " User 7662 tried to bypass API security measures\n", " User 2367 failed to provide valid API access credentials\n", " User 3569 made an unauthorized API request\n", " Unauthorized user 2968 tried to access restricted API\n", " User 2186 attempted to access API without proper authorization\n", "Cluster 9:\n", " Account with ID 5351 created by User634.\n", " Account with ID 7813 created by User373.\n", " Account with ID 9827 created by User965.\n", " Account with ID 2520 created by User546.\n", " Account with ID 2300 created by User964.\n", "Cluster 1:\n", " Email service experiencing issues with sending\n", " Email server encountered a sending fault\n", " Mail service encountered a delivery glitch\n", " Service disruption caused by email sending error\n", " Email system had a problem sending emails\n", "Cluster 10:\n", " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: 
name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=512MB phys_disk=15GB used_disk=0GB total_vcpus=16 used_vcpus=0 pci_stats=[]\n", " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]\n", " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Total usable vcpus: 16, total allocated vcpus: 0\n", " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]\n", " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]\n", "Cluster 34:\n", " Abnormal behavior found on server 10, possible security threat\n", " Security alert: suspicious activity on server 1\n", " Anomalous activity identified on server 23, security review recommended\n", " Server 27 experienced potential security incident, review required\n", " Server 36 experienced potential security incident, review required\n", "Cluster 14:\n", " Detection of multiple disk faults in RAID setup\n", " RAID array suffered multiple hard drive failures\n", " RAID array experienced multiple disk crashes\n", " Multiple hard drive issues in RAID configuration found\n", " Identification of multiple faulty disks in RAID array\n", "Cluster 53:\n", " Module X experienced an invalid data format issue\n", " Input format mismatch occurred in module X\n", " Module X reported an error in input format validation\n", " Module X failed to process input due to formatting error\n", " Input data format in 
module X was invalid or corrupted\n", "Cluster 52:\n", " Service health check was not successful because of SSL certificate validation failures.\n", " Invalid SSL certificate resulted in a failed service health check.\n", " Service health check failure was due to an invalid or improperly configured SSL certificate.\n", " An issue with the SSL certificate caused the service health check to fail.\n", " Service health check failure was caused by an expired SSL certificate.\n", "Cluster 18:\n", " Boot process terminated unexpectedly due to kernel issue\n", " System encountered kernel panic during initialization phase\n", " Boot process was stopped by kernel malfunction\n", " System encountered kernel failure during bootup sequence\n", " Boot sequence failed due to kernel panic\n", "Cluster 25:\n", " System configuration is no longer valid\n", " Configuration is corrupted throughout the system\n", " Cross-system configuration failure occurred\n", " System configuration is experiencing errors\n", " Configuration malfunction is system-wide\n", "Cluster 42:\n", " User 5127 has escalated admin privileges without authorization\n", " User 9745 has escalated to admin level\n", " User 8483 escalated privileges to admin level\n", " User 1987 has escalated to admin level\n", " User 8395 escalated privileges to admin level\n", "Cluster 59:\n", " Potential security threat: Admin privilege escalation for user 5130\n", " Admin privilege escalation alert for user 2893\n", " Admin privilege escalation alert for user 8532\n", " Potential security threat: Admin privilege escalation for user 1554\n", " Warning: Potential admin privilege escalation for user 5720\n", "Cluster 26:\n", " Privilege elevation detected for user 5038\n", " Elevation of admin privileges detected for user 6137\n", " Elevation of admin privileges detected for user 4907\n", " User 6069 has been granted elevated admin privileges\n", " Admin privilege elevation warning for user 7574\n" ] } ], "source": [ 
"print(\"Clustered Patterns:\")\n", "for cluster_id, messages in sorted_clusters.items():\n", " if len(messages) > 10:\n", " print(f\"Cluster {cluster_id}:\")\n", " for msg in messages[:5]:\n", " print(f\" {msg}\")" ] }, { "cell_type": "markdown", "id": "bbec6795396f2d6b", "metadata": {}, "source": [ "### Classification Stage 1: Regex" ] }, { "cell_type": "code", "execution_count": 16, "id": "ca32020e4fdb8f40", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:58.549493Z", "start_time": "2025-01-15T20:27:58.529458Z" } }, "outputs": [], "source": [ "import re\n", "def classify_with_regex(log_message):\n", " regex_patterns = {\n", " r\"User User\\d+ logged (in|out).\": \"User Action\",\n", " r\"Backup (started|ended) at .*\": \"System Notification\",\n", " r\"Backup completed successfully.\": \"System Notification\",\n", " r\"System updated to version .*\": \"System Notification\",\n", " r\"File .* uploaded successfully by user .*\": \"System Notification\",\n", " r\"Disk cleanup completed successfully.\": \"System Notification\",\n", " r\"System reboot initiated by user .*\": \"System Notification\",\n", " r\"Account with ID .* created by .*\": \"User Action\"\n", " }\n", " for pattern, label in regex_patterns.items():\n", " if re.search(pattern, log_message):\n", " return label\n", " return None" ] }, { "cell_type": "code", "execution_count": 17, "id": "4d9645ec6812da4a", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:58.589510Z", "start_time": "2025-01-15T20:27:58.579485Z" } }, "outputs": [ { "data": { "text/plain": [ "'User Action'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify_with_regex(\"User User123 logged in.\")" ] }, { "cell_type": "code", "execution_count": 18, "id": "6b3b838a2d270190", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:58.636755Z", "start_time": "2025-01-15T20:27:58.624648Z" } }, "outputs": [ { "data": { "text/plain": [ "'System Notification'" ] }, 
"execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classify_with_regex(\"System reboot initiated by user User179.\")" ] }, { "cell_type": "code", "execution_count": 19, "id": "99bcfd70b451835c", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:58.682792Z", "start_time": "2025-01-15T20:27:58.672822Z" } }, "outputs": [], "source": [ "classify_with_regex(\"Hey you, chill bro\")" ] }, { "cell_type": "code", "execution_count": 20, "id": "22619eedaa15acc3", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:58.760034Z", "start_time": "2025-01-15T20:27:58.731326Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexityclusterregex_label
710/11/2025 8:44ModernHRFile data_6169.csv uploaded successfully by us...System Notificationregex4System Notification
141/4/2025 1:43ThirdPartyAPIFile data_3847.csv uploaded successfully by us...System Notificationregex4System Notification
155/1/2025 9:41ModernCRMBackup completed successfully.System Notificationregex8System Notification
182/22/2025 17:49ModernCRMAccount with ID 5351 created by User634.User Actionregex9User Action
279/24/2025 19:57ThirdPartyAPIUser User685 logged out.User Actionregex11User Action
........................
23766/27/2025 8:47ModernCRMSystem updated to version 2.0.5.System Notificationregex21System Notification
23819/5/2025 6:39ThirdPartyAPIDisk cleanup completed successfully.System Notificationregex32System Notification
23944/3/2025 13:13ModernHRDisk cleanup completed successfully.System Notificationregex32System Notification
23955/2/2025 14:29ThirdPartyAPIBackup ended at 2025-05-06 11:23:16.System Notificationregex13System Notification
240310/1/2025 1:31ModernCRMBackup completed successfully.System Notificationregex8System Notification
\n", "

500 rows × 7 columns

\n", "
" ], "text/plain": [ " timestamp source \\\n", "7 10/11/2025 8:44 ModernHR \n", "14 1/4/2025 1:43 ThirdPartyAPI \n", "15 5/1/2025 9:41 ModernCRM \n", "18 2/22/2025 17:49 ModernCRM \n", "27 9/24/2025 19:57 ThirdPartyAPI \n", "... ... ... \n", "2376 6/27/2025 8:47 ModernCRM \n", "2381 9/5/2025 6:39 ThirdPartyAPI \n", "2394 4/3/2025 13:13 ModernHR \n", "2395 5/2/2025 14:29 ThirdPartyAPI \n", "2403 10/1/2025 1:31 ModernCRM \n", "\n", " log_message target_label \\\n", "7 File data_6169.csv uploaded successfully by us... System Notification \n", "14 File data_3847.csv uploaded successfully by us... System Notification \n", "15 Backup completed successfully. System Notification \n", "18 Account with ID 5351 created by User634. User Action \n", "27 User User685 logged out. User Action \n", "... ... ... \n", "2376 System updated to version 2.0.5. System Notification \n", "2381 Disk cleanup completed successfully. System Notification \n", "2394 Disk cleanup completed successfully. System Notification \n", "2395 Backup ended at 2025-05-06 11:23:16. System Notification \n", "2403 Backup completed successfully. System Notification \n", "\n", " complexity cluster regex_label \n", "7 regex 4 System Notification \n", "14 regex 4 System Notification \n", "15 regex 8 System Notification \n", "18 regex 9 User Action \n", "27 regex 11 User Action \n", "... ... ... ... 
\n", "2376 regex 21 System Notification \n", "2381 regex 32 System Notification \n", "2394 regex 32 System Notification \n", "2395 regex 13 System Notification \n", "2403 regex 8 System Notification \n", "\n", "[500 rows x 7 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Apply regex classification\n", "df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))\n", "df[df['regex_label'].notnull()]" ] }, { "cell_type": "code", "execution_count": 21, "id": "e12979fc8238277f", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:27:58.806198Z", "start_time": "2025-01-15T20:27:58.792662Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexityclusterregex_label
02025-06-27 07:20:25ModernCRMnova.osapi_compute.wsgi.server [req-b9718cd8-f...HTTP Statusbert0None
11/14/2025 23:07ModernCRMEmail service experiencing issues with sendingCritical Errorbert1None
21/17/2025 1:29AnalyticsEngineUnauthorized access to data was attemptedSecurity Alertbert2None
32025-07-12 00:24:16ModernHRnova.osapi_compute.wsgi.server [req-4895c258-b...HTTP Statusbert0None
42025-06-02 18:25:23BillingSystemnova.osapi_compute.wsgi.server [req-ee8bc8ba-9...HTTP Statusbert0None
\n", "
" ], "text/plain": [ " timestamp source \\\n", "0 2025-06-27 07:20:25 ModernCRM \n", "1 1/14/2025 23:07 ModernCRM \n", "2 1/17/2025 1:29 AnalyticsEngine \n", "3 2025-07-12 00:24:16 ModernHR \n", "4 2025-06-02 18:25:23 BillingSystem \n", "\n", " log_message target_label \\\n", "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n", "1 Email service experiencing issues with sending Critical Error \n", "2 Unauthorized access to data was attempted Security Alert \n", "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n", "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n", "\n", " complexity cluster regex_label \n", "0 bert 0 None \n", "1 bert 1 None \n", "2 bert 2 None \n", "3 bert 0 None \n", "4 bert 0 None " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['regex_label'].isnull()].head(5)" ] }, { "cell_type": "markdown", "id": "b58274a035c82628", "metadata": {}, "source": [ "### Classification Stage 2: Classification Using Embeddings" ] }, { "cell_type": "code", "execution_count": 22, "id": "7c21958116c1429b", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:29:48.629503Z", "start_time": "2025-01-15T20:29:48.598340Z" } }, "outputs": [ { "data": { "text/plain": [ "(1910, 7)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_non_regex = df[df['regex_label'].isnull()].copy()\n", "df_non_regex.shape" ] }, { "cell_type": "code", "execution_count": 23, "id": "b340b51441a741a8", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:30:04.093929Z", "start_time": "2025-01-15T20:30:04.062728Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexityclusterregex_label
602025-10-06 16:55:23LegacyCRMLead conversion failed for prospect ID 7842 du...Workflow Errorllm24None
2552025-05-03 16:55:35LegacyCRMAPI endpoint 'getCustomerDetails' is deprecate...Deprecation Warningllm48None
3772025-06-24 12:16:29LegacyCRMCustomer follow-up process for lead ID 5621 fa...Workflow Errorllm62None
13252025-04-17 07:33:44LegacyCRMEscalation rule execution failed for ticket ID...Workflow Errorllm105None
17342025-04-30 07:47:30LegacyCRMThe 'ExportToCSV' feature is outdated. Please ...Deprecation Warningllm118None
18262025-01-23 10:33:36LegacyCRMSupport for legacy authentication methods will...Deprecation Warningllm122None
22172025-05-12 09:46:54LegacyCRMTask assignment for TeamID 3425 could not comp...Workflow Errorllm133None
\n", "
" ], "text/plain": [ " timestamp source \\\n", "60 2025-10-06 16:55:23 LegacyCRM \n", "255 2025-05-03 16:55:35 LegacyCRM \n", "377 2025-06-24 12:16:29 LegacyCRM \n", "1325 2025-04-17 07:33:44 LegacyCRM \n", "1734 2025-04-30 07:47:30 LegacyCRM \n", "1826 2025-01-23 10:33:36 LegacyCRM \n", "2217 2025-05-12 09:46:54 LegacyCRM \n", "\n", " log_message target_label \\\n", "60 Lead conversion failed for prospect ID 7842 du... Workflow Error \n", "255 API endpoint 'getCustomerDetails' is deprecate... Deprecation Warning \n", "377 Customer follow-up process for lead ID 5621 fa... Workflow Error \n", "1325 Escalation rule execution failed for ticket ID... Workflow Error \n", "1734 The 'ExportToCSV' feature is outdated. Please ... Deprecation Warning \n", "1826 Support for legacy authentication methods will... Deprecation Warning \n", "2217 Task assignment for TeamID 3425 could not comp... Workflow Error \n", "\n", " complexity cluster regex_label \n", "60 llm 24 None \n", "255 llm 48 None \n", "377 llm 62 None \n", "1325 llm 105 None \n", "1734 llm 118 None \n", "1826 llm 122 None \n", "2217 llm 133 None " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_legacy = df_non_regex[df_non_regex.source==\"LegacyCRM\"]\n", "df_legacy" ] }, { "cell_type": "code", "execution_count": 24, "id": "8d8f7e5902aca5f8", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:30:48.679137Z", "start_time": "2025-01-15T20:30:48.647857Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
timestampsourcelog_messagetarget_labelcomplexityclusterregex_label
02025-06-27 07:20:25ModernCRMnova.osapi_compute.wsgi.server [req-b9718cd8-f...HTTP Statusbert0None
11/14/2025 23:07ModernCRMEmail service experiencing issues with sendingCritical Errorbert1None
21/17/2025 1:29AnalyticsEngineUnauthorized access to data was attemptedSecurity Alertbert2None
32025-07-12 00:24:16ModernHRnova.osapi_compute.wsgi.server [req-4895c258-b...HTTP Statusbert0None
42025-06-02 18:25:23BillingSystemnova.osapi_compute.wsgi.server [req-ee8bc8ba-9...HTTP Statusbert0None
........................
24052025-08-13 07:29:25ModernHRnova.osapi_compute.wsgi.server [req-96c3ec98-2...HTTP Statusbert0None
24061/11/2025 5:32ModernHRUser 3844 account experienced multiple failed ...Security Alertbert7None
24072025-08-03 03:07:47ThirdPartyAPInova.metadata.wsgi.server [req-b6d4a270-accb-4...HTTP Statusbert0None
240811/11/2025 11:52BillingSystemEmail service affected by failed transmissionCritical Errorbert1None
240912/25/2025 13:21AnalyticsEngineRepeated failed login attempts occurred for us...Security Alertbert7None
\n", "

1903 rows × 7 columns

\n", "
" ], "text/plain": [ " timestamp source \\\n", "0 2025-06-27 07:20:25 ModernCRM \n", "1 1/14/2025 23:07 ModernCRM \n", "2 1/17/2025 1:29 AnalyticsEngine \n", "3 2025-07-12 00:24:16 ModernHR \n", "4 2025-06-02 18:25:23 BillingSystem \n", "... ... ... \n", "2405 2025-08-13 07:29:25 ModernHR \n", "2406 1/11/2025 5:32 ModernHR \n", "2407 2025-08-03 03:07:47 ThirdPartyAPI \n", "2408 11/11/2025 11:52 BillingSystem \n", "2409 12/25/2025 13:21 AnalyticsEngine \n", "\n", " log_message target_label \\\n", "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n", "1 Email service experiencing issues with sending Critical Error \n", "2 Unauthorized access to data was attempted Security Alert \n", "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n", "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n", "... ... ... \n", "2405 nova.osapi_compute.wsgi.server [req-96c3ec98-2... HTTP Status \n", "2406 User 3844 account experienced multiple failed ... Security Alert \n", "2407 nova.metadata.wsgi.server [req-b6d4a270-accb-4... HTTP Status \n", "2408 Email service affected by failed transmission Critical Error \n", "2409 Repeated failed login attempts occurred for us... Security Alert \n", "\n", " complexity cluster regex_label \n", "0 bert 0 None \n", "1 bert 1 None \n", "2 bert 2 None \n", "3 bert 0 None \n", "4 bert 0 None \n", "... ... ... ... 
\n", "2405 bert 0 None \n", "2406 bert 7 None \n", "2407 bert 0 None \n", "2408 bert 1 None \n", "2409 bert 7 None \n", "\n", "[1903 rows x 7 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_non_legacy = df_non_regex[df_non_regex.source!=\"LegacyCRM\"]\n", "df_non_legacy" ] }, { "cell_type": "code", "execution_count": 25, "id": "27295bd7ada09140", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:30:59.173856Z", "start_time": "2025-01-15T20:30:59.142606Z" } }, "outputs": [ { "data": { "text/plain": [ "(1903, 7)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_non_legacy.shape" ] }, { "cell_type": "code", "execution_count": 26, "id": "566831c64be8ed7", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:31:10.610031Z", "start_time": "2025-01-15T20:31:07.235690Z" } }, "outputs": [], "source": [ "model = SentenceTransformer('all-MiniLM-L6-v2') # Lightweight embedding model\n", "embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())" ] }, { "cell_type": "code", "execution_count": 27, "id": "ae5a2c977f0330cd", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:31:15.767984Z", "start_time": "2025-01-15T20:31:15.757908Z" } }, "outputs": [ { "data": { "text/plain": [ "1903" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(embeddings_filtered)" ] }, { "cell_type": "code", "execution_count": 28, "id": "a9b0c2b8798c9247", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:32:19.597483Z", "start_time": "2025-01-15T20:32:19.566230Z" } }, "outputs": [], "source": [ "X = embeddings_filtered\n", "y = df_non_legacy['target_label'].values" ] }, { "cell_type": "code", "execution_count": 29, "id": "b831de9df6a1d4c4", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:32:24.546133Z", "start_time": "2025-01-15T20:32:24.357812Z" } }, "outputs": [ { "name": "stdout", "output_type": 
"stream", "text": [ " precision recall f1-score support\n", "\n", "Critical Error 0.91 1.00 0.95 48\n", " Error 0.98 0.89 0.93 47\n", " HTTP Status 1.00 1.00 1.00 304\n", "Resource Usage 1.00 1.00 1.00 49\n", "Security Alert 1.00 0.99 1.00 123\n", "\n", " accuracy 0.99 571\n", " macro avg 0.98 0.98 0.98 571\n", " weighted avg 0.99 0.99 0.99 571\n", "\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n", "clf = LogisticRegression(max_iter=1000)\n", "clf.fit(X_train, y_train)\n", "y_pred = clf.predict(X_test)\n", "report = classification_report(y_test, y_pred)\n", "print(report)" ] }, { "cell_type": "code", "execution_count": 31, "id": "1317f9b2de813a32", "metadata": { "ExecuteTime": { "end_time": "2025-01-15T20:36:52.942021Z", "start_time": "2025-01-15T20:36:52.910539Z" } }, "outputs": [ { "data": { "text/plain": [ "['../models/log_classifier.joblib']" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import joblib\n", "joblib.dump(clf, '../models/log_classifier.joblib')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0ddb2803", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.6" } }, "nbformat": 4, "nbformat_minor": 5 }