yuvarajareddy001 commited on
Commit
cd218ce
·
1 Parent(s): 53bfb09

Initial commit

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
README.md CHANGED
@@ -9,4 +9,110 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  pinned: false
10
  ---
11
 
12
+ # Optimized Log Classification Using LLMs
13
+ ---
14
+ A comprehensive framework for hybrid log classification that integrates multiple analytical techniques to effectively process and categorize log data.
15
+ This system leverages different methods to handle simple, complex, and sparsely labeled log patterns.
16
+
+ ---
17
+
18
+ ## Overview
19
+
20
+ This project combines three primary classification strategies:
21
+
22
+ - **Regex-based Classification**
23
+ Captures predictable patterns using predefined regular expressions.
24
+
25
+ - **Embedding-based Classification**
26
+ Uses Sentence Transformers to generate embeddings followed by Logistic Regression for nuanced pattern recognition.
27
+
28
+ - **LLM-assisted Classification**
29
+ Employs large language models to classify data when traditional methods struggle due to limited labeled samples.
30
+
31
+ ![System Architecture](arch.png)
32
+
33
+ ---
34
+
35
+ ## Directory Structure
36
+
37
+ - **`training/`**
38
+ Contains notebooks and scripts for training the models and experimenting with different approaches.
39
+
40
+ - **`models/`**
41
+ Stores pre-trained models such as the logistic regression classifier and embedding models.
42
+
43
+ - **`resources/`**
44
+ Holds auxiliary files like CSV datasets, output samples, and images.
45
+
46
+ - **Root Directory**
47
+ Includes the main API server (`server.py`) and the command-line classification utility (`classify.py`).
48
+
49
+ ---
50
+
51
+ ## Installation & Setup
52
+
53
+ 1. **Clone the Repository**
54
+ ```bash
55
+ git clone <your_repository_url>
56
+ ```
57
+
58
+ 2. **Install Dependencies**
59
+ Ensure Python is installed and run:
60
+ ```bash
61
+ pip install -r requirements.txt
62
+ ```
63
+
64
+ 3. **Train the Model (if needed)**
65
+ Open and run the training notebook:
66
+ ```bash
67
+ jupyter notebook training/log_classification.ipynb
68
+ ```
69
+
70
+ 4. **Run the API Server**
71
+ Start the server using one of the following methods:
72
+ - Direct execution:
73
+ ```bash
74
+ python server.py
75
+ ```
76
+ - With Uvicorn:
77
+ ```bash
78
+ uvicorn server:app --reload
79
+ ```
80
+ Once the server is running, the API and its documentation are available at:
81
+ - Classification endpoint: `POST http://127.0.0.1:8000/classify/` (upload the CSV as the `file` form field)
82
+ - Swagger UI: [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
83
+ - Redoc: [http://127.0.0.1:8000/redoc](http://127.0.0.1:8000/redoc)
84
+
85
+ 5. **Run the Streamlit App**
86
+ To start the Streamlit application for log classification:
87
+ ```bash
88
+ streamlit run app.py
89
+ ```
90
+ This command will launch the app in your browser at a URL like http://localhost:8501.
91
+ ---
92
+
93
+ ## Usage Instructions
94
+
95
+ - **Input Data**
96
+ Upload a CSV file with the following columns:
97
+ - `source`
98
+ - `log_message`
99
+
100
+ - **Output**
101
+ The system processes the logs and returns a CSV file with an additional `target_label` column indicating the classification result.
102
+
103
+ ---
104
+
105
+ ## Customization
106
+
107
+ Feel free to modify and extend the classification logic in the following modules:
108
+ - `processor_bert.py`
109
+ - `processor_llm.py`
110
+ - `processor_regex.py`
111
+
112
+ These modules are designed to be flexible, allowing you to tailor the classification approaches to your specific needs.
113
+
114
+ ---
115
+
116
+ ## Contributions
117
+ Contributions, feedback, and feature requests are welcome.
118
+ Please open an issue or submit a pull request on the project's GitHub repository.
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable tokenizer parallelism
3
+
4
+ import streamlit as st
5
+ import pandas as pd
6
+ from classify import classify
7
+ import asyncio
8
+
9
+ # Ensure an asyncio event loop exists
10
+ try:
11
+ asyncio.get_running_loop()
12
+ except RuntimeError:
13
+ asyncio.set_event_loop(asyncio.new_event_loop())
14
+
15
+ st.title("Log Classification App")
16
+ st.markdown("Upload a CSV file with columns `source` and `log_message` to perform log classification, or click the button below to use a default test CSV.")
17
+
18
+ # File uploader for user CSV
19
+ uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
20
+
21
+ # Button to use the default test CSV
22
+ use_test_csv = st.button("Use Test CSV")
23
+
24
+ # Function to process a DataFrame
25
def process_dataframe(df):
    """Preview ``df``, classify its rows, and offer the result as a CSV download.

    Returns the same DataFrame with a ``target_label`` column added, or
    ``None`` when the ``source`` / ``log_message`` columns are missing.
    """
    st.subheader("Input Data Sample")
    st.dataframe(df.head())

    # Both columns feed the classifier, so stop early when either is absent.
    has_required_columns = "source" in df.columns and "log_message" in df.columns
    if not has_required_columns:
        st.error("CSV must contain 'source' and 'log_message' columns.")
        return None

    # Keep a spinner visible while the classifier runs.
    with st.spinner("Classifying logs..."):
        source_message_pairs = list(zip(df["source"], df["log_message"]))
        df["target_label"] = classify(source_message_pairs)

    st.subheader("Output Data Sample")
    st.dataframe(df.head())

    # Serialise without the index and expose the bytes through a download button.
    st.download_button(
        label="Download Output CSV",
        data=df.to_csv(index=False).encode("utf-8"),
        file_name="output.csv",
        mime="text/csv",
    )
    return df
50
+
51
+ # Process the uploaded file if provided
52
+ if uploaded_file is not None:
53
+ try:
54
+ df_input = pd.read_csv(uploaded_file)
55
+ process_dataframe(df_input)
56
+ except Exception as e:
57
+ st.error(f"An error occurred: {e}")
58
+
59
+ # If no file is uploaded and the user clicks the test CSV button
60
+ elif use_test_csv:
61
+ try:
62
+ df_input = pd.read_csv("resources/test.csv")
63
+ process_dataframe(df_input)
64
+ except Exception as e:
65
+ st.error(f"An error occurred while loading the test CSV: {e}")
arch.png ADDED
classify.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from processor_regex import classify_with_regex
2
+ from processor_bert import classify_with_bert
3
+ from processor_llm import classify_with_llm
4
+ import pandas as pd
5
+
6
def classify(logs):
    """Return one label per ``(source, log_message)`` pair, in input order."""
    return [classify_log(origin, message) for origin, message in logs]
12
+
13
+
14
def classify_log(source, log_msg):
    """Pick a classifier for one log line based on where it came from.

    ``LegacyCRM`` messages go straight to the LLM classifier. Every other
    source is tried against the regex rules first, and falls back to the
    embedding classifier when the regex step returns a falsy result.
    """
    if source == "LegacyCRM":
        return classify_with_llm(log_msg)

    regex_label = classify_with_regex(log_msg)
    if regex_label:
        return regex_label
    return classify_with_bert(log_msg)
22
+
23
def classify_csv(input_file, output_file="output.csv"):
    """Classify every row of a CSV file and write the labelled copy to disk.

    Args:
        input_file: Anything ``pandas.read_csv`` accepts; the data must have
            ``source`` and ``log_message`` columns.
        output_file: Where to write the labelled CSV. Defaults to
            ``"output.csv"``, the destination this function previously
            hard-coded.

    Returns:
        The path the labelled CSV was written to.
    """
    df = pd.read_csv(input_file)

    # One label per row, produced from the (source, log_message) pairs.
    df["target_label"] = classify(list(zip(df["source"], df["log_message"])))

    # Write without the index so the output has the input's columns plus the label.
    df.to_csv(output_file, index=False)

    return output_file
34
+
35
if __name__ == '__main__':
    # The sample input is stored under resources/, not the repository root.
    classify_csv("resources/test.csv")
37
+ # logs = [
38
+ # ("ModernCRM", "IP 192.168.133.114 blocked due to potential attack"),
39
+ # ("BillingSystem", "User 12345 logged in."),
40
+ # ("AnalyticsEngine", "File data_6957.csv uploaded successfully by user User265."),
41
+ # ("AnalyticsEngine", "Backup completed successfully."),
42
+ # ("ModernHR", "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"),
43
+ # ("ModernHR", "Admin access escalation detected for user 9429"),
44
+ # ("LegacyCRM", "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."),
45
+ # ("LegacyCRM", "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."),
46
+ # ("LegacyCRM", "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."),
47
+ # ("LegacyCRM", " The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025")
48
+ # ]
49
+ # labels = classify(logs)
50
+ #
51
+ # for log, label in zip(logs, labels):
52
+ # print(log[0], "->", label)
53
+
54
+
models/log_classifier.joblib ADDED
Binary file (16.5 kB). View file
 
processor_bert.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import joblib
2
+ from sentence_transformers import SentenceTransformer
3
+
4
+ model_embedding = SentenceTransformer('all-MiniLM-L6-v2') # Lightweight embedding model
5
+ model_classification = joblib.load("models/log_classifier.joblib")
6
+
7
+
8
def classify_with_bert(log_message):
    """Label a log message using sentence embeddings and the loaded classifier.

    Returns ``"Unclassified"`` when no class reaches a probability of 0.5;
    otherwise returns the classifier's predicted label.
    """
    vectors = model_embedding.encode([log_message])
    class_probs = model_classification.predict_proba(vectors)[0]

    # Only trust the prediction when the best class clears the 0.5 threshold.
    best_prob = max(class_probs)
    if best_prob < 0.5:
        return "Unclassified"
    return model_classification.predict(vectors)[0]
16
+
17
+
18
if __name__ == "__main__":
    # Manual smoke test: print the label assigned to a few sample messages.
    sample_messages = (
        "alpha.osapi_compute.wsgi.server - 12.10.11.1 - API returned 404 not found error",
        "GET /v2/3454/servers/detail HTTP/1.1 RCODE 404 len: 1583 time: 0.1878400",
        "System crashed due to drivers errors when restarting the server",
        "Hey bro, chill ya!",
        "Multiple login failures occurred on user 6454 account",
        "Server A790 was restarted unexpectedly during the process of data transfer",
    )
    for message in sample_messages:
        print(message, "->", classify_with_bert(message))
processor_llm.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from groq import Groq
3
+ import json
4
+ import re
5
+
6
+
7
+ load_dotenv()
8
+
9
+ groq = Groq()
10
+
11
def classify_with_llm(log_msg):
    """Classify a log message by asking the Groq-hosted chat model.

    The model is asked to choose between "Workflow Error" and
    "Deprecation Warning" (or "Unclassified") and to wrap its answer in
    ``<category>`` tags.

    Args:
        log_msg: The raw log message text.

    Returns:
        The text inside the last non-empty ``<category>...</category>`` pair
        of the reply, with surrounding whitespace removed, or
        ``"Unclassified"`` when the reply contains no such pair.
    """
    prompt = f'''Classify the log message into one of these categories:
    (1) Workflow Error, (2) Deprecation Warning.
    If you can't figure out a category, use "Unclassified".
    Put the category inside <category> </category> tags.
    Log message: {log_msg}'''

    chat_completion = groq.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        # model="llama-3.3-70b-versatile",
        model="deepseek-r1-distill-llama-70b",
        temperature=0.5
    )

    # Guard against a reply with no text so the regex call cannot fail on None.
    content = chat_completion.choices[0].message.content or ""

    # Non-greedy, so each tag pair is captured on its own. The reply may
    # mention the tags more than once (for example by echoing the
    # instructions before answering), so scan from the end and take the last
    # pair that actually contains text.
    candidates = re.findall(r'<category>(.*?)</category>', content, flags=re.DOTALL)
    for candidate in reversed(candidates):
        category = candidate.strip()
        if category:
            return category

    return "Unclassified"
37
+
38
+
39
if __name__ == "__main__":
    # Manual smoke test: print the category returned for a few sample messages.
    demo_messages = (
        "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.",
        "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025",
        "System reboot initiated by user 12345.",
    )
    for demo_message in demo_messages:
        print(classify_with_llm(demo_message))
processor_regex.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
# Ordered (compiled pattern, label) rules, built once at import time instead of
# on every call. Sentence-ending periods are escaped so they match a literal "."
# rather than any character.
_REGEX_RULES = tuple(
    (re.compile(pattern), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Return the label of the first rule that matches ``log_message``.

    Rules are tried in declaration order and may match anywhere in the
    message.

    Args:
        log_message: The log message text to classify.

    Returns:
        The matching rule's label, or ``None`` when no rule matches.
    """
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None
17
+
18
if __name__ == "__main__":
    # Manual smoke test: two messages covered by the rules and one that is not.
    for sample in (
        "Backup completed successfully.",
        "Account with ID 1234 created by User1.",
        "Hey Bro, chill ya!",
    ):
        print(classify_with_regex(sample))
22
+
23
+
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ python-dotenv
3
+ groq
4
+ sentence-transformers
5
+ joblib
6
+ pandas
7
+ scikit-learn
8
+ uvicorn
9
+ python-multipart
10
+ streamlit
11
+ watchdog
resources/.DS_Store ADDED
Binary file (6.15 kB). View file
 
resources/output.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,log_message,target_label
2
+ ModernCRM," ""IP 192.168.133.114 blocked due to potential attack""",Security Alert
3
+ BillingSystem," ""User 12345 logged in.""",Security Alert
4
+ AnalyticsEngine," ""File data_6957.csv uploaded successfully by user User265.""",System Notification
5
+ AnalyticsEngine," ""Backup completed successfully.""",System Notification
6
+ ModernHR," ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400""",HTTP Status
7
+ ModernHR," ""Admin access escalation detected for user 9429""",Security Alert
8
+ LegacyCRM," ""Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active.""",Workflow Error
9
+ LegacyCRM," ""Invoice generation process aborted for order ID 8910 due to invalid tax calculation module.""",Workflow Error
10
+ LegacyCRM," ""The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality.""",Deprecation Warning
11
+ LegacyCRM," ""The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025""",Deprecation Warning
resources/test.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,log_message
2
+ ModernCRM, "IP 192.168.133.114 blocked due to potential attack"
3
+ BillingSystem, "User 12345 logged in."
4
+ AnalyticsEngine, "File data_6957.csv uploaded successfully by user User265."
5
+ AnalyticsEngine, "Backup completed successfully."
6
+ ModernHR, "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1 RCODE 200 len: 1583 time: 0.1878400"
7
+ ModernHR, "Admin access escalation detected for user 9429"
8
+ LegacyCRM, "Case escalation for ticket ID 7324 failed because the assigned support agent is no longer active."
9
+ LegacyCRM, "Invoice generation process aborted for order ID 8910 due to invalid tax calculation module."
10
+ LegacyCRM, "The 'BulkEmailSender' feature is no longer supported. Use 'EmailCampaignManager' for improved functionality."
11
+ LegacyCRM, "The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025"
server.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uvicorn
2
+ import pandas as pd
3
+ from fastapi import FastAPI, UploadFile, HTTPException
4
+ from fastapi.responses import FileResponse
5
+ from classify import classify
6
+
7
+ app = FastAPI()
8
+
9
@app.post("/classify/")
async def classify_logs(file: UploadFile):
    """Classify an uploaded CSV of logs and return the labelled CSV.

    Args:
        file: The uploaded CSV; it must contain ``source`` and
            ``log_message`` columns.

    Returns:
        A ``FileResponse`` serving the labelled CSV, which is written to
        ``resources/output.csv``.

    Raises:
        HTTPException: 400 when the upload is not named ``*.csv`` or lacks a
            required column; 500 for any other failure during processing.
    """
    # The filename may be missing; treat that as "not a CSV" instead of
    # failing on None. The extension check is case-insensitive.
    if not (file.filename or "").lower().endswith('.csv'):
        raise HTTPException(status_code=400, detail="File must be a CSV.")

    try:
        # Read the uploaded CSV
        df = pd.read_csv(file.file)
        if "source" not in df.columns or "log_message" not in df.columns:
            raise HTTPException(status_code=400, detail="CSV must contain 'source' and 'log_message' columns.")

        # Perform classification
        df["target_label"] = classify(list(zip(df["source"], df["log_message"])))

        print("Dataframe:",df.to_dict())

        # Save the modified file
        output_file = "resources/output.csv"
        df.to_csv(output_file, index=False)
        print("File saved to output.csv")
        return FileResponse(output_file, media_type='text/csv')
    except HTTPException:
        # Let the deliberate 400 above through unchanged; without this clause
        # the generic handler below would rewrite it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        file.file.close()
34
+ # # Clean up if the file was saved
35
+ # if os.path.exists("output.csv"):
36
+ # os.remove("output.csv")
37
+
38
+
39
+ if __name__ == '__main__':
40
+ uvicorn.run("server:app", host="127.0.0.1", port=8000, reload=True)
training/dataset/synthetic_logs.csv ADDED
The diff for this file is too large to render. See raw diff
 
training/log_classification.ipynb ADDED
@@ -0,0 +1,2348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4ef3464673447a14",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Data Load"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "initial_id",
15
+ "metadata": {
16
+ "ExecuteTime": {
17
+ "end_time": "2025-01-18T21:42:43.965893Z",
18
+ "start_time": "2025-01-18T21:42:43.144021Z"
19
+ },
20
+ "collapsed": true
21
+ },
22
+ "outputs": [
23
+ {
24
+ "data": {
25
+ "text/html": [
26
+ "<div>\n",
27
+ "<style scoped>\n",
28
+ " .dataframe tbody tr th:only-of-type {\n",
29
+ " vertical-align: middle;\n",
30
+ " }\n",
31
+ "\n",
32
+ " .dataframe tbody tr th {\n",
33
+ " vertical-align: top;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe thead th {\n",
37
+ " text-align: right;\n",
38
+ " }\n",
39
+ "</style>\n",
40
+ "<table border=\"1\" class=\"dataframe\">\n",
41
+ " <thead>\n",
42
+ " <tr style=\"text-align: right;\">\n",
43
+ " <th></th>\n",
44
+ " <th>timestamp</th>\n",
45
+ " <th>source</th>\n",
46
+ " <th>log_message</th>\n",
47
+ " <th>target_label</th>\n",
48
+ " <th>complexity</th>\n",
49
+ " </tr>\n",
50
+ " </thead>\n",
51
+ " <tbody>\n",
52
+ " <tr>\n",
53
+ " <th>0</th>\n",
54
+ " <td>2025-06-27 07:20:25</td>\n",
55
+ " <td>ModernCRM</td>\n",
56
+ " <td>nova.osapi_compute.wsgi.server [req-b9718cd8-f...</td>\n",
57
+ " <td>HTTP Status</td>\n",
58
+ " <td>bert</td>\n",
59
+ " </tr>\n",
60
+ " <tr>\n",
61
+ " <th>1</th>\n",
62
+ " <td>1/14/2025 23:07</td>\n",
63
+ " <td>ModernCRM</td>\n",
64
+ " <td>Email service experiencing issues with sending</td>\n",
65
+ " <td>Critical Error</td>\n",
66
+ " <td>bert</td>\n",
67
+ " </tr>\n",
68
+ " <tr>\n",
69
+ " <th>2</th>\n",
70
+ " <td>1/17/2025 1:29</td>\n",
71
+ " <td>AnalyticsEngine</td>\n",
72
+ " <td>Unauthorized access to data was attempted</td>\n",
73
+ " <td>Security Alert</td>\n",
74
+ " <td>bert</td>\n",
75
+ " </tr>\n",
76
+ " <tr>\n",
77
+ " <th>3</th>\n",
78
+ " <td>2025-07-12 00:24:16</td>\n",
79
+ " <td>ModernHR</td>\n",
80
+ " <td>nova.osapi_compute.wsgi.server [req-4895c258-b...</td>\n",
81
+ " <td>HTTP Status</td>\n",
82
+ " <td>bert</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>4</th>\n",
86
+ " <td>2025-06-02 18:25:23</td>\n",
87
+ " <td>BillingSystem</td>\n",
88
+ " <td>nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...</td>\n",
89
+ " <td>HTTP Status</td>\n",
90
+ " <td>bert</td>\n",
91
+ " </tr>\n",
92
+ " </tbody>\n",
93
+ "</table>\n",
94
+ "</div>"
95
+ ],
96
+ "text/plain": [
97
+ " timestamp source \\\n",
98
+ "0 2025-06-27 07:20:25 ModernCRM \n",
99
+ "1 1/14/2025 23:07 ModernCRM \n",
100
+ "2 1/17/2025 1:29 AnalyticsEngine \n",
101
+ "3 2025-07-12 00:24:16 ModernHR \n",
102
+ "4 2025-06-02 18:25:23 BillingSystem \n",
103
+ "\n",
104
+ " log_message target_label \\\n",
105
+ "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n",
106
+ "1 Email service experiencing issues with sending Critical Error \n",
107
+ "2 Unauthorized access to data was attempted Security Alert \n",
108
+ "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n",
109
+ "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n",
110
+ "\n",
111
+ " complexity \n",
112
+ "0 bert \n",
113
+ "1 bert \n",
114
+ "2 bert \n",
115
+ "3 bert \n",
116
+ "4 bert "
117
+ ]
118
+ },
119
+ "execution_count": 1,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "import pandas as pd\n",
126
+ "\n",
127
+ "df = pd.read_csv(\"dataset/synthetic_logs.csv\")\n",
128
+ "df.head()"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 2,
134
+ "id": "4495a73d8e933c6e",
135
+ "metadata": {
136
+ "ExecuteTime": {
137
+ "end_time": "2025-01-15T20:27:53.403158Z",
138
+ "start_time": "2025-01-15T20:27:53.387783Z"
139
+ }
140
+ },
141
+ "outputs": [
142
+ {
143
+ "data": {
144
+ "text/plain": [
145
+ "array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',\n",
146
+ " 'ThirdPartyAPI', 'LegacyCRM'], dtype=object)"
147
+ ]
148
+ },
149
+ "execution_count": 2,
150
+ "metadata": {},
151
+ "output_type": "execute_result"
152
+ }
153
+ ],
154
+ "source": [
155
+ "df.source.unique()"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 3,
161
+ "id": "92da3b13bac914a7",
162
+ "metadata": {
163
+ "ExecuteTime": {
164
+ "end_time": "2025-01-15T20:27:53.466975Z",
165
+ "start_time": "2025-01-15T20:27:53.452028Z"
166
+ }
167
+ },
168
+ "outputs": [
169
+ {
170
+ "data": {
171
+ "text/plain": [
172
+ "array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',\n",
173
+ " 'System Notification', 'Resource Usage', 'User Action',\n",
174
+ " 'Workflow Error', 'Deprecation Warning'], dtype=object)"
175
+ ]
176
+ },
177
+ "execution_count": 3,
178
+ "metadata": {},
179
+ "output_type": "execute_result"
180
+ }
181
+ ],
182
+ "source": [
183
+ "df.target_label.unique()"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 4,
189
+ "id": "b350454e0d700e15",
190
+ "metadata": {
191
+ "ExecuteTime": {
192
+ "end_time": "2025-01-15T20:27:53.537931Z",
193
+ "start_time": "2025-01-15T20:27:53.521598Z"
194
+ }
195
+ },
196
+ "outputs": [
197
+ {
198
+ "data": {
199
+ "text/html": [
200
+ "<div>\n",
201
+ "<style scoped>\n",
202
+ " .dataframe tbody tr th:only-of-type {\n",
203
+ " vertical-align: middle;\n",
204
+ " }\n",
205
+ "\n",
206
+ " .dataframe tbody tr th {\n",
207
+ " vertical-align: top;\n",
208
+ " }\n",
209
+ "\n",
210
+ " .dataframe thead th {\n",
211
+ " text-align: right;\n",
212
+ " }\n",
213
+ "</style>\n",
214
+ "<table border=\"1\" class=\"dataframe\">\n",
215
+ " <thead>\n",
216
+ " <tr style=\"text-align: right;\">\n",
217
+ " <th></th>\n",
218
+ " <th>timestamp</th>\n",
219
+ " <th>source</th>\n",
220
+ " <th>log_message</th>\n",
221
+ " <th>target_label</th>\n",
222
+ " <th>complexity</th>\n",
223
+ " </tr>\n",
224
+ " </thead>\n",
225
+ " <tbody>\n",
226
+ " <tr>\n",
227
+ " <th>1683</th>\n",
228
+ " <td>12/13/2025 5:35</td>\n",
229
+ " <td>ModernCRM</td>\n",
230
+ " <td>Backup completed successfully.</td>\n",
231
+ " <td>System Notification</td>\n",
232
+ " <td>regex</td>\n",
233
+ " </tr>\n",
234
+ " <tr>\n",
235
+ " <th>2289</th>\n",
236
+ " <td>1/23/2025 9:59</td>\n",
237
+ " <td>ThirdPartyAPI</td>\n",
238
+ " <td>System updated to version 2.3.5.</td>\n",
239
+ " <td>System Notification</td>\n",
240
+ " <td>regex</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>1002</th>\n",
244
+ " <td>3/1/2025 7:52</td>\n",
245
+ " <td>ModernCRM</td>\n",
246
+ " <td>Disk cleanup completed successfully.</td>\n",
247
+ " <td>System Notification</td>\n",
248
+ " <td>regex</td>\n",
249
+ " </tr>\n",
250
+ " <tr>\n",
251
+ " <th>1343</th>\n",
252
+ " <td>5/13/2025 15:57</td>\n",
253
+ " <td>ModernCRM</td>\n",
254
+ " <td>System updated to version 5.7.5.</td>\n",
255
+ " <td>System Notification</td>\n",
256
+ " <td>regex</td>\n",
257
+ " </tr>\n",
258
+ " <tr>\n",
259
+ " <th>2395</th>\n",
260
+ " <td>5/2/2025 14:29</td>\n",
261
+ " <td>ThirdPartyAPI</td>\n",
262
+ " <td>Backup ended at 2025-05-06 11:23:16.</td>\n",
263
+ " <td>System Notification</td>\n",
264
+ " <td>regex</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <th>1456</th>\n",
268
+ " <td>11/14/2025 17:55</td>\n",
269
+ " <td>AnalyticsEngine</td>\n",
270
+ " <td>Disk cleanup completed successfully.</td>\n",
271
+ " <td>System Notification</td>\n",
272
+ " <td>regex</td>\n",
273
+ " </tr>\n",
274
+ " <tr>\n",
275
+ " <th>643</th>\n",
276
+ " <td>1/12/2025 5:21</td>\n",
277
+ " <td>ModernHR</td>\n",
278
+ " <td>System updated to version 1.2.9.</td>\n",
279
+ " <td>System Notification</td>\n",
280
+ " <td>regex</td>\n",
281
+ " </tr>\n",
282
+ " <tr>\n",
283
+ " <th>1321</th>\n",
284
+ " <td>4/25/2025 7:16</td>\n",
285
+ " <td>ModernCRM</td>\n",
286
+ " <td>System updated to version 5.4.0.</td>\n",
287
+ " <td>System Notification</td>\n",
288
+ " <td>regex</td>\n",
289
+ " </tr>\n",
290
+ " <tr>\n",
291
+ " <th>841</th>\n",
292
+ " <td>11/7/2025 19:23</td>\n",
293
+ " <td>ModernHR</td>\n",
294
+ " <td>File data_7222.csv uploaded successfully by us...</td>\n",
295
+ " <td>System Notification</td>\n",
296
+ " <td>regex</td>\n",
297
+ " </tr>\n",
298
+ " <tr>\n",
299
+ " <th>2333</th>\n",
300
+ " <td>8/28/2025 2:09</td>\n",
301
+ " <td>ThirdPartyAPI</td>\n",
302
+ " <td>Disk cleanup completed successfully.</td>\n",
303
+ " <td>System Notification</td>\n",
304
+ " <td>regex</td>\n",
305
+ " </tr>\n",
306
+ " </tbody>\n",
307
+ "</table>\n",
308
+ "</div>"
309
+ ],
310
+ "text/plain": [
311
+ " timestamp source \\\n",
312
+ "1683 12/13/2025 5:35 ModernCRM \n",
313
+ "2289 1/23/2025 9:59 ThirdPartyAPI \n",
314
+ "1002 3/1/2025 7:52 ModernCRM \n",
315
+ "1343 5/13/2025 15:57 ModernCRM \n",
316
+ "2395 5/2/2025 14:29 ThirdPartyAPI \n",
317
+ "1456 11/14/2025 17:55 AnalyticsEngine \n",
318
+ "643 1/12/2025 5:21 ModernHR \n",
319
+ "1321 4/25/2025 7:16 ModernCRM \n",
320
+ "841 11/7/2025 19:23 ModernHR \n",
321
+ "2333 8/28/2025 2:09 ThirdPartyAPI \n",
322
+ "\n",
323
+ " log_message target_label \\\n",
324
+ "1683 Backup completed successfully. System Notification \n",
325
+ "2289 System updated to version 2.3.5. System Notification \n",
326
+ "1002 Disk cleanup completed successfully. System Notification \n",
327
+ "1343 System updated to version 5.7.5. System Notification \n",
328
+ "2395 Backup ended at 2025-05-06 11:23:16. System Notification \n",
329
+ "1456 Disk cleanup completed successfully. System Notification \n",
330
+ "643 System updated to version 1.2.9. System Notification \n",
331
+ "1321 System updated to version 5.4.0. System Notification \n",
332
+ "841 File data_7222.csv uploaded successfully by us... System Notification \n",
333
+ "2333 Disk cleanup completed successfully. System Notification \n",
334
+ "\n",
335
+ " complexity \n",
336
+ "1683 regex \n",
337
+ "2289 regex \n",
338
+ "1002 regex \n",
339
+ "1343 regex \n",
340
+ "2395 regex \n",
341
+ "1456 regex \n",
342
+ "643 regex \n",
343
+ "1321 regex \n",
344
+ "841 regex \n",
345
+ "2333 regex "
346
+ ]
347
+ },
348
+ "execution_count": 4,
349
+ "metadata": {},
350
+ "output_type": "execute_result"
351
+ }
352
+ ],
353
+ "source": [
354
+ "df[df.target_label=='System Notification'].sample(10)"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": 5,
360
+ "id": "dc5394d70050e10d",
361
+ "metadata": {
362
+ "ExecuteTime": {
363
+ "end_time": "2025-01-15T20:27:53.626880Z",
364
+ "start_time": "2025-01-15T20:27:53.611502Z"
365
+ }
366
+ },
367
+ "outputs": [
368
+ {
369
+ "data": {
370
+ "text/html": [
371
+ "<div>\n",
372
+ "<style scoped>\n",
373
+ " .dataframe tbody tr th:only-of-type {\n",
374
+ " vertical-align: middle;\n",
375
+ " }\n",
376
+ "\n",
377
+ " .dataframe tbody tr th {\n",
378
+ " vertical-align: top;\n",
379
+ " }\n",
380
+ "\n",
381
+ " .dataframe thead th {\n",
382
+ " text-align: right;\n",
383
+ " }\n",
384
+ "</style>\n",
385
+ "<table border=\"1\" class=\"dataframe\">\n",
386
+ " <thead>\n",
387
+ " <tr style=\"text-align: right;\">\n",
388
+ " <th></th>\n",
389
+ " <th>timestamp</th>\n",
390
+ " <th>source</th>\n",
391
+ " <th>log_message</th>\n",
392
+ " <th>target_label</th>\n",
393
+ " <th>complexity</th>\n",
394
+ " </tr>\n",
395
+ " </thead>\n",
396
+ " <tbody>\n",
397
+ " <tr>\n",
398
+ " <th>36</th>\n",
399
+ " <td>11/19/2025 13:14</td>\n",
400
+ " <td>BillingSystem</td>\n",
401
+ " <td>System reboot initiated by user User243.</td>\n",
402
+ " <td>System Notification</td>\n",
403
+ " <td>regex</td>\n",
404
+ " </tr>\n",
405
+ " <tr>\n",
406
+ " <th>92</th>\n",
407
+ " <td>12/4/2025 21:20</td>\n",
408
+ " <td>BillingSystem</td>\n",
409
+ " <td>System reboot initiated by user User471.</td>\n",
410
+ " <td>System Notification</td>\n",
411
+ " <td>regex</td>\n",
412
+ " </tr>\n",
413
+ " <tr>\n",
414
+ " <th>139</th>\n",
415
+ " <td>5/8/2025 16:34</td>\n",
416
+ " <td>ModernHR</td>\n",
417
+ " <td>System reboot initiated by user User216.</td>\n",
418
+ " <td>System Notification</td>\n",
419
+ " <td>regex</td>\n",
420
+ " </tr>\n",
421
+ " <tr>\n",
422
+ " <th>140</th>\n",
423
+ " <td>9/11/2025 8:49</td>\n",
424
+ " <td>AnalyticsEngine</td>\n",
425
+ " <td>System reboot initiated by user User639.</td>\n",
426
+ " <td>System Notification</td>\n",
427
+ " <td>regex</td>\n",
428
+ " </tr>\n",
429
+ " <tr>\n",
430
+ " <th>161</th>\n",
431
+ " <td>3/31/2025 19:40</td>\n",
432
+ " <td>BillingSystem</td>\n",
433
+ " <td>System reboot initiated by user User819.</td>\n",
434
+ " <td>System Notification</td>\n",
435
+ " <td>regex</td>\n",
436
+ " </tr>\n",
437
+ " <tr>\n",
438
+ " <th>163</th>\n",
439
+ " <td>6/6/2025 15:29</td>\n",
440
+ " <td>BillingSystem</td>\n",
441
+ " <td>System reboot initiated by user User938.</td>\n",
442
+ " <td>System Notification</td>\n",
443
+ " <td>regex</td>\n",
444
+ " </tr>\n",
445
+ " <tr>\n",
446
+ " <th>307</th>\n",
447
+ " <td>4/12/2025 0:41</td>\n",
448
+ " <td>BillingSystem</td>\n",
449
+ " <td>System reboot initiated by user User929.</td>\n",
450
+ " <td>System Notification</td>\n",
451
+ " <td>regex</td>\n",
452
+ " </tr>\n",
453
+ " <tr>\n",
454
+ " <th>365</th>\n",
455
+ " <td>10/20/2025 22:32</td>\n",
456
+ " <td>ModernHR</td>\n",
457
+ " <td>System reboot initiated by user User533.</td>\n",
458
+ " <td>System Notification</td>\n",
459
+ " <td>regex</td>\n",
460
+ " </tr>\n",
461
+ " <tr>\n",
462
+ " <th>508</th>\n",
463
+ " <td>4/15/2025 2:04</td>\n",
464
+ " <td>ThirdPartyAPI</td>\n",
465
+ " <td>System reboot initiated by user User591.</td>\n",
466
+ " <td>System Notification</td>\n",
467
+ " <td>regex</td>\n",
468
+ " </tr>\n",
469
+ " <tr>\n",
470
+ " <th>552</th>\n",
471
+ " <td>9/22/2025 20:54</td>\n",
472
+ " <td>ModernHR</td>\n",
473
+ " <td>System reboot initiated by user User421.</td>\n",
474
+ " <td>System Notification</td>\n",
475
+ " <td>regex</td>\n",
476
+ " </tr>\n",
477
+ " <tr>\n",
478
+ " <th>668</th>\n",
479
+ " <td>9/5/2025 7:14</td>\n",
480
+ " <td>ModernHR</td>\n",
481
+ " <td>System reboot initiated by user User297.</td>\n",
482
+ " <td>System Notification</td>\n",
483
+ " <td>regex</td>\n",
484
+ " </tr>\n",
485
+ " <tr>\n",
486
+ " <th>693</th>\n",
487
+ " <td>7/6/2025 21:40</td>\n",
488
+ " <td>BillingSystem</td>\n",
489
+ " <td>System reboot initiated by user User159.</td>\n",
490
+ " <td>System Notification</td>\n",
491
+ " <td>regex</td>\n",
492
+ " </tr>\n",
493
+ " <tr>\n",
494
+ " <th>697</th>\n",
495
+ " <td>3/13/2025 7:09</td>\n",
496
+ " <td>BillingSystem</td>\n",
497
+ " <td>System reboot initiated by user User648.</td>\n",
498
+ " <td>System Notification</td>\n",
499
+ " <td>regex</td>\n",
500
+ " </tr>\n",
501
+ " <tr>\n",
502
+ " <th>714</th>\n",
503
+ " <td>9/25/2025 23:35</td>\n",
504
+ " <td>ThirdPartyAPI</td>\n",
505
+ " <td>System reboot initiated by user User600.</td>\n",
506
+ " <td>System Notification</td>\n",
507
+ " <td>regex</td>\n",
508
+ " </tr>\n",
509
+ " <tr>\n",
510
+ " <th>730</th>\n",
511
+ " <td>5/24/2025 11:08</td>\n",
512
+ " <td>AnalyticsEngine</td>\n",
513
+ " <td>System reboot initiated by user User120.</td>\n",
514
+ " <td>System Notification</td>\n",
515
+ " <td>regex</td>\n",
516
+ " </tr>\n",
517
+ " <tr>\n",
518
+ " <th>800</th>\n",
519
+ " <td>8/15/2025 12:14</td>\n",
520
+ " <td>BillingSystem</td>\n",
521
+ " <td>System reboot initiated by user User901.</td>\n",
522
+ " <td>System Notification</td>\n",
523
+ " <td>regex</td>\n",
524
+ " </tr>\n",
525
+ " <tr>\n",
526
+ " <th>837</th>\n",
527
+ " <td>4/9/2025 8:28</td>\n",
528
+ " <td>AnalyticsEngine</td>\n",
529
+ " <td>System reboot initiated by user User876.</td>\n",
530
+ " <td>System Notification</td>\n",
531
+ " <td>regex</td>\n",
532
+ " </tr>\n",
533
+ " <tr>\n",
534
+ " <th>852</th>\n",
535
+ " <td>3/31/2025 5:20</td>\n",
536
+ " <td>ModernCRM</td>\n",
537
+ " <td>System reboot initiated by user User811.</td>\n",
538
+ " <td>System Notification</td>\n",
539
+ " <td>regex</td>\n",
540
+ " </tr>\n",
541
+ " <tr>\n",
542
+ " <th>865</th>\n",
543
+ " <td>2/25/2025 1:40</td>\n",
544
+ " <td>AnalyticsEngine</td>\n",
545
+ " <td>System reboot initiated by user User964.</td>\n",
546
+ " <td>System Notification</td>\n",
547
+ " <td>regex</td>\n",
548
+ " </tr>\n",
549
+ " <tr>\n",
550
+ " <th>889</th>\n",
551
+ " <td>11/30/2025 13:45</td>\n",
552
+ " <td>ModernHR</td>\n",
553
+ " <td>System reboot initiated by user User766.</td>\n",
554
+ " <td>System Notification</td>\n",
555
+ " <td>regex</td>\n",
556
+ " </tr>\n",
557
+ " <tr>\n",
558
+ " <th>896</th>\n",
559
+ " <td>7/28/2025 11:24</td>\n",
560
+ " <td>BillingSystem</td>\n",
561
+ " <td>System reboot initiated by user User765.</td>\n",
562
+ " <td>System Notification</td>\n",
563
+ " <td>regex</td>\n",
564
+ " </tr>\n",
565
+ " <tr>\n",
566
+ " <th>988</th>\n",
567
+ " <td>9/11/2025 22:23</td>\n",
568
+ " <td>BillingSystem</td>\n",
569
+ " <td>System reboot initiated by user User427.</td>\n",
570
+ " <td>System Notification</td>\n",
571
+ " <td>regex</td>\n",
572
+ " </tr>\n",
573
+ " <tr>\n",
574
+ " <th>1106</th>\n",
575
+ " <td>12/28/2025 13:32</td>\n",
576
+ " <td>ModernHR</td>\n",
577
+ " <td>System reboot initiated by user User246.</td>\n",
578
+ " <td>System Notification</td>\n",
579
+ " <td>regex</td>\n",
580
+ " </tr>\n",
581
+ " <tr>\n",
582
+ " <th>1159</th>\n",
583
+ " <td>8/5/2025 6:52</td>\n",
584
+ " <td>BillingSystem</td>\n",
585
+ " <td>System reboot initiated by user User329.</td>\n",
586
+ " <td>System Notification</td>\n",
587
+ " <td>regex</td>\n",
588
+ " </tr>\n",
589
+ " <tr>\n",
590
+ " <th>1194</th>\n",
591
+ " <td>7/20/2025 9:10</td>\n",
592
+ " <td>AnalyticsEngine</td>\n",
593
+ " <td>System reboot initiated by user User747.</td>\n",
594
+ " <td>System Notification</td>\n",
595
+ " <td>regex</td>\n",
596
+ " </tr>\n",
597
+ " <tr>\n",
598
+ " <th>1275</th>\n",
599
+ " <td>7/15/2025 23:37</td>\n",
600
+ " <td>BillingSystem</td>\n",
601
+ " <td>System reboot initiated by user User829.</td>\n",
602
+ " <td>System Notification</td>\n",
603
+ " <td>regex</td>\n",
604
+ " </tr>\n",
605
+ " <tr>\n",
606
+ " <th>1299</th>\n",
607
+ " <td>7/15/2025 19:19</td>\n",
608
+ " <td>BillingSystem</td>\n",
609
+ " <td>System reboot initiated by user User966.</td>\n",
610
+ " <td>System Notification</td>\n",
611
+ " <td>regex</td>\n",
612
+ " </tr>\n",
613
+ " <tr>\n",
614
+ " <th>1304</th>\n",
615
+ " <td>8/10/2025 6:18</td>\n",
616
+ " <td>ThirdPartyAPI</td>\n",
617
+ " <td>System reboot initiated by user User758.</td>\n",
618
+ " <td>System Notification</td>\n",
619
+ " <td>regex</td>\n",
620
+ " </tr>\n",
621
+ " <tr>\n",
622
+ " <th>1524</th>\n",
623
+ " <td>11/30/2025 2:39</td>\n",
624
+ " <td>ThirdPartyAPI</td>\n",
625
+ " <td>System reboot initiated by user User278.</td>\n",
626
+ " <td>System Notification</td>\n",
627
+ " <td>regex</td>\n",
628
+ " </tr>\n",
629
+ " <tr>\n",
630
+ " <th>1562</th>\n",
631
+ " <td>8/18/2025 4:17</td>\n",
632
+ " <td>ThirdPartyAPI</td>\n",
633
+ " <td>System reboot initiated by user User648.</td>\n",
634
+ " <td>System Notification</td>\n",
635
+ " <td>regex</td>\n",
636
+ " </tr>\n",
637
+ " <tr>\n",
638
+ " <th>1624</th>\n",
639
+ " <td>12/14/2025 5:14</td>\n",
640
+ " <td>AnalyticsEngine</td>\n",
641
+ " <td>System reboot initiated by user User268.</td>\n",
642
+ " <td>System Notification</td>\n",
643
+ " <td>regex</td>\n",
644
+ " </tr>\n",
645
+ " <tr>\n",
646
+ " <th>1663</th>\n",
647
+ " <td>10/27/2025 22:04</td>\n",
648
+ " <td>AnalyticsEngine</td>\n",
649
+ " <td>System reboot initiated by user User315.</td>\n",
650
+ " <td>System Notification</td>\n",
651
+ " <td>regex</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <th>1776</th>\n",
655
+ " <td>2/21/2025 11:56</td>\n",
656
+ " <td>ModernHR</td>\n",
657
+ " <td>System reboot initiated by user User155.</td>\n",
658
+ " <td>System Notification</td>\n",
659
+ " <td>regex</td>\n",
660
+ " </tr>\n",
661
+ " <tr>\n",
662
+ " <th>1803</th>\n",
663
+ " <td>8/22/2025 6:30</td>\n",
664
+ " <td>AnalyticsEngine</td>\n",
665
+ " <td>System reboot initiated by user User204.</td>\n",
666
+ " <td>System Notification</td>\n",
667
+ " <td>regex</td>\n",
668
+ " </tr>\n",
669
+ " <tr>\n",
670
+ " <th>1804</th>\n",
671
+ " <td>8/26/2025 21:06</td>\n",
672
+ " <td>ModernHR</td>\n",
673
+ " <td>System reboot initiated by user User899.</td>\n",
674
+ " <td>System Notification</td>\n",
675
+ " <td>regex</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <th>1852</th>\n",
679
+ " <td>1/26/2025 12:34</td>\n",
680
+ " <td>AnalyticsEngine</td>\n",
681
+ " <td>System reboot initiated by user User223.</td>\n",
682
+ " <td>System Notification</td>\n",
683
+ " <td>regex</td>\n",
684
+ " </tr>\n",
685
+ " <tr>\n",
686
+ " <th>1865</th>\n",
687
+ " <td>5/11/2025 10:58</td>\n",
688
+ " <td>AnalyticsEngine</td>\n",
689
+ " <td>System reboot initiated by user User932.</td>\n",
690
+ " <td>System Notification</td>\n",
691
+ " <td>regex</td>\n",
692
+ " </tr>\n",
693
+ " <tr>\n",
694
+ " <th>1956</th>\n",
695
+ " <td>9/26/2025 19:32</td>\n",
696
+ " <td>ThirdPartyAPI</td>\n",
697
+ " <td>System reboot initiated by user User264.</td>\n",
698
+ " <td>System Notification</td>\n",
699
+ " <td>regex</td>\n",
700
+ " </tr>\n",
701
+ " <tr>\n",
702
+ " <th>2003</th>\n",
703
+ " <td>6/23/2025 17:54</td>\n",
704
+ " <td>ModernCRM</td>\n",
705
+ " <td>System reboot initiated by user User517.</td>\n",
706
+ " <td>System Notification</td>\n",
707
+ " <td>regex</td>\n",
708
+ " </tr>\n",
709
+ " <tr>\n",
710
+ " <th>2014</th>\n",
711
+ " <td>12/25/2025 4:33</td>\n",
712
+ " <td>AnalyticsEngine</td>\n",
713
+ " <td>System reboot initiated by user User293.</td>\n",
714
+ " <td>System Notification</td>\n",
715
+ " <td>regex</td>\n",
716
+ " </tr>\n",
717
+ " <tr>\n",
718
+ " <th>2043</th>\n",
719
+ " <td>9/12/2025 20:20</td>\n",
720
+ " <td>ThirdPartyAPI</td>\n",
721
+ " <td>System reboot initiated by user User262.</td>\n",
722
+ " <td>System Notification</td>\n",
723
+ " <td>regex</td>\n",
724
+ " </tr>\n",
725
+ " <tr>\n",
726
+ " <th>2074</th>\n",
727
+ " <td>9/13/2025 8:43</td>\n",
728
+ " <td>ModernCRM</td>\n",
729
+ " <td>System reboot initiated by user User937.</td>\n",
730
+ " <td>System Notification</td>\n",
731
+ " <td>regex</td>\n",
732
+ " </tr>\n",
733
+ " <tr>\n",
734
+ " <th>2228</th>\n",
735
+ " <td>9/3/2025 11:24</td>\n",
736
+ " <td>ModernHR</td>\n",
737
+ " <td>System reboot initiated by user User179.</td>\n",
738
+ " <td>System Notification</td>\n",
739
+ " <td>regex</td>\n",
740
+ " </tr>\n",
741
+ " <tr>\n",
742
+ " <th>2243</th>\n",
743
+ " <td>1/16/2025 7:22</td>\n",
744
+ " <td>ModernHR</td>\n",
745
+ " <td>System reboot initiated by user User770.</td>\n",
746
+ " <td>System Notification</td>\n",
747
+ " <td>regex</td>\n",
748
+ " </tr>\n",
749
+ " <tr>\n",
750
+ " <th>2246</th>\n",
751
+ " <td>3/2/2025 22:56</td>\n",
752
+ " <td>ModernHR</td>\n",
753
+ " <td>System reboot initiated by user User488.</td>\n",
754
+ " <td>System Notification</td>\n",
755
+ " <td>regex</td>\n",
756
+ " </tr>\n",
757
+ " <tr>\n",
758
+ " <th>2253</th>\n",
759
+ " <td>10/7/2025 2:20</td>\n",
760
+ " <td>ModernHR</td>\n",
761
+ " <td>System reboot initiated by user User644.</td>\n",
762
+ " <td>System Notification</td>\n",
763
+ " <td>regex</td>\n",
764
+ " </tr>\n",
765
+ " <tr>\n",
766
+ " <th>2317</th>\n",
767
+ " <td>3/7/2025 5:44</td>\n",
768
+ " <td>BillingSystem</td>\n",
769
+ " <td>System reboot initiated by user User724.</td>\n",
770
+ " <td>System Notification</td>\n",
771
+ " <td>regex</td>\n",
772
+ " </tr>\n",
773
+ " <tr>\n",
774
+ " <th>2360</th>\n",
775
+ " <td>5/1/2025 4:21</td>\n",
776
+ " <td>ThirdPartyAPI</td>\n",
777
+ " <td>System reboot initiated by user User876.</td>\n",
778
+ " <td>System Notification</td>\n",
779
+ " <td>regex</td>\n",
780
+ " </tr>\n",
781
+ " </tbody>\n",
782
+ "</table>\n",
783
+ "</div>"
784
+ ],
785
+ "text/plain": [
786
+ " timestamp source \\\n",
787
+ "36 11/19/2025 13:14 BillingSystem \n",
788
+ "92 12/4/2025 21:20 BillingSystem \n",
789
+ "139 5/8/2025 16:34 ModernHR \n",
790
+ "140 9/11/2025 8:49 AnalyticsEngine \n",
791
+ "161 3/31/2025 19:40 BillingSystem \n",
792
+ "163 6/6/2025 15:29 BillingSystem \n",
793
+ "307 4/12/2025 0:41 BillingSystem \n",
794
+ "365 10/20/2025 22:32 ModernHR \n",
795
+ "508 4/15/2025 2:04 ThirdPartyAPI \n",
796
+ "552 9/22/2025 20:54 ModernHR \n",
797
+ "668 9/5/2025 7:14 ModernHR \n",
798
+ "693 7/6/2025 21:40 BillingSystem \n",
799
+ "697 3/13/2025 7:09 BillingSystem \n",
800
+ "714 9/25/2025 23:35 ThirdPartyAPI \n",
801
+ "730 5/24/2025 11:08 AnalyticsEngine \n",
802
+ "800 8/15/2025 12:14 BillingSystem \n",
803
+ "837 4/9/2025 8:28 AnalyticsEngine \n",
804
+ "852 3/31/2025 5:20 ModernCRM \n",
805
+ "865 2/25/2025 1:40 AnalyticsEngine \n",
806
+ "889 11/30/2025 13:45 ModernHR \n",
807
+ "896 7/28/2025 11:24 BillingSystem \n",
808
+ "988 9/11/2025 22:23 BillingSystem \n",
809
+ "1106 12/28/2025 13:32 ModernHR \n",
810
+ "1159 8/5/2025 6:52 BillingSystem \n",
811
+ "1194 7/20/2025 9:10 AnalyticsEngine \n",
812
+ "1275 7/15/2025 23:37 BillingSystem \n",
813
+ "1299 7/15/2025 19:19 BillingSystem \n",
814
+ "1304 8/10/2025 6:18 ThirdPartyAPI \n",
815
+ "1524 11/30/2025 2:39 ThirdPartyAPI \n",
816
+ "1562 8/18/2025 4:17 ThirdPartyAPI \n",
817
+ "1624 12/14/2025 5:14 AnalyticsEngine \n",
818
+ "1663 10/27/2025 22:04 AnalyticsEngine \n",
819
+ "1776 2/21/2025 11:56 ModernHR \n",
820
+ "1803 8/22/2025 6:30 AnalyticsEngine \n",
821
+ "1804 8/26/2025 21:06 ModernHR \n",
822
+ "1852 1/26/2025 12:34 AnalyticsEngine \n",
823
+ "1865 5/11/2025 10:58 AnalyticsEngine \n",
824
+ "1956 9/26/2025 19:32 ThirdPartyAPI \n",
825
+ "2003 6/23/2025 17:54 ModernCRM \n",
826
+ "2014 12/25/2025 4:33 AnalyticsEngine \n",
827
+ "2043 9/12/2025 20:20 ThirdPartyAPI \n",
828
+ "2074 9/13/2025 8:43 ModernCRM \n",
829
+ "2228 9/3/2025 11:24 ModernHR \n",
830
+ "2243 1/16/2025 7:22 ModernHR \n",
831
+ "2246 3/2/2025 22:56 ModernHR \n",
832
+ "2253 10/7/2025 2:20 ModernHR \n",
833
+ "2317 3/7/2025 5:44 BillingSystem \n",
834
+ "2360 5/1/2025 4:21 ThirdPartyAPI \n",
835
+ "\n",
836
+ " log_message target_label complexity \n",
837
+ "36 System reboot initiated by user User243. System Notification regex \n",
838
+ "92 System reboot initiated by user User471. System Notification regex \n",
839
+ "139 System reboot initiated by user User216. System Notification regex \n",
840
+ "140 System reboot initiated by user User639. System Notification regex \n",
841
+ "161 System reboot initiated by user User819. System Notification regex \n",
842
+ "163 System reboot initiated by user User938. System Notification regex \n",
843
+ "307 System reboot initiated by user User929. System Notification regex \n",
844
+ "365 System reboot initiated by user User533. System Notification regex \n",
845
+ "508 System reboot initiated by user User591. System Notification regex \n",
846
+ "552 System reboot initiated by user User421. System Notification regex \n",
847
+ "668 System reboot initiated by user User297. System Notification regex \n",
848
+ "693 System reboot initiated by user User159. System Notification regex \n",
849
+ "697 System reboot initiated by user User648. System Notification regex \n",
850
+ "714 System reboot initiated by user User600. System Notification regex \n",
851
+ "730 System reboot initiated by user User120. System Notification regex \n",
852
+ "800 System reboot initiated by user User901. System Notification regex \n",
853
+ "837 System reboot initiated by user User876. System Notification regex \n",
854
+ "852 System reboot initiated by user User811. System Notification regex \n",
855
+ "865 System reboot initiated by user User964. System Notification regex \n",
856
+ "889 System reboot initiated by user User766. System Notification regex \n",
857
+ "896 System reboot initiated by user User765. System Notification regex \n",
858
+ "988 System reboot initiated by user User427. System Notification regex \n",
859
+ "1106 System reboot initiated by user User246. System Notification regex \n",
860
+ "1159 System reboot initiated by user User329. System Notification regex \n",
861
+ "1194 System reboot initiated by user User747. System Notification regex \n",
862
+ "1275 System reboot initiated by user User829. System Notification regex \n",
863
+ "1299 System reboot initiated by user User966. System Notification regex \n",
864
+ "1304 System reboot initiated by user User758. System Notification regex \n",
865
+ "1524 System reboot initiated by user User278. System Notification regex \n",
866
+ "1562 System reboot initiated by user User648. System Notification regex \n",
867
+ "1624 System reboot initiated by user User268. System Notification regex \n",
868
+ "1663 System reboot initiated by user User315. System Notification regex \n",
869
+ "1776 System reboot initiated by user User155. System Notification regex \n",
870
+ "1803 System reboot initiated by user User204. System Notification regex \n",
871
+ "1804 System reboot initiated by user User899. System Notification regex \n",
872
+ "1852 System reboot initiated by user User223. System Notification regex \n",
873
+ "1865 System reboot initiated by user User932. System Notification regex \n",
874
+ "1956 System reboot initiated by user User264. System Notification regex \n",
875
+ "2003 System reboot initiated by user User517. System Notification regex \n",
876
+ "2014 System reboot initiated by user User293. System Notification regex \n",
877
+ "2043 System reboot initiated by user User262. System Notification regex \n",
878
+ "2074 System reboot initiated by user User937. System Notification regex \n",
879
+ "2228 System reboot initiated by user User179. System Notification regex \n",
880
+ "2243 System reboot initiated by user User770. System Notification regex \n",
881
+ "2246 System reboot initiated by user User488. System Notification regex \n",
882
+ "2253 System reboot initiated by user User644. System Notification regex \n",
883
+ "2317 System reboot initiated by user User724. System Notification regex \n",
884
+ "2360 System reboot initiated by user User876. System Notification regex "
885
+ ]
886
+ },
887
+ "execution_count": 5,
888
+ "metadata": {},
889
+ "output_type": "execute_result"
890
+ }
891
+ ],
892
+ "source": [
893
+ "df[df.log_message.str.startswith(\"System reboot initiated by user\")]"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "markdown",
898
+ "id": "f9848e705b7eaa60",
899
+ "metadata": {},
900
+ "source": [
901
+ "### Clustering"
902
+ ]
903
+ },
904
+ {
905
+ "cell_type": "code",
906
+ "execution_count": 6,
907
+ "id": "4ac33c95fa16ebc0",
908
+ "metadata": {
909
+ "ExecuteTime": {
910
+ "end_time": "2025-01-15T20:27:53.727373Z",
911
+ "start_time": "2025-01-15T20:27:53.711739Z"
912
+ }
913
+ },
914
+ "outputs": [
915
+ {
916
+ "name": "stderr",
917
+ "output_type": "stream",
918
+ "text": [
919
+ "/Users/yuvaraj/Desktop/projects/project-nlp-log-classification/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
920
+ " from .autonotebook import tqdm as notebook_tqdm\n"
921
+ ]
922
+ }
923
+ ],
924
+ "source": [
925
+ "from sklearn.cluster import DBSCAN\n",
926
+ "from sentence_transformers import SentenceTransformer"
927
+ ]
928
+ },
929
+ {
930
+ "cell_type": "code",
931
+ "execution_count": 7,
932
+ "id": "6722e5924d2a1fc0",
933
+ "metadata": {
934
+ "ExecuteTime": {
935
+ "end_time": "2025-01-15T20:27:57.371284Z",
936
+ "start_time": "2025-01-15T20:27:53.820041Z"
937
+ }
938
+ },
939
+ "outputs": [],
940
+ "source": [
941
+ "model = SentenceTransformer('all-MiniLM-L6-v2') # Lightweight embedding model\n",
942
+ "embeddings = model.encode(df['log_message'].tolist())"
943
+ ]
944
+ },
945
+ {
946
+ "cell_type": "code",
947
+ "execution_count": 11,
948
+ "id": "8e97b58b60296c93",
949
+ "metadata": {
950
+ "ExecuteTime": {
951
+ "end_time": "2025-01-15T20:27:57.391292Z",
952
+ "start_time": "2025-01-15T20:27:57.371284Z"
953
+ }
954
+ },
955
+ "outputs": [
956
+ {
957
+ "data": {
958
+ "text/plain": [
959
+ "array([[-0.10293969, 0.03354594, -0.02202599, ..., 0.00457783,\n",
960
+ " -0.04259719, 0.00322625],\n",
961
+ " [ 0.00804575, -0.03573925, 0.0493874 , ..., 0.01538318,\n",
962
+ " -0.0623095 , -0.02774667],\n",
963
+ " [-0.00908216, 0.13003923, -0.05275571, ..., 0.02014102,\n",
964
+ " -0.05117101, -0.02930292],\n",
965
+ " [-0.0975106 , 0.04911299, -0.03977427, ..., 0.02477493,\n",
966
+ " -0.03546083, -0.00018602],\n",
967
+ " [-0.10468345, 0.05926038, -0.02488496, ..., 0.0250205 ,\n",
968
+ " -0.03719296, -0.02568912]], shape=(5, 384), dtype=float32)"
969
+ ]
970
+ },
971
+ "execution_count": 11,
972
+ "metadata": {},
973
+ "output_type": "execute_result"
974
+ }
975
+ ],
976
+ "source": [
977
+ "embeddings[:5]"
978
+ ]
979
+ },
980
+ {
981
+ "cell_type": "code",
982
+ "execution_count": 12,
983
+ "id": "797b761439f42836",
984
+ "metadata": {
985
+ "ExecuteTime": {
986
+ "end_time": "2025-01-15T20:27:57.492015Z",
987
+ "start_time": "2025-01-15T20:27:57.421383Z"
988
+ }
989
+ },
990
+ "outputs": [],
991
+ "source": [
992
+ "clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)\n",
993
+ "df['cluster'] = clustering.labels_"
994
+ ]
995
+ },
996
+ {
997
+ "cell_type": "code",
998
+ "execution_count": 13,
999
+ "id": "f86db1d238061a83",
1000
+ "metadata": {
1001
+ "ExecuteTime": {
1002
+ "end_time": "2025-01-15T20:27:57.554699Z",
1003
+ "start_time": "2025-01-15T20:27:57.540698Z"
1004
+ }
1005
+ },
1006
+ "outputs": [
1007
+ {
1008
+ "data": {
1009
+ "text/html": [
1010
+ "<div>\n",
1011
+ "<style scoped>\n",
1012
+ " .dataframe tbody tr th:only-of-type {\n",
1013
+ " vertical-align: middle;\n",
1014
+ " }\n",
1015
+ "\n",
1016
+ " .dataframe tbody tr th {\n",
1017
+ " vertical-align: top;\n",
1018
+ " }\n",
1019
+ "\n",
1020
+ " .dataframe thead th {\n",
1021
+ " text-align: right;\n",
1022
+ " }\n",
1023
+ "</style>\n",
1024
+ "<table border=\"1\" class=\"dataframe\">\n",
1025
+ " <thead>\n",
1026
+ " <tr style=\"text-align: right;\">\n",
1027
+ " <th></th>\n",
1028
+ " <th>timestamp</th>\n",
1029
+ " <th>source</th>\n",
1030
+ " <th>log_message</th>\n",
1031
+ " <th>target_label</th>\n",
1032
+ " <th>complexity</th>\n",
1033
+ " <th>cluster</th>\n",
1034
+ " </tr>\n",
1035
+ " </thead>\n",
1036
+ " <tbody>\n",
1037
+ " <tr>\n",
1038
+ " <th>0</th>\n",
1039
+ " <td>2025-06-27 07:20:25</td>\n",
1040
+ " <td>ModernCRM</td>\n",
1041
+ " <td>nova.osapi_compute.wsgi.server [req-b9718cd8-f...</td>\n",
1042
+ " <td>HTTP Status</td>\n",
1043
+ " <td>bert</td>\n",
1044
+ " <td>0</td>\n",
1045
+ " </tr>\n",
1046
+ " <tr>\n",
1047
+ " <th>1</th>\n",
1048
+ " <td>1/14/2025 23:07</td>\n",
1049
+ " <td>ModernCRM</td>\n",
1050
+ " <td>Email service experiencing issues with sending</td>\n",
1051
+ " <td>Critical Error</td>\n",
1052
+ " <td>bert</td>\n",
1053
+ " <td>1</td>\n",
1054
+ " </tr>\n",
1055
+ " <tr>\n",
1056
+ " <th>2</th>\n",
1057
+ " <td>1/17/2025 1:29</td>\n",
1058
+ " <td>AnalyticsEngine</td>\n",
1059
+ " <td>Unauthorized access to data was attempted</td>\n",
1060
+ " <td>Security Alert</td>\n",
1061
+ " <td>bert</td>\n",
1062
+ " <td>2</td>\n",
1063
+ " </tr>\n",
1064
+ " <tr>\n",
1065
+ " <th>3</th>\n",
1066
+ " <td>2025-07-12 00:24:16</td>\n",
1067
+ " <td>ModernHR</td>\n",
1068
+ " <td>nova.osapi_compute.wsgi.server [req-4895c258-b...</td>\n",
1069
+ " <td>HTTP Status</td>\n",
1070
+ " <td>bert</td>\n",
1071
+ " <td>0</td>\n",
1072
+ " </tr>\n",
1073
+ " <tr>\n",
1074
+ " <th>4</th>\n",
1075
+ " <td>2025-06-02 18:25:23</td>\n",
1076
+ " <td>BillingSystem</td>\n",
1077
+ " <td>nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...</td>\n",
1078
+ " <td>HTTP Status</td>\n",
1079
+ " <td>bert</td>\n",
1080
+ " <td>0</td>\n",
1081
+ " </tr>\n",
1082
+ " </tbody>\n",
1083
+ "</table>\n",
1084
+ "</div>"
1085
+ ],
1086
+ "text/plain": [
1087
+ " timestamp source \\\n",
1088
+ "0 2025-06-27 07:20:25 ModernCRM \n",
1089
+ "1 1/14/2025 23:07 ModernCRM \n",
1090
+ "2 1/17/2025 1:29 AnalyticsEngine \n",
1091
+ "3 2025-07-12 00:24:16 ModernHR \n",
1092
+ "4 2025-06-02 18:25:23 BillingSystem \n",
1093
+ "\n",
1094
+ " log_message target_label \\\n",
1095
+ "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n",
1096
+ "1 Email service experiencing issues with sending Critical Error \n",
1097
+ "2 Unauthorized access to data was attempted Security Alert \n",
1098
+ "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n",
1099
+ "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n",
1100
+ "\n",
1101
+ " complexity cluster \n",
1102
+ "0 bert 0 \n",
1103
+ "1 bert 1 \n",
1104
+ "2 bert 2 \n",
1105
+ "3 bert 0 \n",
1106
+ "4 bert 0 "
1107
+ ]
1108
+ },
1109
+ "execution_count": 13,
1110
+ "metadata": {},
1111
+ "output_type": "execute_result"
1112
+ }
1113
+ ],
1114
+ "source": [
1115
+ "df.head()"
1116
+ ]
1117
+ },
1118
+ {
1119
+ "cell_type": "code",
1120
+ "execution_count": 14,
1121
+ "id": "d3ec326ef8793ed8",
1122
+ "metadata": {
1123
+ "ExecuteTime": {
1124
+ "end_time": "2025-01-15T20:27:57.689518Z",
1125
+ "start_time": "2025-01-15T20:27:57.676503Z"
1126
+ }
1127
+ },
1128
+ "outputs": [],
1129
+ "source": [
1130
+ "# Group by cluster to inspect patterns\n",
1131
+ "clusters = df.groupby('cluster')['log_message'].apply(list)\n",
1132
+ "sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)"
1133
+ ]
1134
+ },
1135
+ {
1136
+ "cell_type": "code",
1137
+ "execution_count": 15,
1138
+ "id": "53dd43fd2cab0141",
1139
+ "metadata": {
1140
+ "ExecuteTime": {
1141
+ "end_time": "2025-01-15T20:27:58.467824Z",
1142
+ "start_time": "2025-01-15T20:27:58.449975Z"
1143
+ }
1144
+ },
1145
+ "outputs": [
1146
+ {
1147
+ "name": "stdout",
1148
+ "output_type": "stream",
1149
+ "text": [
1150
+ "Clustered Patterns:\n",
1151
+ "Cluster 0:\n",
1152
+ " nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" status: 200 len: 1893 time: 0.2675118\n",
1153
+ " nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" HTTP status code - 200 len: 211 time: 0.0968180\n",
1154
+ " nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" RCODE 200 len: 1874 time: 0.2280791\n",
1155
+ " nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" Return code: 200 len: 1874 time: 0.2131531\n",
1156
+ " nova.osapi_compute.wsgi.server [req-2bf7cfee-a236-42f3-8fb1-96fefab0b302 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 \"GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1\" RCODE 200 len: 1874 time: 0.1794369\n",
1157
+ "Cluster 5:\n",
1158
+ " nova.compute.claims [req-a07ac654-8e81-416d-bfbb-189116b07969 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: bf8c824d-f099-4433-a41e-e3da7578262e] Total memory: 64172 MB, used: 512.00 MB\n",
1159
+ " nova.compute.claims [req-d6986b54-3735-4a42-9074-0ba7d9717de9 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: af5f7392-f7d4-4298-b647-c98924c64aa1] memory limit: 96258.00 MB, free: 95746.00 MB\n",
1160
+ " nova.compute.claims [req-72b4858f-049e-49e1-b31e-b562c5018eaf 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 63a0d960-70b6-44c6-b606-491478a5cadf] disk limit not specified, defaulting to unlimited\n",
1161
+ " nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a5-7861d247cafa 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: d96a117b-0193-4549-bdcc-63b917273d1d] vcpu limit not specified, defaulting to unlimited\n",
1162
+ " nova.compute.claims [req-d38f479d-9bb9-4276-9688-52607e8fd350 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 95960536-049b-41f6-9049-05fc479b6a7c] disk limit not specified, defaulting to unlimited\n",
1163
+ "Cluster 11:\n",
1164
+ " User User685 logged out.\n",
1165
+ " User User395 logged in.\n",
1166
+ " User User225 logged in.\n",
1167
+ " User User494 logged out.\n",
1168
+ " User User900 logged in.\n",
1169
+ "Cluster 13:\n",
1170
+ " Backup started at 2025-05-14 07:06:55.\n",
1171
+ " Backup started at 2025-02-15 20:00:19.\n",
1172
+ " Backup ended at 2025-08-08 13:06:23.\n",
1173
+ " Backup started at 2025-11-14 08:27:43.\n",
1174
+ " Backup started at 2025-12-09 10:19:11.\n",
1175
+ "Cluster 7:\n",
1176
+ " Multiple bad login attempts detected on user 8538 account\n",
1177
+ " Multiple login failures occurred on user 9052 account\n",
1178
+ " User 7153 made multiple incorrect login attempts\n",
1179
+ " User 8300 made multiple incorrect login attempts\n",
1180
+ " Multiple login failures were detected for user 6373\n",
1181
+ "Cluster 8:\n",
1182
+ " Backup completed successfully.\n",
1183
+ " Backup completed successfully.\n",
1184
+ " Backup completed successfully.\n",
1185
+ " Backup completed successfully.\n",
1186
+ " Backup completed successfully.\n",
1187
+ "Cluster 21:\n",
1188
+ " System updated to version 3.9.1.\n",
1189
+ " System updated to version 5.5.4.\n",
1190
+ " System updated to version 4.7.4.\n",
1191
+ " System updated to version 3.7.7.\n",
1192
+ " System updated to version 2.6.2.\n",
1193
+ "Cluster 3:\n",
1194
+ " Shard 6 replication task ended in failure\n",
1195
+ " Data replication task for shard 14 did not complete\n",
1196
+ " Data replication task failed for shard 17\n",
1197
+ " Replication of data to shard 14 failed\n",
1198
+ " Data replication task for shard 6 did not complete\n",
1199
+ "Cluster 4:\n",
1200
+ " File data_6169.csv uploaded successfully by user User953.\n",
1201
+ " File data_3847.csv uploaded successfully by user User175.\n",
1202
+ " File data_7366.csv uploaded successfully by user User282.\n",
1203
+ " File data_1206.csv uploaded successfully by user User359.\n",
1204
+ " File data_1503.csv uploaded successfully by user User151.\n",
1205
+ "Cluster 17:\n",
1206
+ " Denied access attempt on restricted account Account2682\n",
1207
+ " Unauthorized login attempt on protected account Account5030\n",
1208
+ " Account Account9437 blocked due to failed login\n",
1209
+ " Account Account7999 access denied due to login failure\n",
1210
+ " Invalid login attempt made on secured account Account7864\n",
1211
+ "Cluster 32:\n",
1212
+ " Disk cleanup completed successfully.\n",
1213
+ " Disk cleanup completed successfully.\n",
1214
+ " Disk cleanup completed successfully.\n",
1215
+ " Disk cleanup completed successfully.\n",
1216
+ " Disk cleanup completed successfully.\n",
1217
+ "Cluster 6:\n",
1218
+ " Critical system unit error: unit ID Component55\n",
1219
+ " System component malfunction: component ID Component79\n",
1220
+ " Critical system element is down: element ID Component96\n",
1221
+ " Essential system part malfunction: part ID Component6\n",
1222
+ " Failure occurred in critical system component: component ID Component92\n",
1223
+ "Cluster 16:\n",
1224
+ " System reboot initiated by user User243.\n",
1225
+ " System reboot initiated by user User471.\n",
1226
+ " System reboot initiated by user User216.\n",
1227
+ " System reboot initiated by user User639.\n",
1228
+ " System reboot initiated by user User819.\n",
1229
+ "Cluster 20:\n",
1230
+ " User 7662 tried to bypass API security measures\n",
1231
+ " User 2367 failed to provide valid API access credentials\n",
1232
+ " User 3569 made an unauthorized API request\n",
1233
+ " Unauthorized user 2968 tried to access restricted API\n",
1234
+ " User 2186 attempted to access API without proper authorization\n",
1235
+ "Cluster 9:\n",
1236
+ " Account with ID 5351 created by User634.\n",
1237
+ " Account with ID 7813 created by User373.\n",
1238
+ " Account with ID 9827 created by User965.\n",
1239
+ " Account with ID 2520 created by User546.\n",
1240
+ " Account with ID 2300 created by User964.\n",
1241
+ "Cluster 1:\n",
1242
+ " Email service experiencing issues with sending\n",
1243
+ " Email server encountered a sending fault\n",
1244
+ " Mail service encountered a delivery glitch\n",
1245
+ " Service disruption caused by email sending error\n",
1246
+ " Email system had a problem sending emails\n",
1247
+ "Cluster 10:\n",
1248
+ " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=512MB phys_disk=15GB used_disk=0GB total_vcpus=16 used_vcpus=0 pci_stats=[]\n",
1249
+ " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]\n",
1250
+ " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Total usable vcpus: 16, total allocated vcpus: 0\n",
1251
+ " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]\n",
1252
+ " nova.compute.resource_tracker [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] Final resource view: name=cp-1.slowvm1.tcloud-pg0.utah.cloudlab.us phys_ram=64172MB used_ram=2560MB phys_disk=15GB used_disk=20GB total_vcpus=16 used_vcpus=1 pci_stats=[]\n",
1253
+ "Cluster 34:\n",
1254
+ " Abnormal behavior found on server 10, possible security threat\n",
1255
+ " Security alert: suspicious activity on server 1\n",
1256
+ " Anomalous activity identified on server 23, security review recommended\n",
1257
+ " Server 27 experienced potential security incident, review required\n",
1258
+ " Server 36 experienced potential security incident, review required\n",
1259
+ "Cluster 14:\n",
1260
+ " Detection of multiple disk faults in RAID setup\n",
1261
+ " RAID array suffered multiple hard drive failures\n",
1262
+ " RAID array experienced multiple disk crashes\n",
1263
+ " Multiple hard drive issues in RAID configuration found\n",
1264
+ " Identification of multiple faulty disks in RAID array\n",
1265
+ "Cluster 53:\n",
1266
+ " Module X experienced an invalid data format issue\n",
1267
+ " Input format mismatch occurred in module X\n",
1268
+ " Module X reported an error in input format validation\n",
1269
+ " Module X failed to process input due to formatting error\n",
1270
+ " Input data format in module X was invalid or corrupted\n",
1271
+ "Cluster 52:\n",
1272
+ " Service health check was not successful because of SSL certificate validation failures.\n",
1273
+ " Invalid SSL certificate resulted in a failed service health check.\n",
1274
+ " Service health check failure was due to an invalid or improperly configured SSL certificate.\n",
1275
+ " An issue with the SSL certificate caused the service health check to fail.\n",
1276
+ " Service health check failure was caused by an expired SSL certificate.\n",
1277
+ "Cluster 18:\n",
1278
+ " Boot process terminated unexpectedly due to kernel issue\n",
1279
+ " System encountered kernel panic during initialization phase\n",
1280
+ " Boot process was stopped by kernel malfunction\n",
1281
+ " System encountered kernel failure during bootup sequence\n",
1282
+ " Boot sequence failed due to kernel panic\n",
1283
+ "Cluster 25:\n",
1284
+ " System configuration is no longer valid\n",
1285
+ " Configuration is corrupted throughout the system\n",
1286
+ " Cross-system configuration failure occurred\n",
1287
+ " System configuration is experiencing errors\n",
1288
+ " Configuration malfunction is system-wide\n",
1289
+ "Cluster 42:\n",
1290
+ " User 5127 has escalated admin privileges without authorization\n",
1291
+ " User 9745 has escalated to admin level\n",
1292
+ " User 8483 escalated privileges to admin level\n",
1293
+ " User 1987 has escalated to admin level\n",
1294
+ " User 8395 escalated privileges to admin level\n",
1295
+ "Cluster 59:\n",
1296
+ " Potential security threat: Admin privilege escalation for user 5130\n",
1297
+ " Admin privilege escalation alert for user 2893\n",
1298
+ " Admin privilege escalation alert for user 8532\n",
1299
+ " Potential security threat: Admin privilege escalation for user 1554\n",
1300
+ " Warning: Potential admin privilege escalation for user 5720\n",
1301
+ "Cluster 26:\n",
1302
+ " Privilege elevation detected for user 5038\n",
1303
+ " Elevation of admin privileges detected for user 6137\n",
1304
+ " Elevation of admin privileges detected for user 4907\n",
1305
+ " User 6069 has been granted elevated admin privileges\n",
1306
+ " Admin privilege elevation warning for user 7574\n"
1307
+ ]
1308
+ }
1309
+ ],
1310
+ "source": [
1311
+ "print(\"Clustered Patterns:\")\n",
1312
+ "for cluster_id, messages in sorted_clusters.items():\n",
1313
+ " if len(messages) > 10:\n",
1314
+ " print(f\"Cluster {cluster_id}:\")\n",
1315
+ " for msg in messages[:5]:\n",
1316
+ " print(f\" {msg}\")"
1317
+ ]
1318
+ },
1319
+ {
1320
+ "cell_type": "markdown",
1321
+ "id": "bbec6795396f2d6b",
1322
+ "metadata": {},
1323
+ "source": [
1324
+ "### Classification Stage 1: Regex"
1325
+ ]
1326
+ },
1327
+ {
1328
+ "cell_type": "code",
1329
+ "execution_count": 16,
1330
+ "id": "ca32020e4fdb8f40",
1331
+ "metadata": {
1332
+ "ExecuteTime": {
1333
+ "end_time": "2025-01-15T20:27:58.549493Z",
1334
+ "start_time": "2025-01-15T20:27:58.529458Z"
1335
+ }
1336
+ },
1337
+ "outputs": [],
1338
+ "source": [
1339
+ "import re\n",
1340
+ "def classify_with_regex(log_message):\n",
1341
+ " regex_patterns = {\n",
1342
+ " r\"User User\\d+ logged (in|out).\": \"User Action\",\n",
1343
+ " r\"Backup (started|ended) at .*\": \"System Notification\",\n",
1344
+ " r\"Backup completed successfully.\": \"System Notification\",\n",
1345
+ " r\"System updated to version .*\": \"System Notification\",\n",
1346
+ " r\"File .* uploaded successfully by user .*\": \"System Notification\",\n",
1347
+ " r\"Disk cleanup completed successfully.\": \"System Notification\",\n",
1348
+ " r\"System reboot initiated by user .*\": \"System Notification\",\n",
1349
+ " r\"Account with ID .* created by .*\": \"User Action\"\n",
1350
+ " }\n",
1351
+ " for pattern, label in regex_patterns.items():\n",
1352
+ " if re.search(pattern, log_message):\n",
1353
+ " return label\n",
1354
+ " return None"
1355
+ ]
1356
+ },
1357
+ {
1358
+ "cell_type": "code",
1359
+ "execution_count": 17,
1360
+ "id": "4d9645ec6812da4a",
1361
+ "metadata": {
1362
+ "ExecuteTime": {
1363
+ "end_time": "2025-01-15T20:27:58.589510Z",
1364
+ "start_time": "2025-01-15T20:27:58.579485Z"
1365
+ }
1366
+ },
1367
+ "outputs": [
1368
+ {
1369
+ "data": {
1370
+ "text/plain": [
1371
+ "'User Action'"
1372
+ ]
1373
+ },
1374
+ "execution_count": 17,
1375
+ "metadata": {},
1376
+ "output_type": "execute_result"
1377
+ }
1378
+ ],
1379
+ "source": [
1380
+ "classify_with_regex(\"User User123 logged in.\")"
1381
+ ]
1382
+ },
1383
+ {
1384
+ "cell_type": "code",
1385
+ "execution_count": 18,
1386
+ "id": "6b3b838a2d270190",
1387
+ "metadata": {
1388
+ "ExecuteTime": {
1389
+ "end_time": "2025-01-15T20:27:58.636755Z",
1390
+ "start_time": "2025-01-15T20:27:58.624648Z"
1391
+ }
1392
+ },
1393
+ "outputs": [
1394
+ {
1395
+ "data": {
1396
+ "text/plain": [
1397
+ "'System Notification'"
1398
+ ]
1399
+ },
1400
+ "execution_count": 18,
1401
+ "metadata": {},
1402
+ "output_type": "execute_result"
1403
+ }
1404
+ ],
1405
+ "source": [
1406
+ "classify_with_regex(\"System reboot initiated by user User179.\")"
1407
+ ]
1408
+ },
1409
+ {
1410
+ "cell_type": "code",
1411
+ "execution_count": 19,
1412
+ "id": "99bcfd70b451835c",
1413
+ "metadata": {
1414
+ "ExecuteTime": {
1415
+ "end_time": "2025-01-15T20:27:58.682792Z",
1416
+ "start_time": "2025-01-15T20:27:58.672822Z"
1417
+ }
1418
+ },
1419
+ "outputs": [],
1420
+ "source": [
1421
+ "classify_with_regex(\"Hey you, chill bro\")"
1422
+ ]
1423
+ },
1424
+ {
1425
+ "cell_type": "code",
1426
+ "execution_count": 20,
1427
+ "id": "22619eedaa15acc3",
1428
+ "metadata": {
1429
+ "ExecuteTime": {
1430
+ "end_time": "2025-01-15T20:27:58.760034Z",
1431
+ "start_time": "2025-01-15T20:27:58.731326Z"
1432
+ }
1433
+ },
1434
+ "outputs": [
1435
+ {
1436
+ "data": {
1437
+ "text/html": [
1438
+ "<div>\n",
1439
+ "<style scoped>\n",
1440
+ " .dataframe tbody tr th:only-of-type {\n",
1441
+ " vertical-align: middle;\n",
1442
+ " }\n",
1443
+ "\n",
1444
+ " .dataframe tbody tr th {\n",
1445
+ " vertical-align: top;\n",
1446
+ " }\n",
1447
+ "\n",
1448
+ " .dataframe thead th {\n",
1449
+ " text-align: right;\n",
1450
+ " }\n",
1451
+ "</style>\n",
1452
+ "<table border=\"1\" class=\"dataframe\">\n",
1453
+ " <thead>\n",
1454
+ " <tr style=\"text-align: right;\">\n",
1455
+ " <th></th>\n",
1456
+ " <th>timestamp</th>\n",
1457
+ " <th>source</th>\n",
1458
+ " <th>log_message</th>\n",
1459
+ " <th>target_label</th>\n",
1460
+ " <th>complexity</th>\n",
1461
+ " <th>cluster</th>\n",
1462
+ " <th>regex_label</th>\n",
1463
+ " </tr>\n",
1464
+ " </thead>\n",
1465
+ " <tbody>\n",
1466
+ " <tr>\n",
1467
+ " <th>7</th>\n",
1468
+ " <td>10/11/2025 8:44</td>\n",
1469
+ " <td>ModernHR</td>\n",
1470
+ " <td>File data_6169.csv uploaded successfully by us...</td>\n",
1471
+ " <td>System Notification</td>\n",
1472
+ " <td>regex</td>\n",
1473
+ " <td>4</td>\n",
1474
+ " <td>System Notification</td>\n",
1475
+ " </tr>\n",
1476
+ " <tr>\n",
1477
+ " <th>14</th>\n",
1478
+ " <td>1/4/2025 1:43</td>\n",
1479
+ " <td>ThirdPartyAPI</td>\n",
1480
+ " <td>File data_3847.csv uploaded successfully by us...</td>\n",
1481
+ " <td>System Notification</td>\n",
1482
+ " <td>regex</td>\n",
1483
+ " <td>4</td>\n",
1484
+ " <td>System Notification</td>\n",
1485
+ " </tr>\n",
1486
+ " <tr>\n",
1487
+ " <th>15</th>\n",
1488
+ " <td>5/1/2025 9:41</td>\n",
1489
+ " <td>ModernCRM</td>\n",
1490
+ " <td>Backup completed successfully.</td>\n",
1491
+ " <td>System Notification</td>\n",
1492
+ " <td>regex</td>\n",
1493
+ " <td>8</td>\n",
1494
+ " <td>System Notification</td>\n",
1495
+ " </tr>\n",
1496
+ " <tr>\n",
1497
+ " <th>18</th>\n",
1498
+ " <td>2/22/2025 17:49</td>\n",
1499
+ " <td>ModernCRM</td>\n",
1500
+ " <td>Account with ID 5351 created by User634.</td>\n",
1501
+ " <td>User Action</td>\n",
1502
+ " <td>regex</td>\n",
1503
+ " <td>9</td>\n",
1504
+ " <td>User Action</td>\n",
1505
+ " </tr>\n",
1506
+ " <tr>\n",
1507
+ " <th>27</th>\n",
1508
+ " <td>9/24/2025 19:57</td>\n",
1509
+ " <td>ThirdPartyAPI</td>\n",
1510
+ " <td>User User685 logged out.</td>\n",
1511
+ " <td>User Action</td>\n",
1512
+ " <td>regex</td>\n",
1513
+ " <td>11</td>\n",
1514
+ " <td>User Action</td>\n",
1515
+ " </tr>\n",
1516
+ " <tr>\n",
1517
+ " <th>...</th>\n",
1518
+ " <td>...</td>\n",
1519
+ " <td>...</td>\n",
1520
+ " <td>...</td>\n",
1521
+ " <td>...</td>\n",
1522
+ " <td>...</td>\n",
1523
+ " <td>...</td>\n",
1524
+ " <td>...</td>\n",
1525
+ " </tr>\n",
1526
+ " <tr>\n",
1527
+ " <th>2376</th>\n",
1528
+ " <td>6/27/2025 8:47</td>\n",
1529
+ " <td>ModernCRM</td>\n",
1530
+ " <td>System updated to version 2.0.5.</td>\n",
1531
+ " <td>System Notification</td>\n",
1532
+ " <td>regex</td>\n",
1533
+ " <td>21</td>\n",
1534
+ " <td>System Notification</td>\n",
1535
+ " </tr>\n",
1536
+ " <tr>\n",
1537
+ " <th>2381</th>\n",
1538
+ " <td>9/5/2025 6:39</td>\n",
1539
+ " <td>ThirdPartyAPI</td>\n",
1540
+ " <td>Disk cleanup completed successfully.</td>\n",
1541
+ " <td>System Notification</td>\n",
1542
+ " <td>regex</td>\n",
1543
+ " <td>32</td>\n",
1544
+ " <td>System Notification</td>\n",
1545
+ " </tr>\n",
1546
+ " <tr>\n",
1547
+ " <th>2394</th>\n",
1548
+ " <td>4/3/2025 13:13</td>\n",
1549
+ " <td>ModernHR</td>\n",
1550
+ " <td>Disk cleanup completed successfully.</td>\n",
1551
+ " <td>System Notification</td>\n",
1552
+ " <td>regex</td>\n",
1553
+ " <td>32</td>\n",
1554
+ " <td>System Notification</td>\n",
1555
+ " </tr>\n",
1556
+ " <tr>\n",
1557
+ " <th>2395</th>\n",
1558
+ " <td>5/2/2025 14:29</td>\n",
1559
+ " <td>ThirdPartyAPI</td>\n",
1560
+ " <td>Backup ended at 2025-05-06 11:23:16.</td>\n",
1561
+ " <td>System Notification</td>\n",
1562
+ " <td>regex</td>\n",
1563
+ " <td>13</td>\n",
1564
+ " <td>System Notification</td>\n",
1565
+ " </tr>\n",
1566
+ " <tr>\n",
1567
+ " <th>2403</th>\n",
1568
+ " <td>10/1/2025 1:31</td>\n",
1569
+ " <td>ModernCRM</td>\n",
1570
+ " <td>Backup completed successfully.</td>\n",
1571
+ " <td>System Notification</td>\n",
1572
+ " <td>regex</td>\n",
1573
+ " <td>8</td>\n",
1574
+ " <td>System Notification</td>\n",
1575
+ " </tr>\n",
1576
+ " </tbody>\n",
1577
+ "</table>\n",
1578
+ "<p>500 rows × 7 columns</p>\n",
1579
+ "</div>"
1580
+ ],
1581
+ "text/plain": [
1582
+ " timestamp source \\\n",
1583
+ "7 10/11/2025 8:44 ModernHR \n",
1584
+ "14 1/4/2025 1:43 ThirdPartyAPI \n",
1585
+ "15 5/1/2025 9:41 ModernCRM \n",
1586
+ "18 2/22/2025 17:49 ModernCRM \n",
1587
+ "27 9/24/2025 19:57 ThirdPartyAPI \n",
1588
+ "... ... ... \n",
1589
+ "2376 6/27/2025 8:47 ModernCRM \n",
1590
+ "2381 9/5/2025 6:39 ThirdPartyAPI \n",
1591
+ "2394 4/3/2025 13:13 ModernHR \n",
1592
+ "2395 5/2/2025 14:29 ThirdPartyAPI \n",
1593
+ "2403 10/1/2025 1:31 ModernCRM \n",
1594
+ "\n",
1595
+ " log_message target_label \\\n",
1596
+ "7 File data_6169.csv uploaded successfully by us... System Notification \n",
1597
+ "14 File data_3847.csv uploaded successfully by us... System Notification \n",
1598
+ "15 Backup completed successfully. System Notification \n",
1599
+ "18 Account with ID 5351 created by User634. User Action \n",
1600
+ "27 User User685 logged out. User Action \n",
1601
+ "... ... ... \n",
1602
+ "2376 System updated to version 2.0.5. System Notification \n",
1603
+ "2381 Disk cleanup completed successfully. System Notification \n",
1604
+ "2394 Disk cleanup completed successfully. System Notification \n",
1605
+ "2395 Backup ended at 2025-05-06 11:23:16. System Notification \n",
1606
+ "2403 Backup completed successfully. System Notification \n",
1607
+ "\n",
1608
+ " complexity cluster regex_label \n",
1609
+ "7 regex 4 System Notification \n",
1610
+ "14 regex 4 System Notification \n",
1611
+ "15 regex 8 System Notification \n",
1612
+ "18 regex 9 User Action \n",
1613
+ "27 regex 11 User Action \n",
1614
+ "... ... ... ... \n",
1615
+ "2376 regex 21 System Notification \n",
1616
+ "2381 regex 32 System Notification \n",
1617
+ "2394 regex 32 System Notification \n",
1618
+ "2395 regex 13 System Notification \n",
1619
+ "2403 regex 8 System Notification \n",
1620
+ "\n",
1621
+ "[500 rows x 7 columns]"
1622
+ ]
1623
+ },
1624
+ "execution_count": 20,
1625
+ "metadata": {},
1626
+ "output_type": "execute_result"
1627
+ }
1628
+ ],
1629
+ "source": [
1630
+ "# Apply regex classification\n",
1631
+ "df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))\n",
1632
+ "df[df['regex_label'].notnull()]"
1633
+ ]
1634
+ },
1635
+ {
1636
+ "cell_type": "code",
1637
+ "execution_count": 21,
1638
+ "id": "e12979fc8238277f",
1639
+ "metadata": {
1640
+ "ExecuteTime": {
1641
+ "end_time": "2025-01-15T20:27:58.806198Z",
1642
+ "start_time": "2025-01-15T20:27:58.792662Z"
1643
+ }
1644
+ },
1645
+ "outputs": [
1646
+ {
1647
+ "data": {
1648
+ "text/html": [
1649
+ "<div>\n",
1650
+ "<style scoped>\n",
1651
+ " .dataframe tbody tr th:only-of-type {\n",
1652
+ " vertical-align: middle;\n",
1653
+ " }\n",
1654
+ "\n",
1655
+ " .dataframe tbody tr th {\n",
1656
+ " vertical-align: top;\n",
1657
+ " }\n",
1658
+ "\n",
1659
+ " .dataframe thead th {\n",
1660
+ " text-align: right;\n",
1661
+ " }\n",
1662
+ "</style>\n",
1663
+ "<table border=\"1\" class=\"dataframe\">\n",
1664
+ " <thead>\n",
1665
+ " <tr style=\"text-align: right;\">\n",
1666
+ " <th></th>\n",
1667
+ " <th>timestamp</th>\n",
1668
+ " <th>source</th>\n",
1669
+ " <th>log_message</th>\n",
1670
+ " <th>target_label</th>\n",
1671
+ " <th>complexity</th>\n",
1672
+ " <th>cluster</th>\n",
1673
+ " <th>regex_label</th>\n",
1674
+ " </tr>\n",
1675
+ " </thead>\n",
1676
+ " <tbody>\n",
1677
+ " <tr>\n",
1678
+ " <th>0</th>\n",
1679
+ " <td>2025-06-27 07:20:25</td>\n",
1680
+ " <td>ModernCRM</td>\n",
1681
+ " <td>nova.osapi_compute.wsgi.server [req-b9718cd8-f...</td>\n",
1682
+ " <td>HTTP Status</td>\n",
1683
+ " <td>bert</td>\n",
1684
+ " <td>0</td>\n",
1685
+ " <td>None</td>\n",
1686
+ " </tr>\n",
1687
+ " <tr>\n",
1688
+ " <th>1</th>\n",
1689
+ " <td>1/14/2025 23:07</td>\n",
1690
+ " <td>ModernCRM</td>\n",
1691
+ " <td>Email service experiencing issues with sending</td>\n",
1692
+ " <td>Critical Error</td>\n",
1693
+ " <td>bert</td>\n",
1694
+ " <td>1</td>\n",
1695
+ " <td>None</td>\n",
1696
+ " </tr>\n",
1697
+ " <tr>\n",
1698
+ " <th>2</th>\n",
1699
+ " <td>1/17/2025 1:29</td>\n",
1700
+ " <td>AnalyticsEngine</td>\n",
1701
+ " <td>Unauthorized access to data was attempted</td>\n",
1702
+ " <td>Security Alert</td>\n",
1703
+ " <td>bert</td>\n",
1704
+ " <td>2</td>\n",
1705
+ " <td>None</td>\n",
1706
+ " </tr>\n",
1707
+ " <tr>\n",
1708
+ " <th>3</th>\n",
1709
+ " <td>2025-07-12 00:24:16</td>\n",
1710
+ " <td>ModernHR</td>\n",
1711
+ " <td>nova.osapi_compute.wsgi.server [req-4895c258-b...</td>\n",
1712
+ " <td>HTTP Status</td>\n",
1713
+ " <td>bert</td>\n",
1714
+ " <td>0</td>\n",
1715
+ " <td>None</td>\n",
1716
+ " </tr>\n",
1717
+ " <tr>\n",
1718
+ " <th>4</th>\n",
1719
+ " <td>2025-06-02 18:25:23</td>\n",
1720
+ " <td>BillingSystem</td>\n",
1721
+ " <td>nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...</td>\n",
1722
+ " <td>HTTP Status</td>\n",
1723
+ " <td>bert</td>\n",
1724
+ " <td>0</td>\n",
1725
+ " <td>None</td>\n",
1726
+ " </tr>\n",
1727
+ " </tbody>\n",
1728
+ "</table>\n",
1729
+ "</div>"
1730
+ ],
1731
+ "text/plain": [
1732
+ " timestamp source \\\n",
1733
+ "0 2025-06-27 07:20:25 ModernCRM \n",
1734
+ "1 1/14/2025 23:07 ModernCRM \n",
1735
+ "2 1/17/2025 1:29 AnalyticsEngine \n",
1736
+ "3 2025-07-12 00:24:16 ModernHR \n",
1737
+ "4 2025-06-02 18:25:23 BillingSystem \n",
1738
+ "\n",
1739
+ " log_message target_label \\\n",
1740
+ "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n",
1741
+ "1 Email service experiencing issues with sending Critical Error \n",
1742
+ "2 Unauthorized access to data was attempted Security Alert \n",
1743
+ "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n",
1744
+ "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n",
1745
+ "\n",
1746
+ " complexity cluster regex_label \n",
1747
+ "0 bert 0 None \n",
1748
+ "1 bert 1 None \n",
1749
+ "2 bert 2 None \n",
1750
+ "3 bert 0 None \n",
1751
+ "4 bert 0 None "
1752
+ ]
1753
+ },
1754
+ "execution_count": 21,
1755
+ "metadata": {},
1756
+ "output_type": "execute_result"
1757
+ }
1758
+ ],
1759
+ "source": [
1760
+ "df[df['regex_label'].isnull()].head(5)"
1761
+ ]
1762
+ },
1763
+ {
1764
+ "cell_type": "markdown",
1765
+ "id": "b58274a035c82628",
1766
+ "metadata": {},
1767
+ "source": [
1768
+ "### Classification Stage 2: Classification Using Embeddings"
1769
+ ]
1770
+ },
1771
+ {
1772
+ "cell_type": "code",
1773
+ "execution_count": 22,
1774
+ "id": "7c21958116c1429b",
1775
+ "metadata": {
1776
+ "ExecuteTime": {
1777
+ "end_time": "2025-01-15T20:29:48.629503Z",
1778
+ "start_time": "2025-01-15T20:29:48.598340Z"
1779
+ }
1780
+ },
1781
+ "outputs": [
1782
+ {
1783
+ "data": {
1784
+ "text/plain": [
1785
+ "(1910, 7)"
1786
+ ]
1787
+ },
1788
+ "execution_count": 22,
1789
+ "metadata": {},
1790
+ "output_type": "execute_result"
1791
+ }
1792
+ ],
1793
+ "source": [
1794
+ "df_non_regex = df[df['regex_label'].isnull()].copy()\n",
1795
+ "df_non_regex.shape"
1796
+ ]
1797
+ },
1798
+ {
1799
+ "cell_type": "code",
1800
+ "execution_count": 23,
1801
+ "id": "b340b51441a741a8",
1802
+ "metadata": {
1803
+ "ExecuteTime": {
1804
+ "end_time": "2025-01-15T20:30:04.093929Z",
1805
+ "start_time": "2025-01-15T20:30:04.062728Z"
1806
+ }
1807
+ },
1808
+ "outputs": [
1809
+ {
1810
+ "data": {
1811
+ "text/html": [
1812
+ "<div>\n",
1813
+ "<style scoped>\n",
1814
+ " .dataframe tbody tr th:only-of-type {\n",
1815
+ " vertical-align: middle;\n",
1816
+ " }\n",
1817
+ "\n",
1818
+ " .dataframe tbody tr th {\n",
1819
+ " vertical-align: top;\n",
1820
+ " }\n",
1821
+ "\n",
1822
+ " .dataframe thead th {\n",
1823
+ " text-align: right;\n",
1824
+ " }\n",
1825
+ "</style>\n",
1826
+ "<table border=\"1\" class=\"dataframe\">\n",
1827
+ " <thead>\n",
1828
+ " <tr style=\"text-align: right;\">\n",
1829
+ " <th></th>\n",
1830
+ " <th>timestamp</th>\n",
1831
+ " <th>source</th>\n",
1832
+ " <th>log_message</th>\n",
1833
+ " <th>target_label</th>\n",
1834
+ " <th>complexity</th>\n",
1835
+ " <th>cluster</th>\n",
1836
+ " <th>regex_label</th>\n",
1837
+ " </tr>\n",
1838
+ " </thead>\n",
1839
+ " <tbody>\n",
1840
+ " <tr>\n",
1841
+ " <th>60</th>\n",
1842
+ " <td>2025-10-06 16:55:23</td>\n",
1843
+ " <td>LegacyCRM</td>\n",
1844
+ " <td>Lead conversion failed for prospect ID 7842 du...</td>\n",
1845
+ " <td>Workflow Error</td>\n",
1846
+ " <td>llm</td>\n",
1847
+ " <td>24</td>\n",
1848
+ " <td>None</td>\n",
1849
+ " </tr>\n",
1850
+ " <tr>\n",
1851
+ " <th>255</th>\n",
1852
+ " <td>2025-05-03 16:55:35</td>\n",
1853
+ " <td>LegacyCRM</td>\n",
1854
+ " <td>API endpoint 'getCustomerDetails' is deprecate...</td>\n",
1855
+ " <td>Deprecation Warning</td>\n",
1856
+ " <td>llm</td>\n",
1857
+ " <td>48</td>\n",
1858
+ " <td>None</td>\n",
1859
+ " </tr>\n",
1860
+ " <tr>\n",
1861
+ " <th>377</th>\n",
1862
+ " <td>2025-06-24 12:16:29</td>\n",
1863
+ " <td>LegacyCRM</td>\n",
1864
+ " <td>Customer follow-up process for lead ID 5621 fa...</td>\n",
1865
+ " <td>Workflow Error</td>\n",
1866
+ " <td>llm</td>\n",
1867
+ " <td>62</td>\n",
1868
+ " <td>None</td>\n",
1869
+ " </tr>\n",
1870
+ " <tr>\n",
1871
+ " <th>1325</th>\n",
1872
+ " <td>2025-04-17 07:33:44</td>\n",
1873
+ " <td>LegacyCRM</td>\n",
1874
+ " <td>Escalation rule execution failed for ticket ID...</td>\n",
1875
+ " <td>Workflow Error</td>\n",
1876
+ " <td>llm</td>\n",
1877
+ " <td>105</td>\n",
1878
+ " <td>None</td>\n",
1879
+ " </tr>\n",
1880
+ " <tr>\n",
1881
+ " <th>1734</th>\n",
1882
+ " <td>2025-04-30 07:47:30</td>\n",
1883
+ " <td>LegacyCRM</td>\n",
1884
+ " <td>The 'ExportToCSV' feature is outdated. Please ...</td>\n",
1885
+ " <td>Deprecation Warning</td>\n",
1886
+ " <td>llm</td>\n",
1887
+ " <td>118</td>\n",
1888
+ " <td>None</td>\n",
1889
+ " </tr>\n",
1890
+ " <tr>\n",
1891
+ " <th>1826</th>\n",
1892
+ " <td>2025-01-23 10:33:36</td>\n",
1893
+ " <td>LegacyCRM</td>\n",
1894
+ " <td>Support for legacy authentication methods will...</td>\n",
1895
+ " <td>Deprecation Warning</td>\n",
1896
+ " <td>llm</td>\n",
1897
+ " <td>122</td>\n",
1898
+ " <td>None</td>\n",
1899
+ " </tr>\n",
1900
+ " <tr>\n",
1901
+ " <th>2217</th>\n",
1902
+ " <td>2025-05-12 09:46:54</td>\n",
1903
+ " <td>LegacyCRM</td>\n",
1904
+ " <td>Task assignment for TeamID 3425 could not comp...</td>\n",
1905
+ " <td>Workflow Error</td>\n",
1906
+ " <td>llm</td>\n",
1907
+ " <td>133</td>\n",
1908
+ " <td>None</td>\n",
1909
+ " </tr>\n",
1910
+ " </tbody>\n",
1911
+ "</table>\n",
1912
+ "</div>"
1913
+ ],
1914
+ "text/plain": [
1915
+ " timestamp source \\\n",
1916
+ "60 2025-10-06 16:55:23 LegacyCRM \n",
1917
+ "255 2025-05-03 16:55:35 LegacyCRM \n",
1918
+ "377 2025-06-24 12:16:29 LegacyCRM \n",
1919
+ "1325 2025-04-17 07:33:44 LegacyCRM \n",
1920
+ "1734 2025-04-30 07:47:30 LegacyCRM \n",
1921
+ "1826 2025-01-23 10:33:36 LegacyCRM \n",
1922
+ "2217 2025-05-12 09:46:54 LegacyCRM \n",
1923
+ "\n",
1924
+ " log_message target_label \\\n",
1925
+ "60 Lead conversion failed for prospect ID 7842 du... Workflow Error \n",
1926
+ "255 API endpoint 'getCustomerDetails' is deprecate... Deprecation Warning \n",
1927
+ "377 Customer follow-up process for lead ID 5621 fa... Workflow Error \n",
1928
+ "1325 Escalation rule execution failed for ticket ID... Workflow Error \n",
1929
+ "1734 The 'ExportToCSV' feature is outdated. Please ... Deprecation Warning \n",
1930
+ "1826 Support for legacy authentication methods will... Deprecation Warning \n",
1931
+ "2217 Task assignment for TeamID 3425 could not comp... Workflow Error \n",
1932
+ "\n",
1933
+ " complexity cluster regex_label \n",
1934
+ "60 llm 24 None \n",
1935
+ "255 llm 48 None \n",
1936
+ "377 llm 62 None \n",
1937
+ "1325 llm 105 None \n",
1938
+ "1734 llm 118 None \n",
1939
+ "1826 llm 122 None \n",
1940
+ "2217 llm 133 None "
1941
+ ]
1942
+ },
1943
+ "execution_count": 23,
1944
+ "metadata": {},
1945
+ "output_type": "execute_result"
1946
+ }
1947
+ ],
1948
+ "source": [
1949
+ "df_legacy = df_non_regex[df_non_regex.source==\"LegacyCRM\"]\n",
1950
+ "df_legacy"
1951
+ ]
1952
+ },
1953
+ {
1954
+ "cell_type": "code",
1955
+ "execution_count": 24,
1956
+ "id": "8d8f7e5902aca5f8",
1957
+ "metadata": {
1958
+ "ExecuteTime": {
1959
+ "end_time": "2025-01-15T20:30:48.679137Z",
1960
+ "start_time": "2025-01-15T20:30:48.647857Z"
1961
+ }
1962
+ },
1963
+ "outputs": [
1964
+ {
1965
+ "data": {
1966
+ "text/html": [
1967
+ "<div>\n",
1968
+ "<style scoped>\n",
1969
+ " .dataframe tbody tr th:only-of-type {\n",
1970
+ " vertical-align: middle;\n",
1971
+ " }\n",
1972
+ "\n",
1973
+ " .dataframe tbody tr th {\n",
1974
+ " vertical-align: top;\n",
1975
+ " }\n",
1976
+ "\n",
1977
+ " .dataframe thead th {\n",
1978
+ " text-align: right;\n",
1979
+ " }\n",
1980
+ "</style>\n",
1981
+ "<table border=\"1\" class=\"dataframe\">\n",
1982
+ " <thead>\n",
1983
+ " <tr style=\"text-align: right;\">\n",
1984
+ " <th></th>\n",
1985
+ " <th>timestamp</th>\n",
1986
+ " <th>source</th>\n",
1987
+ " <th>log_message</th>\n",
1988
+ " <th>target_label</th>\n",
1989
+ " <th>complexity</th>\n",
1990
+ " <th>cluster</th>\n",
1991
+ " <th>regex_label</th>\n",
1992
+ " </tr>\n",
1993
+ " </thead>\n",
1994
+ " <tbody>\n",
1995
+ " <tr>\n",
1996
+ " <th>0</th>\n",
1997
+ " <td>2025-06-27 07:20:25</td>\n",
1998
+ " <td>ModernCRM</td>\n",
1999
+ " <td>nova.osapi_compute.wsgi.server [req-b9718cd8-f...</td>\n",
2000
+ " <td>HTTP Status</td>\n",
2001
+ " <td>bert</td>\n",
2002
+ " <td>0</td>\n",
2003
+ " <td>None</td>\n",
2004
+ " </tr>\n",
2005
+ " <tr>\n",
2006
+ " <th>1</th>\n",
2007
+ " <td>1/14/2025 23:07</td>\n",
2008
+ " <td>ModernCRM</td>\n",
2009
+ " <td>Email service experiencing issues with sending</td>\n",
2010
+ " <td>Critical Error</td>\n",
2011
+ " <td>bert</td>\n",
2012
+ " <td>1</td>\n",
2013
+ " <td>None</td>\n",
2014
+ " </tr>\n",
2015
+ " <tr>\n",
2016
+ " <th>2</th>\n",
2017
+ " <td>1/17/2025 1:29</td>\n",
2018
+ " <td>AnalyticsEngine</td>\n",
2019
+ " <td>Unauthorized access to data was attempted</td>\n",
2020
+ " <td>Security Alert</td>\n",
2021
+ " <td>bert</td>\n",
2022
+ " <td>2</td>\n",
2023
+ " <td>None</td>\n",
2024
+ " </tr>\n",
2025
+ " <tr>\n",
2026
+ " <th>3</th>\n",
2027
+ " <td>2025-07-12 00:24:16</td>\n",
2028
+ " <td>ModernHR</td>\n",
2029
+ " <td>nova.osapi_compute.wsgi.server [req-4895c258-b...</td>\n",
2030
+ " <td>HTTP Status</td>\n",
2031
+ " <td>bert</td>\n",
2032
+ " <td>0</td>\n",
2033
+ " <td>None</td>\n",
2034
+ " </tr>\n",
2035
+ " <tr>\n",
2036
+ " <th>4</th>\n",
2037
+ " <td>2025-06-02 18:25:23</td>\n",
2038
+ " <td>BillingSystem</td>\n",
2039
+ " <td>nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...</td>\n",
2040
+ " <td>HTTP Status</td>\n",
2041
+ " <td>bert</td>\n",
2042
+ " <td>0</td>\n",
2043
+ " <td>None</td>\n",
2044
+ " </tr>\n",
2045
+ " <tr>\n",
2046
+ " <th>...</th>\n",
2047
+ " <td>...</td>\n",
2048
+ " <td>...</td>\n",
2049
+ " <td>...</td>\n",
2050
+ " <td>...</td>\n",
2051
+ " <td>...</td>\n",
2052
+ " <td>...</td>\n",
2053
+ " <td>...</td>\n",
2054
+ " </tr>\n",
2055
+ " <tr>\n",
2056
+ " <th>2405</th>\n",
2057
+ " <td>2025-08-13 07:29:25</td>\n",
2058
+ " <td>ModernHR</td>\n",
2059
+ " <td>nova.osapi_compute.wsgi.server [req-96c3ec98-2...</td>\n",
2060
+ " <td>HTTP Status</td>\n",
2061
+ " <td>bert</td>\n",
2062
+ " <td>0</td>\n",
2063
+ " <td>None</td>\n",
2064
+ " </tr>\n",
2065
+ " <tr>\n",
2066
+ " <th>2406</th>\n",
2067
+ " <td>1/11/2025 5:32</td>\n",
2068
+ " <td>ModernHR</td>\n",
2069
+ " <td>User 3844 account experienced multiple failed ...</td>\n",
2070
+ " <td>Security Alert</td>\n",
2071
+ " <td>bert</td>\n",
2072
+ " <td>7</td>\n",
2073
+ " <td>None</td>\n",
2074
+ " </tr>\n",
2075
+ " <tr>\n",
2076
+ " <th>2407</th>\n",
2077
+ " <td>2025-08-03 03:07:47</td>\n",
2078
+ " <td>ThirdPartyAPI</td>\n",
2079
+ " <td>nova.metadata.wsgi.server [req-b6d4a270-accb-4...</td>\n",
2080
+ " <td>HTTP Status</td>\n",
2081
+ " <td>bert</td>\n",
2082
+ " <td>0</td>\n",
2083
+ " <td>None</td>\n",
2084
+ " </tr>\n",
2085
+ " <tr>\n",
2086
+ " <th>2408</th>\n",
2087
+ " <td>11/11/2025 11:52</td>\n",
2088
+ " <td>BillingSystem</td>\n",
2089
+ " <td>Email service affected by failed transmission</td>\n",
2090
+ " <td>Critical Error</td>\n",
2091
+ " <td>bert</td>\n",
2092
+ " <td>1</td>\n",
2093
+ " <td>None</td>\n",
2094
+ " </tr>\n",
2095
+ " <tr>\n",
2096
+ " <th>2409</th>\n",
2097
+ " <td>12/25/2025 13:21</td>\n",
2098
+ " <td>AnalyticsEngine</td>\n",
2099
+ " <td>Repeated failed login attempts occurred for us...</td>\n",
2100
+ " <td>Security Alert</td>\n",
2101
+ " <td>bert</td>\n",
2102
+ " <td>7</td>\n",
2103
+ " <td>None</td>\n",
2104
+ " </tr>\n",
2105
+ " </tbody>\n",
2106
+ "</table>\n",
2107
+ "<p>1903 rows × 7 columns</p>\n",
2108
+ "</div>"
2109
+ ],
2110
+ "text/plain": [
2111
+ " timestamp source \\\n",
2112
+ "0 2025-06-27 07:20:25 ModernCRM \n",
2113
+ "1 1/14/2025 23:07 ModernCRM \n",
2114
+ "2 1/17/2025 1:29 AnalyticsEngine \n",
2115
+ "3 2025-07-12 00:24:16 ModernHR \n",
2116
+ "4 2025-06-02 18:25:23 BillingSystem \n",
2117
+ "... ... ... \n",
2118
+ "2405 2025-08-13 07:29:25 ModernHR \n",
2119
+ "2406 1/11/2025 5:32 ModernHR \n",
2120
+ "2407 2025-08-03 03:07:47 ThirdPartyAPI \n",
2121
+ "2408 11/11/2025 11:52 BillingSystem \n",
2122
+ "2409 12/25/2025 13:21 AnalyticsEngine \n",
2123
+ "\n",
2124
+ " log_message target_label \\\n",
2125
+ "0 nova.osapi_compute.wsgi.server [req-b9718cd8-f... HTTP Status \n",
2126
+ "1 Email service experiencing issues with sending Critical Error \n",
2127
+ "2 Unauthorized access to data was attempted Security Alert \n",
2128
+ "3 nova.osapi_compute.wsgi.server [req-4895c258-b... HTTP Status \n",
2129
+ "4 nova.osapi_compute.wsgi.server [req-ee8bc8ba-9... HTTP Status \n",
2130
+ "... ... ... \n",
2131
+ "2405 nova.osapi_compute.wsgi.server [req-96c3ec98-2... HTTP Status \n",
2132
+ "2406 User 3844 account experienced multiple failed ... Security Alert \n",
2133
+ "2407 nova.metadata.wsgi.server [req-b6d4a270-accb-4... HTTP Status \n",
2134
+ "2408 Email service affected by failed transmission Critical Error \n",
2135
+ "2409 Repeated failed login attempts occurred for us... Security Alert \n",
2136
+ "\n",
2137
+ " complexity cluster regex_label \n",
2138
+ "0 bert 0 None \n",
2139
+ "1 bert 1 None \n",
2140
+ "2 bert 2 None \n",
2141
+ "3 bert 0 None \n",
2142
+ "4 bert 0 None \n",
2143
+ "... ... ... ... \n",
2144
+ "2405 bert 0 None \n",
2145
+ "2406 bert 7 None \n",
2146
+ "2407 bert 0 None \n",
2147
+ "2408 bert 1 None \n",
2148
+ "2409 bert 7 None \n",
2149
+ "\n",
2150
+ "[1903 rows x 7 columns]"
2151
+ ]
2152
+ },
2153
+ "execution_count": 24,
2154
+ "metadata": {},
2155
+ "output_type": "execute_result"
2156
+ }
2157
+ ],
2158
+ "source": [
2159
+ "df_non_legacy = df_non_regex[df_non_regex.source!=\"LegacyCRM\"]\n",
2160
+ "df_non_legacy"
2161
+ ]
2162
+ },
2163
+ {
2164
+ "cell_type": "code",
2165
+ "execution_count": 25,
2166
+ "id": "27295bd7ada09140",
2167
+ "metadata": {
2168
+ "ExecuteTime": {
2169
+ "end_time": "2025-01-15T20:30:59.173856Z",
2170
+ "start_time": "2025-01-15T20:30:59.142606Z"
2171
+ }
2172
+ },
2173
+ "outputs": [
2174
+ {
2175
+ "data": {
2176
+ "text/plain": [
2177
+ "(1903, 7)"
2178
+ ]
2179
+ },
2180
+ "execution_count": 25,
2181
+ "metadata": {},
2182
+ "output_type": "execute_result"
2183
+ }
2184
+ ],
2185
+ "source": [
2186
+ "df_non_legacy.shape"
2187
+ ]
2188
+ },
2189
+ {
2190
+ "cell_type": "code",
2191
+ "execution_count": 26,
2192
+ "id": "566831c64be8ed7",
2193
+ "metadata": {
2194
+ "ExecuteTime": {
2195
+ "end_time": "2025-01-15T20:31:10.610031Z",
2196
+ "start_time": "2025-01-15T20:31:07.235690Z"
2197
+ }
2198
+ },
2199
+ "outputs": [],
2200
+ "source": [
2201
+ "model = SentenceTransformer('all-MiniLM-L6-v2') # Lightweight embedding model\n",
2202
+ "embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())"
2203
+ ]
2204
+ },
2205
+ {
2206
+ "cell_type": "code",
2207
+ "execution_count": 27,
2208
+ "id": "ae5a2c977f0330cd",
2209
+ "metadata": {
2210
+ "ExecuteTime": {
2211
+ "end_time": "2025-01-15T20:31:15.767984Z",
2212
+ "start_time": "2025-01-15T20:31:15.757908Z"
2213
+ }
2214
+ },
2215
+ "outputs": [
2216
+ {
2217
+ "data": {
2218
+ "text/plain": [
2219
+ "1903"
2220
+ ]
2221
+ },
2222
+ "execution_count": 27,
2223
+ "metadata": {},
2224
+ "output_type": "execute_result"
2225
+ }
2226
+ ],
2227
+ "source": [
2228
+ "len(embeddings_filtered)"
2229
+ ]
2230
+ },
2231
+ {
2232
+ "cell_type": "code",
2233
+ "execution_count": 28,
2234
+ "id": "a9b0c2b8798c9247",
2235
+ "metadata": {
2236
+ "ExecuteTime": {
2237
+ "end_time": "2025-01-15T20:32:19.597483Z",
2238
+ "start_time": "2025-01-15T20:32:19.566230Z"
2239
+ }
2240
+ },
2241
+ "outputs": [],
2242
+ "source": [
2243
+ "X = embeddings_filtered\n",
2244
+ "y = df_non_legacy['target_label'].values"
2245
+ ]
2246
+ },
2247
+ {
2248
+ "cell_type": "code",
2249
+ "execution_count": 29,
2250
+ "id": "b831de9df6a1d4c4",
2251
+ "metadata": {
2252
+ "ExecuteTime": {
2253
+ "end_time": "2025-01-15T20:32:24.546133Z",
2254
+ "start_time": "2025-01-15T20:32:24.357812Z"
2255
+ }
2256
+ },
2257
+ "outputs": [
2258
+ {
2259
+ "name": "stdout",
2260
+ "output_type": "stream",
2261
+ "text": [
2262
+ " precision recall f1-score support\n",
2263
+ "\n",
2264
+ "Critical Error 0.91 1.00 0.95 48\n",
2265
+ " Error 0.98 0.89 0.93 47\n",
2266
+ " HTTP Status 1.00 1.00 1.00 304\n",
2267
+ "Resource Usage 1.00 1.00 1.00 49\n",
2268
+ "Security Alert 1.00 0.99 1.00 123\n",
2269
+ "\n",
2270
+ " accuracy 0.99 571\n",
2271
+ " macro avg 0.98 0.98 0.98 571\n",
2272
+ " weighted avg 0.99 0.99 0.99 571\n",
2273
+ "\n"
2274
+ ]
2275
+ }
2276
+ ],
2277
+ "source": [
2278
+ "from sklearn.model_selection import train_test_split\n",
2279
+ "from sklearn.linear_model import LogisticRegression\n",
2280
+ "from sklearn.metrics import accuracy_score\n",
2281
+ "from sklearn.metrics import classification_report\n",
2282
+ "\n",
2283
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
2284
+ "clf = LogisticRegression(max_iter=1000)\n",
2285
+ "clf.fit(X_train, y_train)\n",
2286
+ "y_pred = clf.predict(X_test)\n",
2287
+ "report = classification_report(y_test, y_pred)\n",
2288
+ "print(report)"
2289
+ ]
2290
+ },
2291
+ {
2292
+ "cell_type": "code",
2293
+ "execution_count": 31,
2294
+ "id": "1317f9b2de813a32",
2295
+ "metadata": {
2296
+ "ExecuteTime": {
2297
+ "end_time": "2025-01-15T20:36:52.942021Z",
2298
+ "start_time": "2025-01-15T20:36:52.910539Z"
2299
+ }
2300
+ },
2301
+ "outputs": [
2302
+ {
2303
+ "data": {
2304
+ "text/plain": [
2305
+ "['../models/log_classifier.joblib']"
2306
+ ]
2307
+ },
2308
+ "execution_count": 31,
2309
+ "metadata": {},
2310
+ "output_type": "execute_result"
2311
+ }
2312
+ ],
2313
+ "source": [
2314
+ "import joblib\n",
2315
+ "joblib.dump(clf, '../models/log_classifier.joblib')\n"
2316
+ ]
2317
+ },
2318
+ {
2319
+ "cell_type": "code",
2320
+ "execution_count": null,
2321
+ "id": "0ddb2803",
2322
+ "metadata": {},
2323
+ "outputs": [],
2324
+ "source": []
2325
+ }
2326
+ ],
2327
+ "metadata": {
2328
+ "kernelspec": {
2329
+ "display_name": "venv",
2330
+ "language": "python",
2331
+ "name": "python3"
2332
+ },
2333
+ "language_info": {
2334
+ "codemirror_mode": {
2335
+ "name": "ipython",
2336
+ "version": 3
2337
+ },
2338
+ "file_extension": ".py",
2339
+ "mimetype": "text/x-python",
2340
+ "name": "python",
2341
+ "nbconvert_exporter": "python",
2342
+ "pygments_lexer": "ipython3",
2343
+ "version": "3.12.6"
2344
+ }
2345
+ },
2346
+ "nbformat": 4,
2347
+ "nbformat_minor": 5
2348
+ }