HARISH20205 committed
Commit f3352b5 · 0 Parent(s)
Files changed (16)
  1. .dockerignore +2 -0
  2. .gitattributes +35 -0
  3. .gitignore +2 -0
  4. .hfignore +2 -0
  5. Dockerfile +35 -0
  6. README.md +10 -0
  7. app.py +66 -0
  8. async_utils.py +11 -0
  9. catboost_model.bin +3 -0
  10. model_utils.py +0 -0
  11. requirements.txt +12 -0
  12. static/styles.css +79 -0
  13. sync_utils.py +7 -0
  14. templates/index.html +34 -0
  15. test.py +5 -0
  16. url_process.py +425 -0
.dockerignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ myenv/
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ myenv/
.hfignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ myenv/
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ # Base image with Python
+ FROM python:3.11-slim
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies needed to build and run the Python packages
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     libgomp1 \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy the requirements file
+ COPY requirements.txt /app/
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy the application code
+ COPY . /app/
+
+ # Expose the port
+ EXPOSE 5000
+
+ ENV FLASK_APP=app.py
+ ENV FLASK_RUN_HOST=0.0.0.0
+
+ # Start the application
+ CMD ["flask", "run", "-h", "0.0.0.0", "-p", "5000"]
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: Kebos Ai
+ emoji: 🐠
+ colorFrom: purple
+ colorTo: green
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,66 @@
+ import asyncio
+ import os
+
+ import pandas as pd
+ from concurrent.futures import ThreadPoolExecutor
+ from flask import Flask, request, render_template
+ from catboost import CatBoostClassifier
+ from url_process import extract_url_features  # Feature extraction for a single URL
+
+ # Batch processing: yields URLs in manageable chunks
+ def process_urls_in_batches(urls, batch_size=10):
+     for i in range(0, len(urls), batch_size):
+         yield urls[i:i + batch_size]
+
+ # Async wrapper so the blocking feature extraction (DNS lookups, HTTP requests)
+ # does not block the event loop
+ async def async_extract_features(url):
+     return await asyncio.to_thread(extract_url_features, url)
+
+ # ThreadPoolExecutor for extracting features from several URLs in parallel
+ def extract_features_in_parallel(urls):
+     with ThreadPoolExecutor(max_workers=5) as executor:
+         return list(executor.map(extract_url_features, urls))
+
+ # Load the CatBoost model and run inference
+ def predict_with_catboost(features_df, model_path):
+     model = CatBoostClassifier()
+     model.load_model(model_path)
+     return model.predict(features_df)
+
+ # Flask App Setup
+ app = Flask(__name__)
+
+ @app.route("/", methods=["GET", "POST"])
+ async def index():
+     result = None
+     url_features = None
+
+     if request.method == "POST":
+         # Get the URL input from the form
+         url = request.form["url"]
+
+         try:
+             # Asynchronously extract the URL features
+             features = await async_extract_features(url)
+
+             # Convert the features to a single-row DataFrame for the model
+             features_df = pd.DataFrame([features])
+
+             # Run the CatBoost prediction
+             model_path = os.path.join(os.getcwd(), "catboost_model.bin")
+             predictions = predict_with_catboost(features_df, model_path)
+
+             # Classify the URL as malicious or legitimate
+             result = "Malicious" if predictions[0] == 1 else "Legitimate"
+
+             # Optionally display the extracted features
+             url_features = features
+
+         except Exception as e:
+             result = f"Error processing URL: {e}"
+
+     return render_template("index.html", result=result, url_features=url_features)
+
+ if __name__ == "__main__":
+     app.run(debug=False, host="0.0.0.0")
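A quick way to exercise this route once the server is up — a minimal sketch; the localhost address and sample input are assumptions for illustration, not part of the commit:

# smoke_test.py — minimal sketch, assuming the Flask app is serving on localhost:5000
import requests

# Post through the same form field the template submits ("url")
response = requests.post("http://localhost:5000/", data={"url": "https://example.com"})
print(response.status_code)           # 200 if the route handled the request
print("Result:" in response.text)     # the rendered page embeds the verdict string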
async_utils.py ADDED
@@ -0,0 +1,11 @@
+ import asyncio
+ from url_process import extract_url_features
+
+ async def async_extract_features(url):
+     loop = asyncio.get_running_loop()  # get_event_loop() is deprecated inside coroutines
+     return await loop.run_in_executor(None, extract_url_features, url)
+
+ async def process_urls_async(urls):
+     tasks = [async_extract_features(url) for url in urls]
+     results = await asyncio.gather(*tasks)
+     return results
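A usage sketch for these helpers, assuming url_process.py is importable from the working directory; the example URLs are placeholders:

# run_async_utils.py — minimal sketch driving process_urls_async with asyncio.run
import asyncio
from async_utils import process_urls_async

urls = ["https://example.com", "https://example.org"]  # placeholder inputs
feature_dicts = asyncio.run(process_urls_async(urls))  # one feature dict per URL
print(len(feature_dicts))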
catboost_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6186b650ea61418983e18e94fde8127cfb4983aa45c456d68cd2ef396d70e54
+ size 8151864
model_utils.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ requests
+ pandas
+ tld
+ dnspython
+ catboost
+ lime
+ google  # provides the googlesearch module used in url_process.py
+ python-whois
+ flask[async]==2.1.0
+ werkzeug==2.1.0
+ aiohttp
+ cryptography
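Flask only enables the async views used in app.py when the async extra is installed, which pulls in asgiref; a quick sanity check (sketch):

# check_async.py — verifies the flask[async] extra is present (sketch)
import asgiref  # an ImportError here means the async extra was not installed
print(asgiref.__version__)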
static/styles.css ADDED
@@ -0,0 +1,79 @@
+ body {
+     font-family: Verdana, Geneva, Tahoma, sans-serif;
+     margin: 0;
+     padding: 0;
+     background-color: #0a0a0a;
+     color: #333;
+ }
+
+ header {
+     background-color: #999999;
+     color: white;
+     padding: 20px;
+     text-align: center;
+ }
+
+ .container {
+     max-width: 500px;
+     margin: 30px auto;
+     padding: 20px;
+     background-color: #fff;
+     box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
+     border-radius: 10px;
+ }
+
+ .form-group {
+     margin-bottom: 15px;
+ }
+
+ label {
+     font-weight: bold;
+     display: block;
+     margin-bottom: 5px;
+ }
+
+ input[type="text"] {
+     width: 95%;
+     padding: 10px;
+     font-size: 16px;
+     border: 1px solid #000000;
+     border-radius: 5px;
+ }
+
+ button {
+     background-color: #000000;
+     color: white;
+     border: 2px solid #fff;
+     padding: 10px 20px;
+     cursor: pointer;
+     font-size: 16px;
+     border-radius: 5px;
+ }
+
+ button:hover {
+     background-color: #545454;
+ }
+
+ .result {
+     margin-top: 20px;
+     padding: 10px;
+     font-size: 18px;
+     text-align: center;
+     border-radius: 5px;
+ }
+
+ .result.legitimate {
+     background-color: #000000;
+     color: #ffffff;
+ }
+
+ .result.malicious {
+     background-color: #000000;
+     color: #ffffff;
+ }
+
+ footer {
+     text-align: center;
+     margin-top: 20px;
+     color: #777;
+ }
sync_utils.py ADDED
@@ -0,0 +1,7 @@
+ from concurrent.futures import ThreadPoolExecutor
+ from url_process import extract_url_features
+
+ def process_urls_sync(urls):
+     with ThreadPoolExecutor() as executor:
+         results = list(executor.map(extract_url_features, urls))
+     return results
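Equivalent synchronous usage — a sketch with placeholder URLs:

# run_sync_utils.py — minimal sketch for the thread-pool variant
from sync_utils import process_urls_sync

feature_dicts = process_urls_sync(["https://example.com", "https://example.org"])
print(len(feature_dicts))  # one feature dict per input URL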
templates/index.html ADDED
@@ -0,0 +1,34 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>URL Classification</title>
+     <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+ </head>
+ <body>
+     <header>
+         <h1>KeBos AI - Intelligent URL Threat Analyser</h1>
+     </header>
+
+     <div class="container">
+         <form method="POST">
+             <div class="form-group">
+                 <label for="url">Enter URL:</label>
+                 <input type="text" id="url" name="url" placeholder="https://example.com" required value="{{ url if url else '' }}">
+             </div>
+             <button type="submit">Classify URL</button>
+         </form>
+
+         {% if result %}
+         <div class="result {% if result == 'Legitimate' %}legitimate{% else %}malicious{% endif %}">
+             Result: {{ result }}
+         </div>
+         {% endif %}
+     </div>
+
+     <!--<footer>
+         <p>&copy; 2024 URL Classification Tool</p>
+     </footer>-->
+ </body>
+ </html>
test.py ADDED
@@ -0,0 +1,5 @@
+ import os
+
+ model_path = os.path.join(os.getcwd(), "catboost_model.bin")
+
+ print(model_path)
url_process.py ADDED
@@ -0,0 +1,425 @@
+ from datetime import datetime
+ import re
+ import socket
+ import requests
+ import dns.resolver
+ import ssl
+ from urllib.parse import urlparse, parse_qs
+ import whois
+ from tld import get_tld
+ import pandas as pd
+ import time
+ from googlesearch import search
+ from catboost import CatBoostClassifier
+ from lime.lime_tabular import LimeTabularExplainer
+ import logging
+ import asyncio
+ import aiohttp
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Feature extraction functions
+ def extract_url_features(url):
+     features = {}
+     try:
+         # Basic URL features
+         features['qty_dot_url'] = url.count('.')
+         features['qty_slash_url'] = url.count('/')
+         features['qty_at_url'] = url.count('@')
+         features['qty_space_url'] = url.count(' ')
+         features['qty_plus_url'] = url.count('+')
+         features['qty_dollar_url'] = url.count('$')
+         features['length_url'] = len(url)
+         features['qty_equal_url'] = url.count('=')
+         features['qty_asterisk_url'] = url.count('*')
+         features['qty_percent_url'] = url.count('%')
+         features['qty_exclamation_url'] = url.count('!')
+         features['qty_questionmark_url'] = url.count('?')
+         features['qty_tilde_url'] = url.count('~')
+         features['qty_hyphen_url'] = url.count('-')
+         features['qty_hashtag_url'] = url.count('#')
+         features['qty_underline_url'] = url.count('_')
+
+         tld_pattern = r'\.(com|org|net|gov|edu|io|xyz|info|biz|co|uk|us|ca|au|in|cn|jp|ru|de|fr|it|nl|es|ch|se|no|dk|fi|pl|tr|br|za|mx|kr|sg|my|th|hk|vn|ar|cl|pe|nz|il|pk)'
+         features['qty_tld_url'] = len(re.findall(tld_pattern, url, re.IGNORECASE))
+         features['email_in_url'] = 1 if '@' in url else 0
+         features['url_google_index'] = is_url_indexed(url)
+         features['url_shortened'] = 1 if 'bit.ly' in url or 'tinyurl.com' in url else 0
+         features['qty_comma_url'] = url.count(',')
+         features['qty_and_url'] = url.count('&')
+
+         # Extract domain from URL
+         domain = url.split('/')[2] if '://' in url else url.split('/')[0]
+
+         # Domain features
+         features['qty_underline_domain'] = domain.count('_')
+         features['qty_equal_domain'] = domain.count('=')
+         features['qty_exclamation_domain'] = domain.count('!')
+         features['qty_comma_domain'] = domain.count(',')
+         features['qty_hashtag_domain'] = domain.count('#')
+         features['qty_vowels_domain'] = sum(1 for c in domain if c in 'aeiouAEIOU')
+         features['server_client_domain'] = 1 if 'server' in domain or 'client' in domain else 0
+         features['qty_dot_domain'] = domain.count('.')
+         features['domain_in_ip'] = 1 if is_ip_address(domain) else 0
+         features['domain_length'] = len(domain)
+         features['qty_hyphen_domain'] = domain.count('-')
+         features['qty_percent_domain'] = domain.count('%')
+         features['qty_at_domain'] = domain.count('@')
+         features['domain_spf'] = get_spf_record(domain)
+         features['domain_google_index'] = is_domain_indexed(domain)
+
+         # Directory features
+         directory = url.split('/')[3] if len(url.split('/')) > 3 else ""
+         features['qty_underline_directory'] = directory.count('_')
+         features['qty_equal_directory'] = directory.count('=')
+         features['qty_exclamation_directory'] = directory.count('!')
+         features['qty_comma_directory'] = directory.count(',')
+         features['qty_hashtag_directory'] = directory.count('#')
+         features['directory_length'] = len(directory)
+         features['qty_space_directory'] = directory.count(' ')
+         features['qty_tilde_directory'] = directory.count('~')
+         features['qty_dollar_directory'] = directory.count('$')
+         features['qty_plus_directory'] = directory.count('+')
+         features['qty_and_directory'] = directory.count('&')
+         features['qty_slash_directory'] = directory.count('/')
+         features['qty_dot_directory'] = directory.count('.')
+         features['qty_asterisk_directory'] = directory.count('*')
+         features['qty_at_directory'] = directory.count('@')
+         features['qty_questionmark_directory'] = directory.count('?')
+         features['qty_hyphen_directory'] = directory.count('-')
+         features['qty_percent_directory'] = directory.count('%')
+
+         # File features
+         file = url.split('/')[4] if len(url.split('/')) > 4 else ""
+         features['qty_underline_file'] = file.count('_')
+         features['qty_and_file'] = file.count('&')
+         features['qty_dollar_file'] = file.count('$')
+         features['qty_questionmark_file'] = file.count('?')
+         features['qty_equal_file'] = file.count('=')
+         features['qty_slash_file'] = file.count('/')
+         features['qty_exclamation_file'] = file.count('!')
+         features['qty_comma_file'] = file.count(',')
+         features['qty_hashtag_file'] = file.count('#')
+         features['file_length'] = len(file)
+         features['qty_tilde_file'] = file.count('~')
+         features['qty_at_file'] = file.count('@')
+         features['qty_dot_file'] = file.count('.')
+         features['qty_space_file'] = file.count(' ')
+         features['qty_plus_file'] = file.count('+')
+         features['qty_asterisk_file'] = file.count('*')
+         features['qty_hyphen_file'] = file.count('-')
+         features['qty_percent_file'] = file.count('%')
+
+         # Parameters features
+         params = url.split('?')[1] if '?' in url else ""
+         features['qty_underline_params'] = params.count('_')
+         features['qty_equal_params'] = params.count('=')
+         features['qty_exclamation_params'] = params.count('!')
+         features['qty_comma_params'] = params.count(',')
+         features['qty_hashtag_params'] = params.count('#')
+         features['params_length'] = len(params)
+         features['qty_tilde_params'] = params.count('~')
+         features['qty_asterisk_params'] = params.count('*')
+         features['qty_space_params'] = params.count(' ')
+         features['qty_dollar_params'] = params.count('$')
+         features['qty_questionmark_params'] = params.count('?')
+         features['tld_present_params'] = 1 if get_tld(url, as_object=True, fail_silently=True) else 0  # fail silently so a bad TLD does not poison every feature
+         features['qty_plus_params'] = params.count('+')
+         features['qty_at_params'] = params.count('@')
+         features['qty_params'] = url.count('?')
+         features['qty_and_params'] = params.count('&')
+         features['qty_hyphen_params'] = params.count('-')
+         features['qty_dot_params'] = params.count('.')
+         features['qty_percent_params'] = params.count('%')
+         features['qty_slash_params'] = params.count('/')
+
+         # Other features
+         features['asn_ip'] = get_asn(get_ip_from_url(url))
+         features['qty_ip_resolved'] = get_resolved_ips(domain)
+         features['ttl_hostname'] = get_ttl(domain)
+
+         # Extract domain time features and ensure timestamps
+         features['time_domain_activation'], features['time_domain_expiration'] = get_domain_time_features(domain)
+
+         # Convert activation time to a timestamp if it's a datetime object
+         if isinstance(features['time_domain_activation'], datetime):
+             features['time_domain_activation'] = features['time_domain_activation'].timestamp()
+
+         # Convert expiration time to a timestamp if it's a datetime object
+         if isinstance(features['time_domain_expiration'], datetime):
+             features['time_domain_expiration'] = features['time_domain_expiration'].timestamp()
+
+         try:
+             features['qty_redirects'] = len(requests.get(url, timeout=5).history)
+         except requests.exceptions.RequestException as e:
+             logger.warning(f"Error processing redirects for URL '{url}': {e}")
+             features['qty_redirects'] = -1
+
+         features['qty_mx_servers'] = get_mx_record_count(domain)
+         features['qty_nameservers'] = get_nameserver_count(domain)
+         features['tls_ssl_certificate'] = get_tls_ssl_certificate(domain)
+         features['time_response'] = get_response_time(url)
+
+     except Exception as e:
+         logger.warning(f"Error extracting features for {url}: {e}")
+         for key in features.keys():
+             features[key] = -1
+
+     return features
+
+ # Function to count specific characters in a URL
+ def count_char_in_url(url, char):
+     try:
+         return url.count(char)
+     except Exception:
+         return -1  # Return -1 if unable to count characters
+
+ # Function to extract domain features from a URL
+ def extract_domain_features(url):
+     try:
+         domain = urlparse(url).netloc
+         domain_parts = domain.split('.')
+         tld_length = len(domain_parts[-1])  # Top-level domain length
+         return domain, tld_length
+     except Exception:
+         return -1, -1  # Return -1 if extraction fails
+
+ def is_domain_indexed(domain):
+     query = f"site:{domain}"
+     try:
+         results = list(search(query, num=1))
+         return 1 if results else 0
+     except Exception as e:
+         logger.warning(f"Error checking Google index for {domain}: {e}")
+         return -1
+
+ def get_response_time(url):
+     try:
+         start_time = time.time()
+         response = requests.get(url, timeout=10)  # 10-second timeout
+         end_time = time.time()
+         return end_time - start_time  # Response time in seconds
+     except requests.exceptions.RequestException as e:
+         logger.warning(f"Error measuring response time for {url}: {e}")
+         return -1  # Return -1 if there's an error
+
+ def get_mx_record_count(domain):
+     try:
+         # Use dns.resolver.resolve instead of the deprecated query
+         answers = dns.resolver.resolve(domain, 'MX')
+         return len(answers)
+     except dns.resolver.NoAnswer:
+         # No MX records found for the domain
+         logger.info(f"No MX records found for {domain}.")
+         return 0
+     except (dns.resolver.NXDOMAIN, dns.resolver.Timeout, dns.resolver.NoNameservers) as e:
+         # Handle other DNS errors
+         logger.warning(f"Error fetching MX records for {domain}: {e}")
+         return -1
+
+ def get_nameserver_count(domain):
+     try:
+         # Query the NS records for the domain
+         ns_records = dns.resolver.resolve(domain, 'NS')
+         return len(ns_records)  # Return the count of NS records
+     except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN, dns.resolver.Timeout):
+         # Handle cases where no NS records are found or the DNS query fails
+         logger.info(f"No NS records found for {domain}.")
+         return 0  # Return 0 if no NS records exist
+
+ # Function to extract directory- and file-related features
+ def extract_path_features(url):
+     try:
+         parsed_url = urlparse(url)
+         path = parsed_url.path
+         return path
+     except Exception:
+         return -1  # Return -1 if extraction fails
+
+ # Function to extract query-parameter-related features
+ def extract_query_features(url):
+     try:
+         parsed_url = urlparse(url)
+         query_params = parse_qs(parsed_url.query)
+         return query_params
+     except Exception:
+         return -1  # Return -1 if extraction fails
+
+ # Function to check if the domain is in IP-address format
+ def is_ip_address(domain):
+     try:
+         socket.inet_aton(domain)
+         return 1  # It's an IP address
+     except socket.error:
+         return 0  # It's not an IP address
+
+ # Function to get the time-related features
+ def get_domain_time_features(domain):
+     try:
+         domain_info = whois.whois(domain)
+         activation_time = domain_info.creation_date
+         expiration_time = domain_info.expiration_date
+
+         # Ensure activation_time and expiration_time are valid datetime objects or None
+         if not isinstance(activation_time, (datetime, type(None))):
+             activation_time = None
+         if not isinstance(expiration_time, (datetime, type(None))):
+             expiration_time = None
+
+         return activation_time, expiration_time
+     except Exception as e:
+         logger.warning(f"Error fetching domain times for {domain}: {e}")
+         return -1, -1
+
+ '''def get_domain_time_features(domain):
+     rdap_url = f"https://rdap.org/domain/{domain}"  # RDAP public API endpoint
+     try:
+         response = requests.get(rdap_url, timeout=5)
+
+         if response.status_code == 200:
+             domain_data = response.json()
+
+             # Extract activation and expiration dates
+             events = domain_data.get("events", [{}])
+             creation_date = None
+             expiration_date = None
+
+             for event in events:
+                 if event.get("eventAction") == "registration":
+                     creation_date = event.get("eventDate")
+                 elif event.get("eventAction") == "expiration":
+                     expiration_date = event.get("eventDate")
+
+             # Convert string dates to datetime objects
+             creation_date = datetime.fromisoformat(creation_date) if creation_date else 0
+             expiration_date = datetime.fromisoformat(expiration_date) if expiration_date else 0
+
+             return creation_date, expiration_date
+         elif response.status_code == 404:
+             # Domain does not exist
+             return 0, 0
+         else:
+             # Failed to fetch data for other reasons
+             return -1, -1
+
+     except Exception as e:
+         print(f"Error fetching domain times for {domain}: {e}")
+         return -1, -1'''
+
+ # Function to get SPF record
+ def get_spf_record(domain):
+     try:
+         txt_records = dns.resolver.resolve(domain, 'TXT')
+         for record in txt_records:
+             if "v=spf1" in str(record):
+                 return 1
+         return 0
+     except Exception:
+         return -1  # Return -1 if SPF check fails
+
+ # Function to check for a TLS/SSL certificate
+ def get_tls_ssl_certificate(domain):
+     try:
+         context = ssl.create_default_context()
+         with socket.create_connection((domain, 443)) as sock:
+             with context.wrap_socket(sock, server_hostname=domain) as ssock:
+                 cert = ssock.getpeercert()
+                 return 1  # TLS/SSL certificate exists
+     except Exception:
+         return -1  # Return -1 if SSL check fails
+
+ # Function to get IP address from URL
+ def get_ip_from_url(url):
+     try:
+         domain = url.split('/')[2] if '://' in url else url.split('/')[0]
+         ip = socket.gethostbyname(domain)
+         return ip
+     except Exception:
+         return -1  # Return -1 if IP extraction fails
+
+ # Function to get ASN for an IP
+ def get_asn(ip):
+     if not ip or ip == -1:
+         return -1  # Return -1 if IP is invalid
+     try:
+         response = requests.get(f"https://ipinfo.io/{ip}/json")
+         data = response.json()
+         org = data.get("org", "Unknown ASN")
+         match = re.search(r'AS(\d+)', org)
+         return int(match.group(1)) if match else -1
+     except Exception:
+         return -1
+
+ # Function to get resolved IPs for a domain
+ def get_resolved_ips(domain):
+     try:
+         return len(socket.gethostbyname_ex(domain)[2])
+     except Exception:
+         return -1
+
+ # Function to get the TTL value for a domain
+ def get_ttl(domain):
+     try:
+         answers = dns.resolver.resolve(domain, 'A')
+         return answers.rrset.ttl
+     except Exception:
+         return -1
+
+ def is_url_indexed(url):
+     query = f"site:{url}"
+     try:
+         results = list(search(query, num=1))
+         return 1 if results else 0
+     except Exception as e:
+         logger.warning(f"Error checking if URL is indexed: {e}")
+         return -1
+
+ def process_urls(urls):
+     url_features = []
+     for url in urls:
+         if not (url.startswith("http://") or url.startswith("https://")):
+             url = "https://" + url
+         features = extract_url_features(url)
+         url_features.append(features)
+     return pd.DataFrame(url_features)
+
+ '''def predict_urls(urls, model_path):
+     features_df = process_urls(urls)
+     features_df.fillna(-1, inplace=True)
+     model = CatBoostClassifier()
+     model.load_model(model_path)
+     predictions = model.predict(features_df)
+     return predictions'''
+
+ '''def explain_prediction(features_df, model_path):
+     model = CatBoostClassifier()
+     model.load_model(model_path)
+     explainer = LimeTabularExplainer(
+         training_data=features_df.values,
+         feature_names=features_df.columns.tolist(),
+         class_names=["Legitimate", "Malicious"],
+         mode="classification"
+     )
+     explanation = explainer.explain_instance(
+         data_row=features_df.iloc[0].values,
+         predict_fn=model.predict_proba,
+         num_features=5
+     )
+     explanation.show_in_notebook(show_table=True)
+     return explanation'''
+
+ # Async enhancements for faster processing (optional)
+ async def fetch_url(session, url):
+     try:
+         async with session.get(url, timeout=10) as response:
+             return len(response.history)
+     except Exception:
+         return -1
+
+ async def process_urls_async(urls):
+     async with aiohttp.ClientSession() as session:
+         tasks = [fetch_url(session, url) for url in urls]
+         results = await asyncio.gather(*tasks)
+         return results
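Putting the module together end to end, a hedged sketch of batch classification: the model path is an assumption based on app.py, and feature extraction makes live DNS/WHOIS/HTTP calls, so this needs network access:

# classify_batch.py — sketch combining process_urls with the CatBoost model,
# mirroring what app.py does per request
import os
from catboost import CatBoostClassifier
from url_process import process_urls

urls = ["example.com", "bit.ly/abc123"]   # placeholders; process_urls adds the scheme
features_df = process_urls(urls)          # network-bound feature extraction
features_df.fillna(-1, inplace=True)      # match the handling in the commented predict_urls

model = CatBoostClassifier()
model.load_model(os.path.join(os.getcwd(), "catboost_model.bin"))  # path assumed as in app.py
for url, label in zip(urls, model.predict(features_df)):
    print(url, "Malicious" if label == 1 else "Legitimate")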