OVH committed
Commit 67b1c6c · 1 Parent(s): 12ab6cd

Added all the files
- .ipynb_checkpoints/Dockerfile-checkpoint +0 -0
- .ipynb_checkpoints/app-checkpoint.py +281 -0
- .ipynb_checkpoints/main-checkpoint.py +132 -0
- Dockerfile +0 -0
- app.py +281 -0
- main.py +132 -0
- requirements.txt +19 -0
- src/.ipynb_checkpoints/analyze_yelp_data-checkpoint.py +320 -0
- src/.ipynb_checkpoints/clean_data-checkpoint.py +77 -0
- src/.ipynb_checkpoints/create_dataset-checkpoint.py +217 -0
- src/.ipynb_checkpoints/feature_analyzer-checkpoint.py +212 -0
- src/.ipynb_checkpoints/model-checkpoint.py +541 -0
- src/.ipynb_checkpoints/model_trainer-checkpoint.py +35 -0
- src/.ipynb_checkpoints/preprocessing-checkpoint.py +831 -0
- src/__pycache__/analyze_yelp_data.cpython-311.pyc +0 -0
- src/__pycache__/clean_data.cpython-311.pyc +0 -0
- src/__pycache__/clean_data.cpython-39.pyc +0 -0
- src/__pycache__/create_dataset.cpython-311.pyc +0 -0
- src/__pycache__/create_dataset.cpython-39.pyc +0 -0
- src/__pycache__/data_balancing.cpython-311.pyc +0 -0
- src/__pycache__/feature_analyzer.cpython-311.pyc +0 -0
- src/__pycache__/feature_analyzer.cpython-39.pyc +0 -0
- src/__pycache__/feature_importance.cpython-311.pyc +0 -0
- src/__pycache__/model.cpython-311.pyc +0 -0
- src/__pycache__/model.cpython-39.pyc +0 -0
- src/__pycache__/model1.cpython-311.pyc +0 -0
- src/__pycache__/model1.cpython-39.pyc +0 -0
- src/__pycache__/model3.cpython-311.pyc +0 -0
- src/__pycache__/model3.cpython-39.pyc +0 -0
- src/__pycache__/model_trainer.cpython-311.pyc +0 -0
- src/__pycache__/model_trainer.cpython-39.pyc +0 -0
- src/__pycache__/models.cpython-311.pyc +0 -0
- src/__pycache__/preprocessing.cpython-311.pyc +0 -0
- src/__pycache__/preprocessing.cpython-39.pyc +0 -0
- src/analyze_yelp_data.py +320 -0
- src/clean_data.py +83 -0
- src/create_dataset.py +217 -0
- src/feature_analyzer.py +212 -0
- src/model.py +540 -0
- src/model_trainer.py +35 -0
- src/preprocessing.py +832 -0
.ipynb_checkpoints/Dockerfile-checkpoint
ADDED
File without changes
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,281 @@
.ipynb_checkpoints/main-checkpoint.py
ADDED
@@ -0,0 +1,132 @@
Dockerfile
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,281 @@
from flask import Flask, request, jsonify
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from pathlib import Path
from datetime import datetime
from loguru import logger
from huggingface_hub import hf_hub_download
import json
from preprocessing_test import Preprocessor
from src.model import *
from main import start_pipelines

app = Flask(__name__)

# Define default values for each column
default_values = {
    'review_id': 'KU_O5udG6zpxOg-VcAEodg',
    'user_id': 'mh_-eMZ6K5RLWhZyISBhwA',
    'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw',
    'review_stars': 0,
    'review_useful': 0,
    'review_funny': 0,
    'review_cool': 0,
    'review_text': 'It was a moderate experience',
    'review_date': 1531001351000,
    'business_name': 'Coffe at LA',
    'address': '1460 LA',
    'city': 'LA',
    'state': 'CA',
    'postal_code': '00000',
    'latitude': 0.0,
    'longitude': 0.0,
    'business_stars': 0.0,
    'business_review_count': 0,
    'is_open': 0,
    'attributes': '{}',
    'categories': 'Restaurants',
    'hours': '{"Monday": "7:0-20:0", "Tuesday": "7:0-20:0", "Wednesday": "7:0-20:0", "Thursday": "7:0-20:0", "Friday": "7:0-21:0", "Saturday": "7:0-21:0", "Sunday": "7:0-21:0"}',
    'user_name': 'default_user',
    'user_review_count': 0,
    'yelping_since': '2023-01-01 00:00:00',
    'user_useful': 0,
    'user_funny': 0,
    'user_cool': 0,
    'elite': '2024,2025',
    'friends': '',
    'fans': 0,
    'average_stars': 0.0,
    'compliment_hot': 0,
    'compliment_more': 0,
    'compliment_profile': 0,
    'compliment_cute': 0,
    'compliment_list': 0,
    'compliment_note': 0,
    'compliment_plain': 0,
    'compliment_cool': 0,
    'compliment_funny': 0,
    'compliment_writer': 0,
    'compliment_photos': 0,
    'checkin_date': '2023-01-01 00:00:00',
    'tip_compliment_count': 0.0,
    'tip_count': 0.0
}

# Expected types for validation
expected_types = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'review_stars': int,
    'review_useful': int,
    'review_funny': int,
    'review_cool': int,
    'review_text': str,
    'review_date': int,
    'business_name': str,
    'address': str,
    'city': str,
    'state': str,
    'postal_code': str,
    'latitude': float,
    'longitude': float,
    'business_stars': float,
    'business_review_count': int,
    'is_open': int,
    'attributes': dict,  # Assuming string representation of dict
    'categories': str,
    'hours': dict,  # Assuming string representation of dict
    'user_name': str,
    'user_review_count': int,
    'yelping_since': str,
    'user_useful': int,
    'user_funny': int,
    'user_cool': int,
    'elite': str,
    'friends': str,
    'fans': int,
    'average_stars': float,
    'compliment_hot': int,
    'compliment_more': int,
    'compliment_profile': int,
    'compliment_cute': int,
    'compliment_list': int,
    'compliment_note': int,
    'compliment_plain': int,
    'compliment_cool': int,
    'compliment_funny': int,
    'compliment_writer': int,
    'compliment_photos': int,
    'checkin_date': str,
    'tip_compliment_count': float,
    'tip_count': float
}

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Check if request contains JSON data
        if not request.json:
            return jsonify({'error': 'Request must contain JSON data'}), 400

        data = request.json

        # Extract train, test, and test_size with defaults
        train = data.get('train', False)
        test = data.get('test', False)
        test_size = float(data.get('test_size', 0.1))

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Handle training mode
        if train in (True, 'true', 'True'):
            start_pipelines(test_size=test_size)
            logger.info("PIPELINES FINISHED SUCCESSFULLY")
            return jsonify({
                'message': 'Training pipelines executed successfully',
                'test_size': test_size
            }), 200

        # Handle testing/inference mode
        elif test in (True, 'test', 'True'):
            REPO_ID = "Askhedi/graphformermodel"
            MODEL_FILENAME = "model_GraphformerModel_latest.pth"
            model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)

            # Load model
            model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(device)
            model.load_state_dict(torch.load(model_path, map_location=device))
            model.eval()

            # Process input data from JSON
            row = {}
            warnings = []
            for col, expected_type in expected_types.items():
                value = data.get(col, default_values[col])
                try:
                    if value == "" or value is None:
                        row[col] = default_values[col]
                    elif col in ['attributes', 'hours']:
                        # Expect a valid JSON string that parses to a dict
                        if isinstance(value, str):
                            parsed = json.loads(value)
                            if not isinstance(parsed, dict):
                                raise ValueError
                            row[col] = value  # Keep as string for Preprocessor
                        else:
                            raise ValueError
                    else:
                        row[col] = expected_type(value)
                except (ValueError, TypeError, json.JSONDecodeError):
                    row[col] = default_values[col]
                    warnings.append(f"Invalid input for '{col}' (expected {expected_type.__name__}), using default value: {default_values[col]}")

            # Convert dictionaries to strings before passing to DataFrame
            for col in ['attributes', 'hours']:
                if isinstance(row[col], dict):
                    row[col] = json.dumps(row[col])

            # Create DataFrame from input
            input_df = pd.DataFrame([row])

            # Preprocess using Preprocessor
            preprocessor = Preprocessor(input_df)
            processed_df = preprocessor.run_pipeline()
            logger.info(f"PREPROCESSING COMPLETED VALUES ARE {processed_df}")

            # Build standalone graph from processed data
            num_users = 1
            num_businesses = 1
            num_rows = 1

            graph = HeteroData()
            features = torch.tensor(processed_df.drop(columns=['user_id', 'review_id', 'business_id']).values, dtype=torch.float, device=device)
            time_since_user = torch.tensor(processed_df['time_since_last_review_user'].values, dtype=torch.float, device=device)
            time_since_business = torch.tensor(processed_df['time_since_last_review_business'].values, dtype=torch.float, device=device)

            user_indices = torch.tensor([0], dtype=torch.long, device=device)
            business_indices = torch.tensor([0], dtype=torch.long, device=device)
            review_indices = torch.tensor([0], dtype=torch.long, device=device)

            user_feats = torch.zeros(num_users, 14, device=device)
            business_feats = torch.zeros(num_businesses, 8, device=device)
            review_feats = torch.zeros(num_rows, 16, device=device)

            user_feats[0] = features[0, :14]
            business_feats[0] = features[0, 14:22]
            review_feats[0] = features[0, 22:38]

            graph['user'].x = user_feats
            graph['business'].x = business_feats
            graph['review'].x = review_feats

            graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
            graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)

            # Compute encodings
            G = nx.DiGraph()
            node_type_map = {0: 'user', 1: 'business', 2: 'review'}
            G.add_nodes_from([0, 1, 2])
            G.add_edge(0, 2)  # user -> review
            G.add_edge(2, 1)  # review -> business

            num_nodes = 3
            spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=device)
            for i in range(num_nodes):
                for j in range(num_nodes):
                    if i == j:
                        spatial_encoding[i, j] = 0
                    elif nx.has_path(G, i, j):
                        spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)

            centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=device).view(-1, 1)

            edge_features_dict = {}
            user_writes_edge = graph['user', 'writes', 'review'].edge_index
            review_about_edge = graph['review', 'about', 'business'].edge_index

            edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
                time_since_user[user_writes_edge[0]], time_since_user[user_writes_edge[1]],
                user_indices[user_writes_edge[0]], user_indices[user_writes_edge[0]]
            )
            edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
                time_since_business[review_about_edge[0]], time_since_business[review_about_edge[1]],
                torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
            )

            time_since_dict = {
                'user': torch.tensor([time_since_user[0]], dtype=torch.float, device=device)
            }

            # Inference
            with torch.no_grad():
                out = model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
                pred_label = 1 if out.squeeze().item() > 0.5 else 0
                prob = out.squeeze().item()

            # Combine warnings and result
            result = {
                'warnings': warnings,
                'prediction': 'Fake' if pred_label == 1 else 'Not Fake',
                'probability': float(prob)
            }
            return jsonify(result), 200

        else:
            return jsonify({
                'error': 'Either "train" or "test" must be set to true'
            }), 400

    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
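
A minimal client call against the /predict route above could look like the sketch below. It assumes the server is running locally on port 5000 (as in the __main__ block) and uses the requests package, which is not listed in requirements.txt; any field left out of the payload falls back to default_values on the server.

# Hypothetical client sketch for the inference branch of /predict.
import requests

payload = {
    "test": True,  # select the inference branch
    "review_text": "Great coffee, friendly staff, would come again!",
    "review_stars": 5,
    "user_id": "mh_-eMZ6K5RLWhZyISBhwA",
    "business_id": "XQfwVwDr-v0ZS3_CbbE5Xw",
}

resp = requests.post("http://localhost:5000/predict", json=payload)
print(resp.status_code, resp.json())  # {'warnings': [...], 'prediction': 'Fake'/'Not Fake', 'probability': ...}
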
main.py
ADDED
@@ -0,0 +1,132 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import sys
from datetime import datetime
import warnings
import gc
import json

from loguru import logger
from src.create_dataset import process_datasets
from src.preprocessing import Preprocessor
from src.clean_data import DataCleaner
from src.feature_analyzer import FeatureAnalyzer
from src.model_trainer import ModelTrainer
from pathlib import Path


def create_directories():
    """Create all necessary directories for the pipeline"""
    directories = {
        'combined_data': Path('output_files/combined_data'),
        'preprocessed': Path('output_files/cleaned_preprocessed_data'),
        'feature_analyzer': Path('output_files/feature_analysis'),
        'model_outputs': Path('output_files/model_outputs'),
    }

    for dir_path in directories.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    return directories

def handle_memory():
    """Handle memory management"""
    gc.collect()
    warnings.filterwarnings('ignore')

def save_pipeline_metrics(metrics: dict, filepath: Path):
    """Save pipeline metrics to JSON file"""
    with open(filepath, 'w') as f:
        json.dump(metrics, f, indent=4, default=str)

def start_pipelines(train_size=0.25):
    # Setup logging
    logger.info("STARTING YELP DATA ANALYSIS PIPELINES...")
    dirs = create_directories()
    logger.info("Created necessary directories")

    logger.info("Pipeline 1: Creating initial dataset...")
    try:
        filename = "combined_merged_full.csv"
        df = process_datasets(output_path=dirs['combined_data'], filename=filename)
        logger.info(f"Dataset created successfully with shape: {df.shape}")
    except Exception as e:
        logger.error(f"Error in dataset creation: {str(e)}")

    try:
        logger.info("Pipeline 2: Preprocessing and Feature Engineering....")
        output_before_preprocess = Path(str(dirs['combined_data'])) / "combined_merged_full.csv"
        df = pd.read_csv(output_before_preprocess)
        prep = Preprocessor(df)
        feature_engineered_df = prep.run_pipeline()
    except Exception as e:
        logger.error(f"Error in Pipeline 2 Preprocessing and Feature Engineering as : {e}")

    try:
        logger.info("Pipeline 3: Cleaning data...")
        filename = "preprocessed_cleaned.csv"
        cleaner = DataCleaner(df=feature_engineered_df, output_path=str(dirs['preprocessed']), filename=filename)
        cleaner.run_pipeline()
        clean_output_file_path = Path(str(dirs['preprocessed'])) / filename
        print("Preprocessed and cleaned data saved in ", clean_output_file_path)
    except Exception as e:
        logger.error(f"Error in Pipeline 3 Cleaning Data : {str(e)}")

    try:
        logger.info("Pipeline 4: Analyzing features...")
        filename = "preprocessed_cleaned.csv"
        preprocessed_clean_output_file = Path(str(dirs['preprocessed'])) / filename
        preprocessed_clean_df = pd.read_csv(preprocessed_clean_output_file)
        analyzer = FeatureAnalyzer(df=preprocessed_clean_df, output_path=str(dirs['feature_analyzer']))
        analyzer.run_pipeline()
    except Exception as e:
        logger.error(f"Error in Feature analysis: {str(e)}")
        raise

    try:
        logger.info("Pipeline 5 : Training and Evaluating Models...")
        filename = "preprocessed_cleaned.csv"
        preprocessed_clean_output_file = Path(str(dirs['preprocessed'])) / filename
        preprocessed_clean_df = pd.read_csv(preprocessed_clean_output_file)
        preprocessed_clean_df = preprocessed_clean_df.sample(frac=1, random_state=42).reset_index(drop=True)
        size = int(train_size * len(preprocessed_clean_df))
        preprocessed_clean_df = preprocessed_clean_df.iloc[:size, :]

        trainer = ModelTrainer(df=preprocessed_clean_df, output_path=str(dirs['model_outputs']), epochs=50, test_size=0.3)
        trainer.train_and_evaluate()
        logger.info("Models training completed")
    except Exception as e:
        logger.error(f"Error in Model Trainer: {str(e)}")
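
For reference, a minimal driver for the pipeline above might be the sketch below; it assumes the MongoDB-backed source collections used by src.create_dataset are reachable and writes its artifacts under output_files/.

# Run all five pipelines end to end on 25% of the cleaned data.
from main import start_pipelines

if __name__ == "__main__":
    start_pipelines(train_size=0.25)
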
requirements.txt
ADDED
@@ -0,0 +1,19 @@
ujson
imblearn
scikit-learn==1.5.2
loguru
astropy
textblob
nltk
transformers
pandas
numpy
tqdm
pymongo
scikit-learn
torch
pathlib
torch-geometric
huggingface-hub
matplotlib
seaborn
src/.ipynb_checkpoints/analyze_yelp_data-checkpoint.py
ADDED
@@ -0,0 +1,320 @@
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import warnings
from typing import Dict, List, Tuple
import logging
from collections import Counter
from detoxify import Detoxify
import re
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import json

class AdvancedYelpAnalyzer:
    def __init__(self, df: pd.DataFrame):
        """Initialize the analyzer with necessary models and configurations"""
        self.df = df.copy()
        self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
        self.vader = SentimentIntensityAnalyzer()
        self.toxic_model = Detoxify('original')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.bert_model.to(self.device)

        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get_bert_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Generate BERT embeddings for text"""
        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            encoded = self.bert_tokenizer(batch_texts,
                                          padding=True,
                                          truncation=True,
                                          max_length=512,
                                          return_tensors='pt')

            with torch.no_grad():
                encoded = {k: v.to(self.device) for k, v in encoded.items()}
                outputs = self.bert_model(**encoded)
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(batch_embeddings)

        return np.vstack(embeddings)

    def analyze_sentiment(self) -> pd.DataFrame:
        """Perform comprehensive sentiment analysis using multiple tools"""
        self.logger.info("Starting sentiment analysis...")

        # Calculate BERT embeddings for reviews
        self.logger.info("Calculating BERT embeddings...")
        review_texts = self.df['review_text'].fillna('').tolist()
        bert_embeddings = self.get_bert_embeddings(review_texts)

        # Calculate review length using BERT tokenizer
        self.logger.info("Calculating tokenized lengths...")
        self.df['review_length'] = self.df['review_text'].apply(
            lambda x: len(self.bert_tokenizer.encode(str(x)))
        )

        # Store BERT embeddings mean and std as features
        self.df['bert_embedding_mean'] = np.mean(bert_embeddings, axis=1)
        self.df['bert_embedding_std'] = np.std(bert_embeddings, axis=1)

        # TextBlob sentiment and subjectivity
        self.df['textblob_polarity'] = self.df['review_text'].apply(
            lambda x: TextBlob(str(x)).sentiment.polarity
        )
        self.df['textblob_subjectivity'] = self.df['review_text'].apply(
            lambda x: TextBlob(str(x)).sentiment.subjectivity
        )

        # VADER sentiment with custom negative phrase handling
        def get_enhanced_vader_scores(text):
            # Custom negative phrases
            negative_phrases = [
                'too long', 'way too long', 'waiting', 'changed our minds',
                'too many', 'took forever', 'took too long', 'waste of time',
                'not worth', 'disappointing', 'mediocre', 'needs improvement'
            ]

            # Get base VADER scores
            base_scores = self.vader.polarity_scores(str(text))

            # Check for negative phrases
            text_lower = str(text).lower()
            neg_count = sum(1 for phrase in negative_phrases if phrase in text_lower)

            # Adjust scores if negative phrases are found
            if neg_count > 0:
                base_scores['neg'] = max(base_scores['neg'], min(0.7, neg_count * 0.2))
                base_scores['compound'] *= (1 - (neg_count * 0.15))
                # Readjust neutral score
                base_scores['neu'] = max(0, 1 - base_scores['neg'] - base_scores['pos'])

            return base_scores

        # Apply enhanced VADER scoring
        vader_scores = self.df['review_text'].apply(get_enhanced_vader_scores)
        self.df['vader_compound'] = vader_scores.apply(lambda x: x['compound'])
        self.df['vader_negative'] = vader_scores.apply(lambda x: x['neg'])
        self.df['vader_positive'] = vader_scores.apply(lambda x: x['pos'])
        self.df['vader_neutral'] = vader_scores.apply(lambda x: x['neu'])

        # Calculate sentiment extremity
        self.df['sentiment_extremity'] = self.df['vader_compound'].abs()

        return self.df

    def detect_anomalies(self) -> pd.DataFrame:
        """Detect anomalous reviews using Isolation Forest with BERT features"""
        self.logger.info("Detecting anomalies...")

        # Prepare features for anomaly detection
        features = [
            'review_stars',
            'textblob_polarity',
            'vader_compound',
            'sentiment_extremity',
            'review_length',
            'bert_embedding_mean',
            'bert_embedding_std'
        ]

        # Ensure all features exist
        missing_features = [f for f in features if f not in self.df.columns]
        if missing_features:
            self.analyze_sentiment()

        # Standardize features
        scaler = StandardScaler()
        X = scaler.fit_transform(self.df[features])

        # Apply Isolation Forest
        iso_forest = IsolationForest(
            contamination=0.1,
            random_state=42,
            n_jobs=-1
        )

        # Fit and predict
        self.df['is_anomaly'] = iso_forest.fit_predict(X)
        self.df['anomaly_score'] = iso_forest.score_samples(X)

        return self.df

    def detect_ai_generated_text(self) -> pd.DataFrame:
        """Estimate likelihood of AI-generated content"""
        self.logger.info("Detecting AI-generated content...")

        # Ensure sentiment analysis has been run
        if 'textblob_subjectivity' not in self.df.columns:
            self.analyze_sentiment()

        # Use detoxify model to get toxicity scores
        texts = self.df['review_text'].fillna('').tolist()
        toxic_scores = self.toxic_model.predict(texts)

        # Add scores to DataFrame
        toxic_score_types = ['toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
                             'insult', 'threat', 'sexual_explicit']
        for score_type in toxic_score_types:
            if score_type in toxic_scores:
                self.df[f'toxic_{score_type}'] = toxic_scores[score_type]

        # Calculate AI generation likelihood based on various factors
        self.df['ai_generated_likelihood'] = (
            (self.df['textblob_subjectivity'] < 0.3) &  # Low subjectivity
            (self.df['sentiment_extremity'] > 0.8) &  # Extreme sentiment
            (self.df['review_length'] > self.df['review_length'].quantile(0.95)) &  # Unusually long
            (self.df['bert_embedding_std'] < self.df['bert_embedding_std'].quantile(0.25))  # Unusual language patterns
        ).astype(int)

        # Add additional AI detection features
        self.df['ai_detection_score'] = (
            (self.df['textblob_subjectivity'] * -1) +  # Lower subjectivity increases score
            (self.df['sentiment_extremity'] * 0.5) +  # Extreme sentiment contributes somewhat
            (self.df['bert_embedding_std'] * -0.5)  # Lower variation in embeddings increases score
        ).clip(0, 1)  # Normalize between 0 and 1

        return self.df

    def analyze_business_categories(self) -> Dict:
        """Analyze trends and patterns specific to business categories"""
        self.logger.info("Analyzing business categories...")

        # Extract categories
        categories = self.df['categories'].fillna('').str.split(', ')
        all_categories = [cat for cats in categories if isinstance(cats, list) for cat in cats]
        category_counts = Counter(all_categories)

        # Analyze reviews by category
        category_analysis = {}
        for category in set(all_categories):
            category_reviews = self.df[self.df['categories'].str.contains(category, na=False)]

            category_analysis[category] = {
                'review_count': len(category_reviews),
                'avg_rating': category_reviews['review_stars'].mean() if not category_reviews.empty else None,
                'avg_sentiment': category_reviews['vader_compound'].mean() if 'vader_compound' in self.df.columns and not category_reviews.empty else None,
                'avg_subjectivity': category_reviews['textblob_subjectivity'].mean() if 'textblob_subjectivity' in self.df.columns and not category_reviews.empty else None
            }

        return category_analysis

    def visualize_results(self, output_dir: str):
        """Create visualizations for analysis results"""
        plt.figure(figsize=(15, 10))

        # Sentiment Distribution
        plt.subplot(2, 2, 1)
        sns.histplot(data=self.df, x='vader_compound', bins=50)
        plt.title('Sentiment Distribution')

        # Review Volume Over Time
        plt.subplot(2, 2, 2)
        daily_reviews = self.df.groupby('review_date').size()
        daily_reviews.plot()
        plt.title('Review Volume Over Time')

        # Anomaly Score Distribution
        plt.subplot(2, 2, 3)
        if 'anomaly_score' not in self.df.columns:
            self.detect_anomalies()
        sns.histplot(data=self.df, x='anomaly_score', bins=50)
        plt.title('Anomaly Score Distribution')

        # AI Generation Likelihood
        plt.subplot(2, 2, 4)
        if 'ai_generated_likelihood' not in self.df.columns:
            self.detect_ai_generated_text()
        sns.histplot(data=self.df, x='ai_generated_likelihood', bins=2)
        plt.title('AI Generation Likelihood')

        plt.tight_layout()
        plt.savefig(f'{output_dir}/analysis_results.png')
        plt.close()

    def run_full_analysis(self, output_dir: str) -> Tuple[pd.DataFrame, Dict]:
        """Run complete analysis pipeline with detailed outputs"""
        self.logger.info("Starting full analysis pipeline...")

        # Create output directory if it doesn't exist
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Run all analyses
            self.analyze_sentiment()
            self.detect_anomalies()
            self.detect_ai_generated_text()
            category_analysis = self.analyze_business_categories()

            # Create visualizations
            self.visualize_results(str(output_dir))

            # Compile results
            analysis_results = {
                'category_analysis': category_analysis,
                'sentiment_summary': {
                    'avg_sentiment': self.df['vader_compound'].mean(),
                    'positive_reviews': len(self.df[self.df['vader_compound'] > 0.5]),
                    'negative_reviews': len(self.df[self.df['vader_compound'] < -0.5]),
                    'neutral_reviews': len(self.df[abs(self.df['vader_compound']) <= 0.5])
                },
                'ai_detection_summary': {
                    'likely_ai_generated': len(self.df[self.df['ai_generated_likelihood'] == 1]),
                    'avg_ai_score': self.df['ai_detection_score'].mean()
                },
                'anomaly_summary': {
                    'anomalous_reviews': len(self.df[self.df['is_anomaly'] == -1]),
                    'avg_anomaly_score': self.df['anomaly_score'].mean()
                }
            }

            # Save results
            self.df.to_csv(output_dir / "analyzed_data.csv", index=False)
            with open(output_dir / "analysis_results.json", 'w') as f:
                json.dump(analysis_results, f, indent=4)

            return self.df, analysis_results

        except Exception as e:
            self.logger.error(f"Error during analysis: {str(e)}")
            raise

# For testing
if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    try:
        # Read test data
        df = pd.read_csv("test_data.csv")

        # Initialize analyzer
        analyzer = AdvancedYelpAnalyzer(df)

        # Run analysis
        output_dir = "output"
        analyzed_df, results = analyzer.run_full_analysis(output_dir)

        logger.info("Analysis completed successfully!")

    except Exception as e:
        logger.error(f"Error during testing: {str(e)}")
        raise
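
The enhanced VADER scoring in analyze_sentiment boosts the negative score by 0.2 per matched phrase (capped at 0.7), damps the compound score by 15% per match, and then renormalizes the neutral score. A standalone sketch of the same adjustment, outside the class and with a shortened phrase list, might look like this:

# Standalone sketch of the phrase-based VADER adjustment used above.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

_vader = SentimentIntensityAnalyzer()
NEGATIVE_PHRASES = ['too long', 'took forever', 'not worth', 'disappointing']

def enhanced_vader(text: str) -> dict:
    scores = _vader.polarity_scores(str(text))
    hits = sum(1 for phrase in NEGATIVE_PHRASES if phrase in str(text).lower())
    if hits > 0:
        scores['neg'] = max(scores['neg'], min(0.7, hits * 0.2))   # boost negative
        scores['compound'] *= (1 - hits * 0.15)                    # damp compound
        scores['neu'] = max(0, 1 - scores['neg'] - scores['pos'])  # renormalize neutral
    return scores

print(enhanced_vader("The wait took forever and the food was not worth it."))
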
src/.ipynb_checkpoints/clean_data-checkpoint.py
ADDED
@@ -0,0 +1,77 @@
# clean_yelp_data.py
from loguru import logger
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import json
from pathlib import Path
import logging
from scipy.stats import entropy
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
import os
from pathlib import Path

class DataCleaner:
    def __init__(self, df, output_path, filename="preprocessed_cleaned.csv"):
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)

        output_file = Path(self.output_path) / self.filename
        logger.info(f"Files saved in directory {output_file} as : {self.filename}")
        self.df.to_csv(output_file, index=False)

    def dropping_unncessary_columns(self):
        self.df.drop("review_text", axis=1, inplace=True)
        self.df.drop("review_date", axis=1, inplace=True)
        self.df.drop("business_name", axis=1, inplace=True)
        self.df.drop("city", axis=1, inplace=True)
        self.df.drop("state", axis=1, inplace=True)
        self.df.drop("postal_code", axis=1, inplace=True)
        self.df.drop("categories", axis=1, inplace=True)
        self.df.drop("user_name", axis=1, inplace=True)
        self.df.drop("yelping_since", axis=1, inplace=True)
        self.df.drop("checkin_date", axis=1, inplace=True)
        self.df.drop("review_useful", axis=1, inplace=True)
        self.df.drop("review_funny", axis=1, inplace=True)
        self.df.drop("review_cool", axis=1, inplace=True)
        self.df.drop("user_useful", axis=1, inplace=True)
        self.df.drop("user_funny", axis=1, inplace=True)
        self.df.drop("user_cool", axis=1, inplace=True)
        self.df.drop("is_open", axis=1, inplace=True)
        self.df.drop("compliment_hot", axis=1, inplace=True)
        self.df.drop("compliment_more", axis=1, inplace=True)
        self.df.drop("compliment_profile", axis=1, inplace=True)
        self.df.drop("compliment_cute", axis=1, inplace=True)
        self.df.drop("compliment_list", axis=1, inplace=True)
        self.df.drop("compliment_note", axis=1, inplace=True)
        self.df.drop("compliment_plain", axis=1, inplace=True)
        self.df.drop("compliment_cool", axis=1, inplace=True)
        self.df.drop("compliment_funny", axis=1, inplace=True)
        self.df.drop("compliment_writer", axis=1, inplace=True)
        self.df.drop("compliment_photos", axis=1, inplace=True)

    def run_pipeline(self):
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unncessary_columns()

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()
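
The per-column drop calls in dropping_unncessary_columns can also be expressed as a single pandas drop over a column list. A minimal equivalent sketch follows; the errors="ignore" flag is an addition not present in the original, used here so that already-missing columns do not raise.

# Same column set as DataCleaner.dropping_unncessary_columns, dropped in one call.
import pandas as pd

COLUMNS_TO_DROP = [
    "review_text", "review_date", "business_name", "city", "state", "postal_code",
    "categories", "user_name", "yelping_since", "checkin_date",
    "review_useful", "review_funny", "review_cool",
    "user_useful", "user_funny", "user_cool", "is_open",
    "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute",
    "compliment_list", "compliment_note", "compliment_plain", "compliment_cool",
    "compliment_funny", "compliment_writer", "compliment_photos",
]

def drop_unnecessary_columns(df: pd.DataFrame) -> pd.DataFrame:
    # errors="ignore" is an assumption; it makes reruns on already-cleaned data safe
    return df.drop(columns=COLUMNS_TO_DROP, errors="ignore")
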
src/.ipynb_checkpoints/create_dataset-checkpoint.py
ADDED
@@ -0,0 +1,217 @@
import pandas as pd
import ujson as json
import gc
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp
from pymongo import MongoClient
from collections import defaultdict
from pathlib import Path

# def read_json_parallel(file_path, num_workers=None):
#     """Read JSON file using parallel processing"""
#     if num_workers is None:
#         num_workers = max(1, mp.cpu_count() - 1)
#     print(f"Reading {file_path}...")
#     # Read chunks and concatenate them into a single DataFrame
#     df = pd.read_json(file_path, lines=True, dtype_backend="pyarrow", chunksize=100000)
#     return next(df)


def read_data_mongo(file_path, num_workers=None):
    """Read a Yelp collection from MongoDB and return it as a DataFrame."""
    if num_workers is None:
        num_workers = max(1, mp.cpu_count() - 1)

    print(f"Reading {file_path}...")
    conn_str = "mongodb://Mtalha:[email protected]/"

    client = MongoClient(conn_str)
    databases = client.list_database_names()
    db_client = client["Yelp"]

    # Read the entire collection at once since chunksize isn't needed here
    try:
        collection = db_client[file_path]
        documents = collection.find({}, {"_id": 0})
        data = list(documents)
        final_dict = defaultdict(list)

        for dictt in data:
            for k, v in dictt.items():
                final_dict[k].append(v)
        df = pd.DataFrame(final_dict)

        # df = pd.read_json(file_path, orient='records', dtype_backend="pyarrow")
    except Exception as e:
        # Fallback comments kept from the original JSON-based reader
        # df = pd.read_json(file_path, dtype_backend="pyarrow")
        print("ERROR WHILE READING FILES FROM MONGODB AS : ", e)
        raise  # without this, `df` would be undefined below
    print(f"Finished reading. DataFrame shape: {df.shape}")
    return df

def process_datasets(output_path, filename):
    # File paths
    file_paths = {
        'business': "yelp_academic_dataset_business",
        'checkin': "yelp_academic_dataset_checkin",
        'review': "yelp_academic_dataset_review",
        'tip': "yelp_academic_dataset_tip",
        'user': "yelp_academic_dataset_user",
        'google': "google_review_dataset"
    }

    # Read datasets with progress tracking
    print("Reading datasets...")
    dfs = {}
    for name, path in file_paths.items():
        print(f"Processing {name} dataset...")
        dfs[name] = read_data_mongo(path)
        print(f"Finished reading {name} dataset. Shape: {dfs[name].shape}")

    print("All files read. Starting column renaming...")

    # Rename columns to avoid conflicts
    # Reviews
    dfs['review'] = dfs['review'].rename(columns={
        'date': 'review_date',
        'stars': 'review_stars',
        'text': 'review_text',
        'useful': 'review_useful',
        'funny': 'review_funny',
        'cool': 'review_cool'
    })

    # Tips
    dfs['tip'] = dfs['tip'].rename(columns={
        'date': 'tip_date',
        'text': 'tip_text',
        'compliment_count': 'tip_compliment_count'
    })

    # Checkins
    dfs['checkin'] = dfs['checkin'].rename(columns={
        'date': 'checkin_date'
    })

    # Users
    dfs['user'] = dfs['user'].rename(columns={
        'name': 'user_name',
        'review_count': 'user_review_count',
        'useful': 'user_useful',
        'funny': 'user_funny',
        'cool': 'user_cool'
    })

    # Business
    dfs['business'] = dfs['business'].rename(columns={
        'name': 'business_name',
        'stars': 'business_stars',
        'review_count': 'business_review_count'
    })
    dfs['google'] = dfs['google'].rename(columns={
        'name': 'business_name',
        'stars': 'business_stars',
        'review_count': 'business_review_count'
    })
    df_business_final = dfs['business']
    df_google_final = dfs['google']
    df_review_final = dfs['review']
    df_tip_final = dfs['tip']
    df_checkin_final = dfs['checkin']
    df_user_final = dfs['user']

    df_business_final = pd.concat([df_business_final, df_google_final], axis=0)
    df_business_final.reset_index(drop=True, inplace=True)

    print("Starting merge process...")

    # Merge process with memory management
    print("Step 1: Starting with reviews...")
    merged_df = df_review_final

    print("Step 2: Merging with business data...")
    merged_df = merged_df.merge(
        df_business_final,
        on='business_id',
        how='left'
    )

    print("Step 3: Merging with user data...")
    merged_df = merged_df.merge(
        df_user_final,
        on='user_id',
        how='left'
    )

    print("Step 4: Merging with checkin data...")
    merged_df = merged_df.merge(
        df_checkin_final,
        on='business_id',
        how='left'
    )

    print("Step 5: Aggregating and merging tip data...")
    tip_agg = df_tip_final.groupby('business_id').agg({
        'tip_compliment_count': 'sum',
        'tip_text': 'count'
    }).rename(columns={'tip_text': 'tip_count'})

    merged_df = merged_df.merge(
        tip_agg,
        on='business_id',
        how='left'
    )

    print("Filling NaN values...")
    merged_df['tip_count'] = merged_df['tip_count'].fillna(0)
    merged_df['tip_compliment_count'] = merged_df['tip_compliment_count'].fillna(0)
    merged_df['checkin_date'] = merged_df['checkin_date'].fillna('')
    merged_df["friends"].fillna(0, inplace=True)

    for col in merged_df.columns:
        if merged_df[col].isnull().sum() > 0:
            print(f" {col} has {merged_df[col].isnull().sum()} null values")

    print("Shape of Merged Dataset is : ", merged_df.shape)
    output_file = Path(output_path) / filename
    print("COLUMNS BEFORE PREPROCESSING")
    print()
    print(merged_df.info())
    for col in merged_df.columns:
        for v in merged_df[col]:
            print(f"Type of values in {col} is {type(v)} and values are like : {v}")
            break
    merged_df.to_csv(output_file, index=False)

    return merged_df

# if __name__ == "__main__":
#     process_datasets()
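A possible invocation of the merge step above is sketched below. It assumes the MongoDB instance referenced in read_data_mongo is reachable; the output directory and file name are illustrative, not values from this repository.

# Hypothetical call; directory and file name are assumptions
merged = process_datasets(output_path="data/merged", filename="yelp_merged.csv")
print(merged.shape)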
src/.ipynb_checkpoints/feature_analyzer-checkpoint.py
ADDED
@@ -0,0 +1,212 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from loguru import logger

class FeatureAnalyzer:
    # Numeric features compared across the fake / genuine classes in several plots
    # (the same list was repeated inside each plotting method in the original)
    KEY_FEATURES = [
        'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
        'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
        'time_since_last_review_user', 'user_account_age', 'pronoun_density',
        'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
        'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
    ]

    def __init__(self, df, output_path):
        self.df = df
        self.output_path = output_path

    def plot_correlation_heatmap(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake')
        correlation_matrix = self.df[numeric_cols].corr()
        plt.figure(figsize=(14, 12))
        sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
        plt.title('Correlation Heatmap of Numeric Features', fontsize=16)
        plt.tight_layout()
        output_file = Path(self.output_path) / 'correlation_heatmap.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved correlation heatmap to {output_file}")

    def plot_mean_by_fake_bar(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        mean_by_fake = self.df.groupby('fake')[self.KEY_FEATURES].mean().T
        mean_by_fake.columns = ['Genuine (0)', 'Fake (1)']
        plt.figure(figsize=(12, 8))
        mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
        plt.title('Mean Feature Values by Fake Label', fontsize=16)
        plt.xlabel('Features', fontsize=12)
        plt.ylabel('Mean Value', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Fake Label')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'mean_by_fake_bar.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved mean by fake bar plot to {output_file}")

    def plot_violin_plots(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(14, 10))
        for i, feature in enumerate(self.KEY_FEATURES[:6], 1):
            plt.subplot(2, 3, i)
            sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
            plt.title(f'{feature} Distribution', fontsize=12)
            plt.xlabel('Fake (0/1)', fontsize=10)
        plt.tight_layout()
        output_file = Path(self.output_path) / 'violin_plots.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved violin plots to {output_file}")

    def plot_box_plots(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(14, 10))
        for i, feature in enumerate(self.KEY_FEATURES[6:11], 1):
            plt.subplot(2, 3, i)
            sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
            plt.title(f'{feature} Distribution', fontsize=12)
            plt.xlabel('Fake (0/1)', fontsize=10)
        plt.tight_layout()
        output_file = Path(self.output_path) / 'box_plots.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved box plots to {output_file}")

    def plot_scatter_review_grammar(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5)
        plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
        plt.xlabel('Review Stars', fontsize=12)
        plt.ylabel('Grammar Error Score', fontsize=12)
        plt.legend(title='Fake')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'scatter_review_grammar.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved scatter plot to {output_file}")

    def plot_density_plots(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(14, 10))
        for i, feature in enumerate(self.KEY_FEATURES[:4], 1):
            plt.subplot(2, 2, i)
            for label in [0, 1]:
                subset = self.df[self.df['fake'] == label]
                sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5)
            plt.title(f'{feature} Density', fontsize=12)
            plt.xlabel(feature, fontsize=10)
            plt.legend()
        plt.tight_layout()
        output_file = Path(self.output_path) / 'density_plots.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved density plots to {output_file}")

    def plot_stacked_bar_similarity(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
        stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0)
        stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0)
        plt.figure(figsize=(12, 8))
        stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8)
        plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
        plt.xlabel('Similarity Bins', fontsize=12)
        plt.ylabel('Proportion', fontsize=12)
        plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'stacked_bar_similarity.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved stacked bar plot to {output_file}")

    def plot_pie_fake_distribution(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        fake_counts = self.df['fake'].value_counts()
        plt.figure(figsize=(8, 8))
        plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90)
        plt.title('Distribution of Fake Labels', fontsize=16)
        plt.axis('equal')
        output_file = Path(self.output_path) / 'pie_fake_distribution.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved pie chart to {output_file}")

    def plot_count_code_switching(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(8, 6))
        sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon'])
        plt.title('Count of Fake by Code Switching Flag', fontsize=16)
        plt.xlabel('Code Switching Flag (0/1)', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.legend(title='Fake Label')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'count_code_switching.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved count plot to {output_file}")

    def plot_variance_by_fake_bar(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        variance_by_fake = self.df.groupby('fake')[self.KEY_FEATURES].var().T
        variance_by_fake.columns = ['Genuine (0)', 'Fake (1)']
        plt.figure(figsize=(12, 8))
        variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
        plt.title('Feature Variance by Fake Label', fontsize=16)
        plt.xlabel('Features', fontsize=12)
        plt.ylabel('Variance', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Fake Label')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'variance_by_fake_bar.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved variance bar plot to {output_file}")

    def run_pipeline(self):
        sns.set(style="whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)
        self.plot_correlation_heatmap()
        self.plot_mean_by_fake_bar()
        self.plot_violin_plots()
        self.plot_box_plots()
        self.plot_scatter_review_grammar()
        self.plot_density_plots()
        self.plot_stacked_bar_similarity()
        self.plot_pie_fake_distribution()
        self.plot_count_code_switching()
        self.plot_variance_by_fake_bar()
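A short driver for the analyzer above might look like this sketch; the CSV path and report directory are assumptions for illustration only.

# Hypothetical usage sketch; paths are assumptions
import pandas as pd

df = pd.read_csv("data/preprocessed_cleaned.csv")
FeatureAnalyzer(df, output_path="reports/eda").run_pipeline()   # writes the ten PNGs listed above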
src/.ipynb_checkpoints/model-checkpoint.py
ADDED
@@ -0,0 +1,541 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from pathlib import Path
from datetime import datetime
from loguru import logger

# Temporal Edge Features Function
def create_temporal_edge_features(time_since_src, time_since_tgt, user_i, user_j):
    delta_t = torch.abs(time_since_src - time_since_tgt).float()
    hour_scale = torch.sin(delta_t / 3600)
    day_scale = torch.sin(delta_t / (24 * 3600))
    week_scale = torch.sin(delta_t / (7 * 24 * 3600))
    same_user = (user_i == user_j).float()
    burst_feature = same_user * torch.exp(-delta_t / (24 * 3600))
    return torch.stack([hour_scale, day_scale, week_scale, burst_feature], dim=-1)

# Custom Multihead Attention (unchanged)
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.scale = self.head_dim ** -0.5

    def forward(self, query, key, value, attn_bias=None):
        batch_size, seq_len, embed_dim = query.size()
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if attn_bias is not None:
            scores = scores + attn_bias.unsqueeze(1)
        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        out = self.out_proj(out)
        return out, attn

# HeteroGraphormer (unchanged)
class HeteroGraphormer(nn.Module):
    def __init__(self, hidden_dim, output_dim, num_heads=4, edge_dim=4):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embed_dict = nn.ModuleDict({
            'user': nn.Linear(14, hidden_dim),
            'business': nn.Linear(8, hidden_dim),
            'review': nn.Linear(16, hidden_dim)
        })

        self.edge_proj = nn.Linear(edge_dim, hidden_dim)

        self.gru_user = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.gru_business = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.gru_review = nn.GRU(hidden_dim, hidden_dim, batch_first=True)

        self.attention1 = CustomMultiheadAttention(hidden_dim, num_heads)
        self.attention2 = CustomMultiheadAttention(hidden_dim, num_heads)

        self.ffn1 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim * 4, hidden_dim)
        )
        self.ffn2 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim * 4, hidden_dim)
        )

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)
        self.norm4 = nn.LayerNorm(hidden_dim)

        self.centrality_proj = nn.Linear(1, hidden_dim)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 3, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 1)
        )

        self.dropout = nn.Dropout(0.1)

    def time_aware_aggregation(self, x, time_since, decay_rate=0.1):
        weights = torch.exp(-decay_rate * time_since.unsqueeze(-1))
        return x * weights

    def forward(self, data, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict):
        x_dict = {}
        for node_type in data.x_dict:
            x = self.embed_dict[node_type](data[node_type].x)
            if node_type in time_since_dict:
                x = self.time_aware_aggregation(x, time_since_dict[node_type])
            x_dict[node_type] = x

        x = torch.cat([x_dict['user'], x_dict['business'], x_dict['review']], dim=0)

        centrality = self.centrality_proj(centrality_encoding)
        x = x + centrality

        x = x.unsqueeze(0)

        x_user = x[:, :data['user'].x.size(0), :]
        x_business = x[:, data['user'].x.size(0):data['user'].x.size(0) + data['business'].x.size(0), :]
        x_review = x[:, data['user'].x.size(0) + data['business'].x.size(0):, :]

        x_user, _ = self.gru_user(x_user)
        x_business, _ = self.gru_business(x_business)
        x_review, _ = self.gru_review(x_review)

        x = torch.cat([x_user, x_business, x_review], dim=1)

        total_nodes = x.size(1)
        attn_bias = torch.zeros(1, total_nodes, total_nodes, device=x.device)
        attn_bias[0] = -spatial_encoding

        for edge_type in edge_features_dict:
            edge_index = data[edge_type].edge_index
            edge_feats = self.edge_proj(edge_features_dict[edge_type])
            for i, (src, tgt) in enumerate(edge_index.t()):
                attn_bias[0, src, tgt] += edge_feats[i].sum()

        residual = x
        x, _ = self.attention1(x, x, x, attn_bias=attn_bias)
        x = self.norm1(x + residual)
        x = self.dropout(x)

        residual = x
        x = self.ffn1(x)
        x = self.norm2(x + residual)
        x = self.dropout(x)

        residual = x
        x, _ = self.attention2(x, x, x, attn_bias=attn_bias)
        x = self.norm3(x + residual)
        x = self.dropout(x)

        residual = x
        x = self.ffn2(x)
        x = self.norm4(x + residual)
        x = self.dropout(x)

        x = x.squeeze(0)

        user_start = 0
        business_start = data['user'].x.size(0)
        review_start = business_start + data['business'].x.size(0)

        h_user = x[user_start:business_start]
        h_business = x[business_start:review_start]
        h_review = x[review_start:]

        user_indices = data['user', 'writes', 'review'].edge_index[0]
        business_indices = data['review', 'about', 'business'].edge_index[1]
        review_indices = data['user', 'writes', 'review'].edge_index[1]

        h_user_mapped = h_user[user_indices]
        h_business_mapped = h_business[business_indices]
        h_review_mapped = h_review[review_indices]

        combined = torch.cat([h_review_mapped, h_user_mapped, h_business_mapped], dim=-1)

        logits = self.classifier(combined)
        return torch.sigmoid(logits)

# Updated GraphformerModel with Plotting
class GraphformerModel:
    def __init__(self, df, output_path, epochs, test_size=0.3):
        self.df_whole = df
        self.output_path = output_path
        self.output_path = Path(self.output_path) / "GraphformerModel"
        self.epochs = epochs
        self.df, self.test_df = train_test_split(self.df_whole, test_size=test_size, random_state=42)

        torch.manual_seed(42)
        np.random.seed(42)

        Path(self.output_path).mkdir(parents=True, exist_ok=True)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005)
        self.criterion = nn.BCELoss()

    def compute_graph_encodings(self, data):
        G = nx.DiGraph()
        node_offset = 0
        node_type_map = {}

        for node_type in ['user', 'business', 'review']:
            num_nodes = data[node_type].x.size(0)
            for i in range(num_nodes):
                G.add_node(node_offset + i)
                node_type_map[node_offset + i] = node_type
            node_offset += num_nodes

        edge_types = [('user', 'writes', 'review'), ('review', 'about', 'business')]
        for src_type, rel, tgt_type in edge_types:
            edge_index = data[src_type, rel, tgt_type].edge_index
            src_nodes = edge_index[0].tolist()
            tgt_nodes = edge_index[1].tolist()
            src_offset = 0 if src_type == 'user' else (self.num_users if src_type == 'business' else self.num_users + self.num_businesses)
            tgt_offset = 0 if tgt_type == 'user' else (self.num_users if tgt_type == 'business' else self.num_users + self.num_businesses)
            for src, tgt in zip(src_nodes, tgt_nodes):
                G.add_edge(src + src_offset, tgt + tgt_offset)

        num_nodes = G.number_of_nodes()
        spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=self.device)
        for i in range(num_nodes):
            for j in range(num_nodes):
                if i == j:
                    spatial_encoding[i, j] = 0
                elif nx.has_path(G, i, j):
                    spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)

        centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=self.device).view(-1, 1)

        return spatial_encoding, centrality_encoding, node_type_map

    def compute_metrics(self, y_true, y_pred, y_prob, prefix=""):
        metrics = {}
        metrics[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
        metrics[f"{prefix}precision"] = precision_score(y_true, y_pred, zero_division=0)
        metrics[f"{prefix}recall"] = recall_score(y_true, y_pred, zero_division=0)
        metrics[f"{prefix}f1"] = f1_score(y_true, y_pred, zero_division=0)
        metrics[f"{prefix}auc_roc"] = roc_auc_score(y_true, y_prob)
        metrics[f"{prefix}conf_matrix"] = confusion_matrix(y_true, y_pred)
        metrics[f"{prefix}class_report"] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        return metrics

    def run_model(self):
        features = torch.tensor(self.df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
        y = torch.tensor(self.df['fake'].values, dtype=torch.float, device=self.device)
        time_since_user = torch.tensor(self.df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
        time_since_business = torch.tensor(self.df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
        num_rows = len(self.df)

        graph = HeteroData()

        self.num_users = len(self.df['user_id'].unique())
        self.num_businesses = len(self.df['business_id'].unique())

        user_indices = torch.tensor(self.df['user_id'].map({uid: i for i, uid in enumerate(self.df['user_id'].unique())}).values, dtype=torch.long, device=self.device)
        business_indices = torch.tensor(self.df['business_id'].map({bid: i for i, bid in enumerate(self.df['business_id'].unique())}).values, dtype=torch.long, device=self.device)
        review_indices = torch.arange(num_rows, dtype=torch.long, device=self.device)

        user_feats = torch.zeros(self.num_users, 14, device=self.device)
        business_feats = torch.zeros(self.num_businesses, 8, device=self.device)
        review_feats = torch.zeros(num_rows, 16, device=self.device)

        user_cols = ['hours', 'user_review_count', 'elite', 'friends', 'fans', 'average_stars',
                     'time_since_last_review_user', 'user_account_age', 'user_degree',
                     'user_review_burst_count', 'review_like_ratio', 'latest_checkin_hours',
                     'user_useful_funny_cool', 'rating_variance_user']
        business_cols = ['latitude', 'longitude', 'business_stars', 'business_review_count',
                         'time_since_last_review_business', 'business_degree',
                         'business_review_burst_count', 'rating_deviation_from_business_average']
        review_cols = ['review_stars', 'tip_compliment_count', 'tip_count', 'average_time_between_reviews',
                       'temporal_similarity', 'pronoun_density', 'avg_sentence_length',
                       'excessive_punctuation_count', 'sentiment_polarity', 'good_severity',
                       'bad_severity', 'code_switching_flag', 'grammar_error_score',
                       'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool']

        for i in range(len(self.df)):
            user_idx = user_indices[i]
            business_idx = business_indices[i]
            user_feats[user_idx] += features[i, :14]
            business_feats[business_idx] += features[i, 14:22]
        review_feats = features[:, 22:38]

        graph['user'].x = user_feats
        graph['business'].x = business_feats
        graph['review'].x = review_feats
        graph['review'].y = y

        graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
        graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)

        edge_features_dict = {}
        user_writes_edge = graph['user', 'writes', 'review'].edge_index
        review_about_edge = graph['review', 'about', 'business'].edge_index

        src_users = user_indices[user_writes_edge[0]]
        tgt_reviews = review_indices[user_writes_edge[1]]
        edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
            time_since_user[src_users], time_since_user[tgt_reviews], src_users, src_users
        )

        src_reviews = review_indices[review_about_edge[0]]
        tgt_businesses = business_indices[review_about_edge[1]]
        edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
            time_since_business[src_reviews], time_since_business[tgt_businesses],
            torch.zeros_like(src_reviews), torch.zeros_like(src_reviews)
        )

        user_time_since = self.df.groupby('user_id')['time_since_last_review_user'].min().reindex(
            self.df['user_id'].unique(), fill_value=0).values
        time_since_dict = {
            'user': torch.tensor(user_time_since, dtype=torch.float, device=self.device)
        }

        spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)

        # Training with metrics history
        self.model.train()
        train_metrics_history = []
        for epoch in range(self.epochs):
            self.optimizer.zero_grad()
            out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
            loss = self.criterion(out.squeeze(), y)
            loss.backward()
            self.optimizer.step()

            pred_labels = (out.squeeze() > 0.5).float()
            logger.info(f"PREDICTED LABELS : {pred_labels}")
            probs = out.squeeze().detach().cpu().numpy()
            train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels.cpu().numpy(), probs, prefix="train_")
            train_metrics['loss'] = loss.item()
            train_metrics_history.append(train_metrics)

            if epoch % 10 == 0:
                logger.info(f"Epoch {epoch}, Loss: {loss.item():.4f}, Accuracy: {train_metrics['train_accuracy']:.4f}, F1: {train_metrics['train_f1']:.4f}")

        # Save model
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_save_path = Path(self.output_path) / "model_GraphformerModel_latest.pth"
        torch.save(self.model.state_dict(), model_save_path)

        # Testing
        if self.test_df is not None:
            test_features = torch.tensor(self.test_df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
            test_y = torch.tensor(self.test_df['fake'].values, dtype=torch.float, device=self.device)
            test_time_since_user = torch.tensor(self.test_df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
            test_time_since_business = torch.tensor(self.test_df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
            num_test_rows = len(self.test_df)

            new_user_unique = self.test_df['user_id'].unique()
            new_business_unique = self.test_df['business_id'].unique()

            existing_user_ids = list(self.df['user_id'].unique())
            user_mapping = {uid: i for i, uid in enumerate(existing_user_ids)}
            total_users = self.num_users
            for uid in new_user_unique:
                if uid not in user_mapping:
                    user_mapping[uid] = total_users
                    total_users += 1

            existing_business_ids = list(self.df['business_id'].unique())
            business_mapping = {bid: i for i, bid in enumerate(existing_business_ids)}
            total_businesses = self.num_businesses
            for bid in new_business_unique:
                if bid not in business_mapping:
                    business_mapping[bid] = total_businesses
                    total_businesses += 1

            new_user_indices = torch.tensor([user_mapping[uid] for uid in self.test_df['user_id']], dtype=torch.long, device=self.device)
            new_business_indices = torch.tensor([business_mapping[bid] for bid in self.test_df['business_id']], dtype=torch.long, device=self.device)
            new_review_indices = torch.arange(num_rows, num_rows + num_test_rows, device=self.device)

            if total_users > self.num_users:
                additional_user_feats = torch.zeros(total_users - self.num_users, 14, device=self.device)
                graph['user'].x = torch.cat([graph['user'].x, additional_user_feats], dim=0)
            if total_businesses > self.num_businesses:
                additional_business_feats = torch.zeros(total_businesses - self.num_businesses, 8, device=self.device)
                graph['business'].x = torch.cat([graph['business'].x, additional_business_feats], dim=0)

            for i in range(num_test_rows):
                user_idx = new_user_indices[i]
                business_idx = new_business_indices[i]
                if user_idx < graph['user'].x.size(0):
                    graph['user'].x[user_idx] += test_features[i, :14]
                if business_idx < graph['business'].x.size(0):
                    graph['business'].x[business_idx] += test_features[i, 14:22]
            graph['review'].x = torch.cat([graph['review'].x, test_features[:, 22:38]], dim=0)
            graph['review'].y = torch.cat([graph['review'].y, test_y], dim=0)

            graph['user', 'writes', 'review'].edge_index = torch.cat([
                graph['user', 'writes', 'review'].edge_index,
                torch.stack([new_user_indices, new_review_indices], dim=0)], dim=1)
            graph['review', 'about', 'business'].edge_index = torch.cat([
                graph['review', 'about', 'business'].edge_index,
                torch.stack([new_review_indices, new_business_indices], dim=0)], dim=1)

            all_time_since_user = torch.cat([time_since_user, test_time_since_user])
            all_time_since_business = torch.cat([time_since_business, test_time_since_business])
            all_user_indices = torch.cat([user_indices, new_user_indices])
            all_business_indices = torch.cat([business_indices, new_business_indices])
            all_review_indices = torch.cat([review_indices, new_review_indices])

            user_writes_edge = graph['user', 'writes', 'review'].edge_index
            review_about_edge = graph['review', 'about', 'business'].edge_index

            edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
                all_time_since_user[user_writes_edge[0]], all_time_since_user[user_writes_edge[1]],
                all_user_indices[user_writes_edge[0]], all_user_indices[user_writes_edge[0]]
            )
            edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
                all_time_since_business[review_about_edge[0]], all_time_since_business[review_about_edge[1]],
                torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
            )

            self.num_users = total_users
            self.num_businesses = total_businesses

            test_user_time_since = self.test_df.groupby('user_id')['time_since_last_review_user'].min().reindex(
                pd.Index(list(self.df['user_id'].unique()) + list(self.test_df['user_id'].unique())), fill_value=0).values
            time_since_dict['user'] = torch.tensor(test_user_time_since[:total_users], dtype=torch.float, device=self.device)

            spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)

            self.model.eval()
            with torch.no_grad():
                out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
                pred_labels = (out.squeeze() > 0.5).float()
                probs = out.squeeze().detach().cpu().numpy()
                test_metrics = self.compute_metrics(graph['review'].y[-num_test_rows:].cpu().numpy(), pred_labels[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:], prefix="test_")
                train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels[:num_rows].cpu().numpy(), probs[:num_rows], prefix="train_")
                logger.info(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1: {test_metrics['test_f1']:.4f}, AUC-ROC: {test_metrics['test_auc_roc']:.4f}")

            # Save metrics to file
            metrics_file = Path(self.output_path) / f"metrics_{timestamp}.txt"
            with open(metrics_file, 'w') as f:
                f.write("Training Metrics (Final Epoch):\n")
                for k, v in train_metrics.items():
                    f.write(f"{k}: {v}\n")
                f.write("\nTest Metrics:\n")
                for k, v in test_metrics.items():
                    f.write(f"{k}: {v}\n")

            # Plotting and saving to output_path
            plt.figure(figsize=(12, 8))
            plt.plot([m['loss'] for m in train_metrics_history], label='Training Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.title('Training Loss Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"loss_curve_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(12, 8))
            plt.plot([m['train_accuracy'] for m in train_metrics_history], label='Training Accuracy')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.title('Training Accuracy Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"accuracy_curve_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(12, 8))
            plt.plot([m['train_precision'] for m in train_metrics_history], label='Training Precision')
            plt.plot([m['train_recall'] for m in train_metrics_history], label='Training Recall')
            plt.plot([m['train_f1'] for m in train_metrics_history], label='Training F1-Score')
            plt.xlabel('Epoch')
            plt.ylabel('Score')
            plt.title('Training Precision, Recall, and F1-Score Curves')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"prf1_curves_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(12, 8))
            plt.plot([m['train_auc_roc'] for m in train_metrics_history], label='Training AUC-ROC')
            plt.xlabel('Epoch')
            plt.ylabel('AUC-ROC')
            plt.title('Training AUC-ROC Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"auc_roc_curve_train_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(8, 6))
            sns.heatmap(test_metrics['test_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Test Confusion Matrix')
            plt.savefig(Path(self.output_path) / f"confusion_matrix_test_{timestamp}.png")
            plt.close()

            fpr, tpr, _ = roc_curve(graph['review'].y[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:])
            plt.figure(figsize=(10, 6))
            plt.plot(fpr, tpr, label=f'Test ROC Curve (AUC = {test_metrics["test_auc_roc"]:.4f})')
            plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Test ROC Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"roc_curve_test_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(8, 6))
            sns.heatmap(train_metrics['train_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Training Confusion Matrix (Final Epoch)')
            plt.savefig(Path(self.output_path) / f"confusion_matrix_train_{timestamp}.png")
            plt.close()

            fpr_train, tpr_train, _ = roc_curve(graph['review'].y[:num_rows].cpu().numpy(), probs[:num_rows])
            plt.figure(figsize=(10, 6))
            plt.plot(fpr_train, tpr_train, label=f'Training ROC Curve (AUC = {train_metrics["train_auc_roc"]:.4f})')
            plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Training ROC Curve (Final Epoch)')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"roc_curve_train_{timestamp}.png")
            plt.close()

            logger.info(f"All metrics, plots, and model saved to {self.output_path}")
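As a quick illustration of the temporal edge features used above, the toy check below calls create_temporal_edge_features on two made-up edges (one where source and target belong to the same user, one where they do not). All values are invented for illustration; only the shape and the burst term behaviour are being demonstrated.

# Toy check of create_temporal_edge_features; the tensors are made-up values
import torch

t_src = torch.tensor([0.0, 3600.0])     # time-since values for the source nodes (seconds assumed)
t_tgt = torch.tensor([1800.0, 0.0])     # time-since values for the target nodes
users_i = torch.tensor([7, 7])
users_j = torch.tensor([7, 9])          # second edge pairs two different users, so its burst term is 0
feats = create_temporal_edge_features(t_src, t_tgt, users_i, users_j)
print(feats.shape)                      # torch.Size([2, 4]): hour, day, week scales plus the burst feature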
src/.ipynb_checkpoints/model_trainer-checkpoint.py
ADDED
@@ -0,0 +1,35 @@
from src.model import GraphformerModel
from pathlib import Path
from loguru import logger


class ModelTrainer:
    def __init__(self, df, output_path, epochs=100, test_size=0.3):
        self.df = df
        self.output_path = output_path
        self.epochs = epochs
        self.test_size = test_size

        # Create output directory
        Path(self.output_path).mkdir(parents=True, exist_ok=True)

        # Initialize the HeteroGraphormerModel
        self.model = GraphformerModel(df=self.df, output_path=self.output_path, epochs=self.epochs, test_size=self.test_size)

        logger.info(f"Initialized ModelTrainer with output_path: {self.output_path} and epochs: {self.epochs}")

    def train_and_evaluate(self):
        try:
            logger.info("Starting model training and evaluation")
            self.model.run_model()
            logger.info("GraphformerModel training and evaluation completed successfully")
        except Exception as e:
            logger.error(f"Error during GraphformerModel training and evaluation: {e}")
            raise
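An end-to-end call through this wrapper might look like the sketch below; the feature CSV is an assumption about the upstream pipeline's output, and the artifacts directory is illustrative.

# Hypothetical end-to-end call; the CSV path and output directory are assumptions
import pandas as pd

df = pd.read_csv("data/preprocessed_cleaned.csv")
trainer = ModelTrainer(df, output_path="artifacts", epochs=100, test_size=0.3)
trainer.train_and_evaluate()   # metrics, curves and the .pth checkpoint land in artifacts/GraphformerModel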
src/.ipynb_checkpoints/preprocessing-checkpoint.py
ADDED
@@ -0,0 +1,831 @@
from loguru import logger
import pandas as pd
import json
from datetime import datetime
import ast
import numpy as np
from pymongo import MongoClient
from collections import defaultdict

from tqdm import tqdm
import time

import requests
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import re
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

class Preprocessor:
    def __init__(self, df):
        self.df = df
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base')
        self.stop_words = set(stopwords.words('english'))
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def get_bert_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def preprocess_text(self, text):
        return text if pd.notna(text) else ""

    def calculate_duration(self, time_range):
        if not isinstance(time_range, str) or "-" not in time_range:
            return None
        start_str, end_str = time_range.split('-')
        start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
        end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
        try:
            start = datetime.strptime(start_str, '%H:%M')
            end = datetime.strptime(end_str, '%H:%M')
            duration = (end - start).total_seconds() / 3600
            return duration if duration >= 0 else duration + 24
        except ValueError:
            return None

    def calculate_sentiment_severity(self, text):
        if pd.isna(text) or not text.strip():
            return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})

        # Get sentiment polarity (-1 to 1)
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity

        # Define severity weights
        good_weight = 0.7
        bad_weight = 0.3

        if polarity > 0:
            good_severity = good_weight * polarity
            bad_severity = 0.0
        elif polarity < 0:
            good_severity = 0.0
            bad_severity = bad_weight * abs(polarity)
        else:  # Neutral (polarity = 0)
            good_severity = 0.0
            bad_severity = 0.0

        return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})

    def get_avg_duration(self, hours_str):
        if pd.isna(hours_str) or not isinstance(hours_str, str):
            return pd.NA
        try:
            hours_dict = ast.literal_eval(hours_str)
            if not hours_dict:
                return pd.NA
            durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
            valid_durations = [d for d in durations if d is not None]
            return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
        except (ValueError, SyntaxError, ZeroDivisionError):
            return pd.NA

    def calculate_time_since_last_review(self):
        present_date = datetime.now()
        user_latest_timestamp = {}

        # Convert review_date to datetime
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Calculate hours difference for each user's latest review
        for user_id in self.df["user_id"].unique():
            latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()

            if not isinstance(latest_date, datetime):
                latest_date = latest_date.to_pydatetime()

            hours_difference = (present_date - latest_date).total_seconds() / 3600
            user_latest_timestamp[user_id] = hours_difference

        # Map the hours difference to a new column
        self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)

    def calculate_time_since_last_review_business(self):
        present_date = datetime.now()

        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Initialize dictionary to store hours since last review for each business
        business_latest_timestamp = {}

        # Iterate over unique business_ids
        for business_id in self.df["business_id"].unique():
            # Get the latest review date for this business
            latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()

            # Convert to datetime object if needed
            if not isinstance(latest_date, datetime):
                latest_date = latest_date.to_pydatetime()

            # Calculate hours difference (already in hours)
            hours_difference = (present_date - latest_date).total_seconds() / 3600
            business_latest_timestamp[business_id] = hours_difference

        # Map the hours difference to the new column
        self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)

    def calculate_user_account_age(self):
|
154 |
+
present_date = datetime.now()
|
155 |
+
|
156 |
+
# Convert yelping_since to datetime
|
157 |
+
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
|
158 |
+
|
159 |
+
# Calculate user account age in days
|
160 |
+
self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
|
161 |
+
|
162 |
+
|
163 |
+
def calculate_avg_time_between_reviews(self):
|
164 |
+
# Ensure review_date is in datetime format
|
165 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
166 |
+
|
167 |
+
# Sort the DataFrame by user_id and review_date to ensure chronological order
|
168 |
+
self.df = self.df.sort_values(["user_id", "review_date"])
|
169 |
+
|
170 |
+
# Define helper function to calculate average time between reviews
|
171 |
+
def calculate_avg_time(group):
|
172 |
+
if len(group) == 1:
|
173 |
+
return 0 # If only one review, assign 0
|
174 |
+
# Calculate differences in hours between consecutive reviews
|
175 |
+
diffs = group["review_date"].diff().dt.total_seconds() / 3600
|
176 |
+
# Drop the first NaN (from diff) and compute the mean
|
177 |
+
return diffs.dropna().mean()
|
178 |
+
|
179 |
+
# Apply the function to each user_id group and create a mapping
|
180 |
+
avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)
|
181 |
+
|
182 |
+
# Map the average time back to the original DataFrame
|
183 |
+
self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)
|
184 |
+
|
185 |
+
|
186 |
+
def calculate_user_degree(self):
|
187 |
+
# Calculate the number of unique businesses per user
|
188 |
+
user_business_counts = self.df.groupby("user_id")["business_id"].nunique()
|
189 |
+
|
190 |
+
# Map the counts back to the original DataFrame
|
191 |
+
self.df["user_degree"] = self.df["user_id"].map(user_business_counts)
|
192 |
+
|
193 |
+
|
194 |
+
def calculate_business_degree(self):
|
195 |
+
# Calculate the number of unique users per business
|
196 |
+
business_user_counts = self.df.groupby("business_id")["user_id"].nunique()
|
197 |
+
|
198 |
+
# Map the counts back to the original DataFrame
|
199 |
+
self.df["business_degree"] = self.df["business_id"].map(business_user_counts)
|
200 |
+
|
201 |
+
|
202 |
+
def calculate_rating_variance_user(self):
|
203 |
+
# Calculate the mode (most frequent rating) per user
|
204 |
+
user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])
|
205 |
+
|
206 |
+
# Map the most frequent rating back to the original DataFrame
|
207 |
+
self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)
|
208 |
+
|
209 |
+
|
210 |
+
def calculate_user_review_burst_count(self):
|
211 |
+
# Ensure review_date is in datetime format
|
212 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
213 |
+
|
214 |
+
# Sort by user_id and review_date for chronological order
|
215 |
+
self.df = self.df.sort_values(["user_id", "review_date"])
|
216 |
+
|
217 |
+
# Function to calculate the max number of reviews in any 20-day window
|
218 |
+
def calculate_burst_count(group):
|
219 |
+
if len(group) <= 1:
|
220 |
+
return 0 # No burst if 1 or fewer reviews
|
221 |
+
|
222 |
+
# Convert review_date to a Series for rolling window
|
223 |
+
dates = group["review_date"]
|
224 |
+
|
225 |
+
# Calculate the number of reviews within 20 days of each review
|
226 |
+
burst_counts = []
|
227 |
+
for i, date in enumerate(dates):
|
228 |
+
# Count reviews within 20 days after this date
|
229 |
+
window_end = date + pd.Timedelta(days=20)
|
230 |
+
count = ((dates >= date) & (dates <= window_end)).sum()
|
231 |
+
burst_counts.append(count)
|
232 |
+
|
233 |
+
# Return the maximum burst count for this user
|
234 |
+
return max(burst_counts)
|
235 |
+
|
236 |
+
# Calculate the burst count per user
|
237 |
+
user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)
|
238 |
+
|
239 |
+
# Map the burst count back to the original DataFrame
|
240 |
+
self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)
|
241 |
+
|
242 |
+
|
243 |
+
def calculate_business_review_burst_count(self):
|
244 |
+
# Ensure review_date is in datetime format
|
245 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
246 |
+
|
247 |
+
# Sort by business_id and review_date for chronological order
|
248 |
+
self.df = self.df.sort_values(["business_id", "review_date"])
|
249 |
+
|
250 |
+
# Function to calculate the max number of reviews in any 10-day window
|
251 |
+
def calculate_burst_count(group):
|
252 |
+
if len(group) <= 1:
|
253 |
+
return 0 # No burst if 1 or fewer reviews
|
254 |
+
|
255 |
+
# Convert review_date to a Series for rolling window
|
256 |
+
dates = group["review_date"]
|
257 |
+
|
258 |
+
# Calculate the number of reviews within 10 days of each review
|
259 |
+
burst_counts = []
|
260 |
+
for i, date in enumerate(dates):
|
261 |
+
# Count reviews within 10 days after this date
|
262 |
+
window_end = date + pd.Timedelta(days=10)
|
263 |
+
count = ((dates >= date) & (dates <= window_end)).sum()
|
264 |
+
burst_counts.append(count)
|
265 |
+
|
266 |
+
# Return the maximum burst count for this business
|
267 |
+
return max(burst_counts)
|
268 |
+
|
269 |
+
# Calculate the burst count per business
|
270 |
+
business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)
|
271 |
+
|
272 |
+
# Map the burst count back to the original DataFrame
|
273 |
+
self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)
|
274 |
+
|
275 |
+
|
276 |
+
def calculate_temporal_similarity(self):
|
277 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
278 |
+
|
279 |
+
# Extract the day of the week (0 = Monday, 6 = Sunday)
|
280 |
+
self.df["day_of_week"] = self.df["review_date"].dt.dayofweek
|
281 |
+
|
282 |
+
# Function to calculate avg hours between reviews on frequent days
|
283 |
+
def calculate_avg_hours_on_frequent_days(group):
|
284 |
+
frequent_days = group["day_of_week"].mode().tolist()
|
285 |
+
|
286 |
+
if len(group) <= 1:
|
287 |
+
return 0
|
288 |
+
|
289 |
+
frequent_reviews = group[group["day_of_week"].isin(frequent_days)]
|
290 |
+
|
291 |
+
if len(frequent_reviews) <= 1:
|
292 |
+
return 0
|
293 |
+
|
294 |
+
frequent_reviews = frequent_reviews.sort_values("review_date")
|
295 |
+
diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600
|
296 |
+
|
297 |
+
return diffs.dropna().mean()
|
298 |
+
|
299 |
+
# Calculate average hours for each user
|
300 |
+
avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)
|
301 |
+
|
302 |
+
# Map the average hours to the new column
|
303 |
+
self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)
|
304 |
+
|
305 |
+
# Drop temporary column
|
306 |
+
self.df = self.df.drop(columns=["day_of_week"])
|
307 |
+
|
308 |
+
|
309 |
+
def calculate_rating_deviation_from_business_average(self):
|
310 |
+
# Calculate the average rating per business
|
311 |
+
business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()
|
312 |
+
|
313 |
+
# Map the average rating to each row
|
314 |
+
self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)
|
315 |
+
|
316 |
+
# Calculate the deviation from the business average
|
317 |
+
self.df["rating_deviation_from_business_average"] = (
|
318 |
+
self.df["review_stars"] - self.df["business_avg_rating"]
|
319 |
+
)
|
320 |
+
|
321 |
+
# Drop the temporary column
|
322 |
+
self.df = self.df.drop(columns=["business_avg_rating"])
|
323 |
+
|
324 |
+
def calculate_review_like_ratio(self):
|
325 |
+
# Create a binary column for liked reviews (stars >= 4)
|
326 |
+
self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)
|
327 |
+
|
328 |
+
# Calculate the like ratio per user
|
329 |
+
user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()
|
330 |
+
|
331 |
+
# Map the like ratio back to the DataFrame
|
332 |
+
self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)
|
333 |
+
|
334 |
+
# Drop the temporary column
|
335 |
+
self.df = self.df.drop(columns=["is_liked"])
|
336 |
+
|
337 |
+
def calculate_latest_checkin_hours(self):
|
338 |
+
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
|
339 |
+
|
340 |
+
# Function to get the latest check-in date from a list of strings
|
341 |
+
def get_latest_checkin(checkin_list):
|
342 |
+
if not checkin_list or pd.isna(checkin_list): # Handle empty or NaN
|
343 |
+
return None
|
344 |
+
if isinstance(checkin_list, str):
|
345 |
+
checkin_dates = checkin_list.split(", ")
|
346 |
+
else:
|
347 |
+
checkin_dates = checkin_list
|
348 |
+
return pd.to_datetime(checkin_dates).max()
|
349 |
+
|
350 |
+
# Apply the function to get the latest check-in date per row
|
351 |
+
self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)
|
352 |
+
|
353 |
+
# Calculate the hours difference between latest check-in and yelping_since
|
354 |
+
self.df["latest_checkin_hours"] = (
|
355 |
+
(self.df["latest_checkin_date"] - self.df["yelping_since"])
|
356 |
+
.dt.total_seconds() / 3600
|
357 |
+
)
|
358 |
+
|
359 |
+
# Drop the temporary column
|
360 |
+
self.df = self.df.drop(columns=["latest_checkin_date"])
|
361 |
+
self.df["latest_checkin_hours"].fillna(0,inplace=True)
|
362 |
+
|
363 |
+
|
364 |
+
def compute_pronoun_density(self, text):
|
365 |
+
text = self.preprocess_text(text)
|
366 |
+
if not text:
|
367 |
+
return 0
|
368 |
+
words = word_tokenize(text.lower())
|
369 |
+
pos_tags = nltk.pos_tag(words)
|
370 |
+
pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
|
371 |
+
return pronouns / len(words) if words else 0
|
372 |
+
|
373 |
+
def compute_avg_sentence_length(self, text):
|
374 |
+
text = self.preprocess_text(text)
|
375 |
+
if not text:
|
376 |
+
return 0
|
377 |
+
sentences = sent_tokenize(text)
|
378 |
+
return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0
|
379 |
+
|
380 |
+
def compute_excessive_punctuation(self, text):
|
381 |
+
text = self.preprocess_text(text)
|
382 |
+
return len(re.findall(r'[!?.]{2,}', text))
|
383 |
+
|
384 |
+
def compute_sentiment_polarity(self, text):
|
385 |
+
text = self.preprocess_text(text)
|
386 |
+
return TextBlob(text).sentiment.polarity if text else 0
|
387 |
+
|
388 |
+
def compute_code_switching_flag(self, text):
|
389 |
+
text = self.preprocess_text(text)
|
390 |
+
if not text:
|
391 |
+
return 0
|
392 |
+
|
393 |
+
tokens = self.tokenizer.tokenize(text.lower())
|
394 |
+
if not tokens:
|
395 |
+
return 0
|
396 |
+
|
397 |
+
english_words = self.stop_words # Use self.stop_words from __init__
|
398 |
+
token_set = set(tokens)
|
399 |
+
english_count = sum(1 for token in tokens if token in english_words)
|
400 |
+
|
401 |
+
non_english_pattern = re.compile(r'[^\x00-\x7F]')
|
402 |
+
has_non_ascii = 1 if non_english_pattern.search(text) else 0
|
403 |
+
|
404 |
+
english_ratio = english_count / len(tokens) if tokens else 0
|
405 |
+
|
406 |
+
non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)
|
407 |
+
|
408 |
+
# Flag as code-switching if:
|
409 |
+
# 1. Mixed English presence (ratio between 0.1 and 0.9)
|
410 |
+
# 2. Non-ASCII characters present OR some non-English subword tokens
|
411 |
+
if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
|
412 |
+
return 1
|
413 |
+
return 0
|
414 |
+
|
415 |
+
|
416 |
+
def batch_tokenize(self, texts, batch_size=32, max_length=512):
|
417 |
+
tokenized_outputs = []
|
418 |
+
for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
|
419 |
+
batch_texts = texts[i:i + batch_size]
|
420 |
+
valid_texts = [self.preprocess_text(t) for t in batch_texts]
|
421 |
+
# Tokenize with fixed max_length to ensure consistent tensor sizes
|
422 |
+
inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
|
423 |
+
tokenized_outputs.append(inputs['input_ids'].to(self.device)) # Move to GPU
|
424 |
+
# Concatenate on GPU with consistent sizes
|
425 |
+
return torch.cat(tokenized_outputs, dim=0)
|
426 |
+
|
427 |
+
def compute_grammar_error_score(self, texts, tokenized_ids):
|
428 |
+
print("Computing grammar error scores...")
|
429 |
+
error_scores = np.zeros(len(texts), dtype=float)
|
430 |
+
|
431 |
+
vocab_set = set(self.tokenizer.get_vocab().keys())
|
432 |
+
for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
|
433 |
+
if input_ids.sum() == 0: # Empty input
|
434 |
+
continue
|
435 |
+
tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
|
436 |
+
unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
|
437 |
+
total_count = len([t for t in tokens if t not in self.stop_words])
|
438 |
+
error_scores[i] = unknown_count / total_count if total_count > 0 else 0
|
439 |
+
|
440 |
+
return error_scores
|
441 |
+
|
442 |
+
def compute_repetitive_words_count(self, texts, tokenized_ids):
|
443 |
+
print("Computing repetitive words counts...")
|
444 |
+
rep_counts = np.zeros(len(texts), dtype=int)
|
445 |
+
|
446 |
+
for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
|
447 |
+
if input_ids.sum() == 0: # Empty input
|
448 |
+
continue
|
449 |
+
tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
|
450 |
+
valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
|
451 |
+
if valid_tokens:
|
452 |
+
token_counts = {}
|
453 |
+
for token in valid_tokens:
|
454 |
+
token_counts[token] = token_counts.get(token, 0) + 1
|
455 |
+
rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)
|
456 |
+
|
457 |
+
return rep_counts
|
458 |
+
|
459 |
+
def preprocess_text_for_similarity(self, text):
|
460 |
+
if pd.isna(text) or not text.strip():
|
461 |
+
return []
|
462 |
+
return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]
|
463 |
+
|
464 |
+
def batch_encode_words(self, texts, batch_size=32, max_length=512):
|
465 |
+
word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
|
466 |
+
vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}
|
467 |
+
|
468 |
+
encoded_batches = []
|
469 |
+
for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
|
470 |
+
batch_words = word_lists[i:i + batch_size]
|
471 |
+
encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
|
472 |
+
for j, words in enumerate(batch_words):
|
473 |
+
if words:
|
474 |
+
word_ids = [vocab.get(w, 0) for w in words][:max_length]
|
475 |
+
encoded[j, :len(word_ids)] = word_ids
|
476 |
+
encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
|
477 |
+
encoded_batches.append(encoded_tensor)
|
478 |
+
|
479 |
+
return torch.cat(encoded_batches, dim=0), vocab
|
480 |
+
|
481 |
+
def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
|
482 |
+
all_texts = self.df["review_text"].tolist()
|
483 |
+
all_users = self.df["user_id"].tolist()
|
484 |
+
all_review_ids = self.df["review_id"].tolist()
|
485 |
+
|
486 |
+
encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)
|
487 |
+
|
488 |
+
similarity_scores = {rid: 0.0 for rid in all_review_ids} # Default scores
|
489 |
+
for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
|
490 |
+
if pd.isna(review_id) or pd.isna(user_id):
|
491 |
+
continue
|
492 |
+
|
493 |
+
current_words = encoded_words[i]
|
494 |
+
if current_words.sum() == 0:
|
495 |
+
continue
|
496 |
+
|
497 |
+
other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
|
498 |
+
dtype=torch.long).to(self.device)
|
499 |
+
if not other_indices.numel():
|
500 |
+
continue
|
501 |
+
|
502 |
+
other_words = encoded_words[other_indices]
|
503 |
+
current_set = torch.unique(current_words[current_words > 0])
|
504 |
+
other_flat = other_words[other_words > 0]
|
505 |
+
|
506 |
+
if other_flat.numel() == 0:
|
507 |
+
continue
|
508 |
+
|
509 |
+
other_set = torch.unique(other_flat)
|
510 |
+
intersection = torch.sum(torch.isin(current_set, other_set)).float()
|
511 |
+
union = torch.unique(torch.cat([current_set, other_set])).numel()
|
512 |
+
similarity = intersection / union if union > 0 else 0.0
|
513 |
+
|
514 |
+
similarity_scores[review_id] = float(similarity)  # works for both the tensor result and the 0.0 fallback
|
515 |
+
return pd.Series(similarity_scores, index=all_review_ids)
|
516 |
+
|
517 |
+
def calculate_friend_count(self):
|
518 |
+
friends = []
|
519 |
+
for v in self.df["friends"]:
|
520 |
+
if isinstance(v, str):
|
521 |
+
friends.append(len(v.split(",")))
|
522 |
+
else:  # NaN or any non-string value contributes zero friends, keeping the list aligned with the DataFrame
|
523 |
+
friends.append(0)
|
524 |
+
self.df["friends"] = friends
|
525 |
+
|
526 |
+
def count_elite_years(self, elite):
|
527 |
+
if pd.isna(elite):
|
528 |
+
return 0
|
529 |
+
return len(str(elite).split(","))
|
530 |
+
|
531 |
+
def transform_elite_status(self):
|
532 |
+
self.df["elite"] = self.df["elite"].apply(lambda x: True if self.count_elite_years(x) > 1 else False)
|
533 |
+
|
534 |
+
def calculate_review_useful_funny_cool(self):
|
535 |
+
self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
|
536 |
+
self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
|
537 |
+
self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
|
538 |
+
self.df["review_useful_funny_cool"] = (
|
539 |
+
self.df["review_useful"] +
|
540 |
+
self.df["review_funny"] +
|
541 |
+
self.df["review_cool"]
|
542 |
+
)
|
543 |
+
self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)
|
544 |
+
|
545 |
+
|
546 |
+
def calculate_user_useful_funny_cool(self):
|
547 |
+
self.df["user_useful_funny_cool"] = (
|
548 |
+
self.df["user_useful"] +
|
549 |
+
self.df["user_funny"] +
|
550 |
+
self.df["user_cool"]
|
551 |
+
)
|
552 |
+
self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)
|
553 |
+
|
554 |
+
def compute_fake_score(self, row):
|
555 |
+
suspicion_points = 0
|
556 |
+
|
557 |
+
# Linguistic Features
|
558 |
+
if row["pronoun_density"] < 0.01: # Low personal engagement
|
559 |
+
suspicion_points += 1
|
560 |
+
if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30: # Extreme lengths
|
561 |
+
suspicion_points += 1
|
562 |
+
if row["grammar_error_score"] > 5: # Many errors
|
563 |
+
suspicion_points += 1
|
564 |
+
if row["repetitive_words_count"] > 5: # High repetition
|
565 |
+
suspicion_points += 1
|
566 |
+
if row["code_switching_flag"] == 1: # Language mixing
|
567 |
+
suspicion_points += 1
|
568 |
+
if row["excessive_punctuation_count"] > 3: # Overuse of punctuation
|
569 |
+
suspicion_points += 1
|
570 |
+
if abs(row["sentiment_polarity"]) > 0.8: # Extreme sentiment
|
571 |
+
suspicion_points += 1
|
572 |
+
|
573 |
+
# Review Patterns
|
574 |
+
if row["similarity_to_other_reviews"] > 0.8: # High duplication
|
575 |
+
suspicion_points += 1
|
576 |
+
if row["user_review_burst_count"] > 5: # Spammy bursts
|
577 |
+
suspicion_points += 1
|
578 |
+
if row["business_review_burst_count"] > 5: # Targeted bursts
|
579 |
+
suspicion_points += 1
|
580 |
+
if abs(row["rating_deviation_from_business_average"]) > 2: # Large rating deviation
|
581 |
+
suspicion_points += 1
|
582 |
+
if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1: # Extreme like ratio
|
583 |
+
suspicion_points += 1
|
584 |
+
|
585 |
+
# User Behavior
|
586 |
+
if row["user_account_age"] < 30: # Very new account (days)
|
587 |
+
suspicion_points += 1
|
588 |
+
if row["average_time_between_reviews"] < 24: # Rapid reviews (hours)
|
589 |
+
suspicion_points += 1
|
590 |
+
if row["user_degree"] < 2: # Low business interaction
|
591 |
+
suspicion_points += 1
|
592 |
+
if row["time_since_last_review_user"] < 24: # Recent burst (hours)
|
593 |
+
suspicion_points += 1
|
594 |
+
|
595 |
+
# Threshold: 3 or more points = fake
|
596 |
+
return 1 if suspicion_points >= 3 else 0
|
597 |
+
|
598 |
+
|
599 |
+
def run_pipeline(self):
|
600 |
+
|
601 |
+
|
602 |
+
|
603 |
+
logger.info("FINALYZING HOURS COLUMN ...")
|
604 |
+
self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
|
605 |
+
self.df["hours"] = self.df["hours"].fillna(0)
|
606 |
+
print(self.df["hours"][:10])
|
607 |
+
print(self.df["hours"].isnull().sum())
|
608 |
+
|
609 |
+
|
610 |
+
|
611 |
+
|
612 |
+
logger.info("FINALYZING ATTRIBUTES COLUMN ...")
|
613 |
+
self.df.drop("attributes",axis=1,inplace=True)
|
614 |
+
|
615 |
+
|
616 |
+
|
617 |
+
logger.info("CREATING time_since_last_review_user COLUMN ...")
|
618 |
+
self.calculate_time_since_last_review()
|
619 |
+
print(np.unique(self.df["time_since_last_review_user"] ))
|
620 |
+
|
621 |
+
|
622 |
+
logger.info("CREATING time_since_last_review_business COLUMN ...")
|
623 |
+
self.calculate_time_since_last_review_business()
|
624 |
+
print(np.unique(self.df["time_since_last_review_business"] ))
|
625 |
+
|
626 |
+
|
627 |
+
|
628 |
+
logger.info("CREATING user_account_age COLUMN ...")
|
629 |
+
self.calculate_user_account_age()
|
630 |
+
print(np.unique(self.df["user_account_age"] ))
|
631 |
+
|
632 |
+
|
633 |
+
|
634 |
+
logger.info("CREATING average_time_between_reviews COLUMN ...")
|
635 |
+
self.calculate_avg_time_between_reviews()
|
636 |
+
print(np.unique(self.df["average_time_between_reviews"] ))
|
637 |
+
|
638 |
+
|
639 |
+
|
640 |
+
logger.info("CREATING user_degree COLUMN ...")
|
641 |
+
self.calculate_user_degree()
|
642 |
+
print(np.unique(self.df["user_degree"] ))
|
643 |
+
|
644 |
+
|
645 |
+
logger.info("CREATING business_degree COLUMN ...")
|
646 |
+
self.calculate_business_degree()
|
647 |
+
print(np.unique(self.df["business_degree"] ))
|
648 |
+
|
649 |
+
|
650 |
+
logger.info("CREATING rating_variance_user COLUMN ...")
|
651 |
+
self.calculate_rating_variance_user()
|
652 |
+
print(np.unique(self.df["rating_variance_user"] ))
|
653 |
+
|
654 |
+
|
655 |
+
|
656 |
+
logger.info("CREATING user_review_burst_count COLUMN ...")
|
657 |
+
self.calculate_user_review_burst_count()
|
658 |
+
print(np.unique(self.df["user_review_burst_count"] ))
|
659 |
+
|
660 |
+
|
661 |
+
logger.info("CREATING business_review_burst_count COLUMN ...")
|
662 |
+
self.calculate_business_review_burst_count()
|
663 |
+
print(np.unique(self.df["business_review_burst_count"] ))
|
664 |
+
|
665 |
+
|
666 |
+
|
667 |
+
logger.info("CREATING temporal_similarity COLUMN ...")
|
668 |
+
self.calculate_temporal_similarity()
|
669 |
+
print(np.unique(self.df["temporal_similarity"] ))
|
670 |
+
|
671 |
+
|
672 |
+
|
673 |
+
logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
|
674 |
+
self.calculate_rating_deviation_from_business_average()
|
675 |
+
print(np.unique(self.df["rating_deviation_from_business_average"] ))
|
676 |
+
|
677 |
+
|
678 |
+
|
679 |
+
logger.info("CREATING review_like_ratio COLUMN ...")
|
680 |
+
self.calculate_review_like_ratio()
|
681 |
+
print(np.unique(self.df["review_like_ratio"] ))
|
682 |
+
|
683 |
+
|
684 |
+
|
685 |
+
logger.info("CREATING latest_checkin_hours COLUMN ...")
|
686 |
+
self.calculate_latest_checkin_hours()
|
687 |
+
print(np.unique(self.df["latest_checkin_hours"] ))
|
688 |
+
|
689 |
+
|
690 |
+
|
691 |
+
|
692 |
+
logger.info("CREATING pronoun_density COLUMN ...")
|
693 |
+
self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
|
694 |
+
print(np.unique(self.df["pronoun_density"] ))
|
695 |
+
|
696 |
+
logger.info("CREATING avg_sentence_length COLUMN ...")
|
697 |
+
self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
|
698 |
+
print(np.unique(self.df["avg_sentence_length"] ))
|
699 |
+
|
700 |
+
logger.info("CREATING excessive_punctuation_count COLUMN ...")
|
701 |
+
self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
|
702 |
+
print(np.unique(self.df["excessive_punctuation_count"] ))
|
703 |
+
|
704 |
+
logger.info("CREATING sentiment_polarity COLUMN ...")
|
705 |
+
self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
|
706 |
+
print(np.unique(self.df["sentiment_polarity"] ))
|
707 |
+
|
708 |
+
logger.info("CREATING good_severity and bad_severity COLUMNS ...")
|
709 |
+
severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
|
710 |
+
self.df[["good_severity", "bad_severity"]] = severity_scores
|
711 |
+
print(np.unique(self.df["good_severity"] ))
|
712 |
+
print(np.unique(self.df["bad_severity"] ))
|
713 |
+
|
714 |
+
|
715 |
+
logger.info("CREATING code_switching_flag COLUMN ...")
|
716 |
+
self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
|
717 |
+
print(np.unique(self.df["code_switching_flag"] ))
|
718 |
+
|
719 |
+
|
720 |
+
all_texts = self.df["review_text"].tolist()
|
721 |
+
tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
|
722 |
+
|
723 |
+
logger.info("CREATING grammar_error_score COLUMN ...")
|
724 |
+
self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
|
725 |
+
print(np.unique(self.df["grammar_error_score"] ))
|
726 |
+
|
727 |
+
|
728 |
+
logger.info("CREATING repetitive_words_count COLUMN ...")
|
729 |
+
self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
|
730 |
+
print(np.unique(self.df["repetitive_words_count"] ))
|
731 |
+
|
732 |
+
|
733 |
+
|
734 |
+
logger.info("CREATING similarity_to_other_reviews COLUMN ...")
|
735 |
+
similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
|
736 |
+
self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
|
737 |
+
|
738 |
+
print(np.unique(self.df["similarity_to_other_reviews"] ))
|
739 |
+
|
740 |
+
|
741 |
+
|
742 |
+
logger.info("CREATING friends COLUMN ...")
|
743 |
+
self.calculate_friend_count()
|
744 |
+
print(self.df["friends"].value_counts())
|
745 |
+
|
746 |
+
logger.info("CREATING elite COLUMN ...")
|
747 |
+
self.transform_elite_status()
|
748 |
+
print(self.df["elite"].value_counts())
|
749 |
+
|
750 |
+
|
751 |
+
logger.info("CREATING review_useful_funny_cool COLUMN ...")
|
752 |
+
self.calculate_review_useful_funny_cool()
|
753 |
+
print(self.df["review_useful_funny_cool"].value_counts())
|
754 |
+
|
755 |
+
|
756 |
+
logger.info("CREATING user_useful_funny_cool COLUMN ...")
|
757 |
+
self.calculate_user_useful_funny_cool()
|
758 |
+
print(self.df["user_useful_funny_cool"].value_counts())
|
759 |
+
|
760 |
+
|
761 |
+
logger.info("CREATING LABEL COLUMN ...")
|
762 |
+
self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
|
763 |
+
print(self.df["fake"].value_counts())
|
764 |
+
|
765 |
+
|
766 |
+
logger.info("SEEING NULL VALUES IN FINAL COLUMNS.....")
|
767 |
+
print(set(self.df.isnull().sum().values))
|
768 |
+
for col in self.df.columns:
|
769 |
+
if self.df[col].isnull().sum()>0:
|
770 |
+
print(f" {col} has {self.df[col].isnull().sum()} null values")
|
771 |
+
|
772 |
+
|
773 |
+
|
774 |
+
return self.df
|
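Taken together, run_pipeline turns the merged review/business/user table into behavioural, linguistic and graph features plus the heuristic "fake" label. A minimal usage sketch under assumed paths (the CSV name and module import are illustrative, not part of the commit):

import pandas as pd
from src.preprocessing import Preprocessor  # assumed module path within this repo

merged_df = pd.read_csv("data/merged_dataset.csv")    # hypothetical output of create_dataset.py
features_df = Preprocessor(merged_df).run_pipeline()  # adds engineered columns and the heuristic "fake" label
print(features_df["fake"].value_counts())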
src/__pycache__/analyze_yelp_data.cpython-311.pyc ADDED - Binary file (20.6 kB)
src/__pycache__/clean_data.cpython-311.pyc ADDED - Binary file (6.21 kB)
src/__pycache__/clean_data.cpython-39.pyc ADDED - Binary file (3.03 kB)
src/__pycache__/create_dataset.cpython-311.pyc ADDED - Binary file (7.58 kB)
src/__pycache__/create_dataset.cpython-39.pyc ADDED - Binary file (3.98 kB)
src/__pycache__/data_balancing.cpython-311.pyc ADDED - Binary file (12.6 kB)
src/__pycache__/feature_analyzer.cpython-311.pyc ADDED - Binary file (17.3 kB)
src/__pycache__/feature_analyzer.cpython-39.pyc ADDED - Binary file (8.73 kB)
src/__pycache__/feature_importance.cpython-311.pyc ADDED - Binary file (10.6 kB)
src/__pycache__/model.cpython-311.pyc ADDED - Binary file (13.9 kB)
src/__pycache__/model.cpython-39.pyc ADDED - Binary file (17.6 kB)
src/__pycache__/model1.cpython-311.pyc ADDED - Binary file (42.9 kB)
src/__pycache__/model1.cpython-39.pyc ADDED - Binary file (17.2 kB)
src/__pycache__/model3.cpython-311.pyc ADDED - Binary file (44 kB)
src/__pycache__/model3.cpython-39.pyc ADDED - Binary file (17.6 kB)
src/__pycache__/model_trainer.cpython-311.pyc ADDED - Binary file (2.31 kB)
src/__pycache__/model_trainer.cpython-39.pyc ADDED - Binary file (1.32 kB)
src/__pycache__/models.cpython-311.pyc ADDED - Binary file (45.6 kB)
src/__pycache__/preprocessing.cpython-311.pyc ADDED - Binary file (50.7 kB)
src/__pycache__/preprocessing.cpython-39.pyc ADDED - Binary file (24.4 kB)
src/analyze_yelp_data.py
ADDED
@@ -0,0 +1,320 @@
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from transformers import AutoTokenizer, AutoModel
|
4 |
+
import torch
|
5 |
+
from sklearn.ensemble import IsolationForest
|
6 |
+
from sklearn.preprocessing import StandardScaler
|
7 |
+
from textblob import TextBlob
|
8 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
9 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
+
from sklearn.decomposition import PCA
|
11 |
+
import warnings
|
12 |
+
from typing import Dict, List, Tuple
|
13 |
+
import logging
|
14 |
+
from collections import Counter
|
15 |
+
from detoxify import Detoxify
|
16 |
+
import re
|
17 |
+
from datetime import datetime
|
18 |
+
import seaborn as sns
|
19 |
+
import matplotlib.pyplot as plt
|
20 |
+
from pathlib import Path
|
21 |
+
import json
|
22 |
+
|
23 |
+
class AdvancedYelpAnalyzer:
|
24 |
+
def __init__(self, df: pd.DataFrame):
|
25 |
+
"""Initialize the analyzer with necessary models and configurations"""
|
26 |
+
self.df = df.copy()
|
27 |
+
self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
28 |
+
self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
|
29 |
+
self.vader = SentimentIntensityAnalyzer()
|
30 |
+
self.toxic_model = Detoxify('original')
|
31 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
32 |
+
self.bert_model.to(self.device)
|
33 |
+
|
34 |
+
# Configure logging
|
35 |
+
logging.basicConfig(level=logging.INFO)
|
36 |
+
self.logger = logging.getLogger(__name__)
|
37 |
+
|
38 |
+
def get_bert_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
|
39 |
+
"""Generate BERT embeddings for text"""
|
40 |
+
embeddings = []
|
41 |
+
|
42 |
+
for i in range(0, len(texts), batch_size):
|
43 |
+
batch_texts = texts[i:i + batch_size]
|
44 |
+
encoded = self.bert_tokenizer(batch_texts,
|
45 |
+
padding=True,
|
46 |
+
truncation=True,
|
47 |
+
max_length=512,
|
48 |
+
return_tensors='pt')
|
49 |
+
|
50 |
+
with torch.no_grad():
|
51 |
+
encoded = {k: v.to(self.device) for k, v in encoded.items()}
|
52 |
+
outputs = self.bert_model(**encoded)
|
53 |
+
batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
54 |
+
embeddings.append(batch_embeddings)
|
55 |
+
|
56 |
+
return np.vstack(embeddings)
|
57 |
+
|
58 |
+
def analyze_sentiment(self) -> pd.DataFrame:
|
59 |
+
"""Perform comprehensive sentiment analysis using multiple tools"""
|
60 |
+
self.logger.info("Starting sentiment analysis...")
|
61 |
+
|
62 |
+
# Calculate BERT embeddings for reviews
|
63 |
+
self.logger.info("Calculating BERT embeddings...")
|
64 |
+
review_texts = self.df['review_text'].fillna('').tolist()
|
65 |
+
bert_embeddings = self.get_bert_embeddings(review_texts)
|
66 |
+
|
67 |
+
# Calculate review length using BERT tokenizer
|
68 |
+
self.logger.info("Calculating tokenized lengths...")
|
69 |
+
self.df['review_length'] = self.df['review_text'].apply(
|
70 |
+
lambda x: len(self.bert_tokenizer.encode(str(x)))
|
71 |
+
)
|
72 |
+
|
73 |
+
# Store BERT embeddings mean and std as features
|
74 |
+
self.df['bert_embedding_mean'] = np.mean(bert_embeddings, axis=1)
|
75 |
+
self.df['bert_embedding_std'] = np.std(bert_embeddings, axis=1)
|
76 |
+
|
77 |
+
# TextBlob sentiment and subjectivity
|
78 |
+
self.df['textblob_polarity'] = self.df['review_text'].apply(
|
79 |
+
lambda x: TextBlob(str(x)).sentiment.polarity
|
80 |
+
)
|
81 |
+
self.df['textblob_subjectivity'] = self.df['review_text'].apply(
|
82 |
+
lambda x: TextBlob(str(x)).sentiment.subjectivity
|
83 |
+
)
|
84 |
+
|
85 |
+
# VADER sentiment with custom negative phrase handling
|
86 |
+
def get_enhanced_vader_scores(text):
|
87 |
+
# Custom negative phrases
|
88 |
+
negative_phrases = [
|
89 |
+
'too long', 'way too long', 'waiting', 'changed our minds',
|
90 |
+
'too many', 'took forever', 'took too long', 'waste of time',
|
91 |
+
'not worth', 'disappointing', 'mediocre', 'needs improvement'
|
92 |
+
]
|
93 |
+
|
94 |
+
# Get base VADER scores
|
95 |
+
base_scores = self.vader.polarity_scores(str(text))
|
96 |
+
|
97 |
+
# Check for negative phrases
|
98 |
+
text_lower = str(text).lower()
|
99 |
+
neg_count = sum(1 for phrase in negative_phrases if phrase in text_lower)
|
100 |
+
|
101 |
+
# Adjust scores if negative phrases are found
|
102 |
+
if neg_count > 0:
|
103 |
+
base_scores['neg'] = max(base_scores['neg'], min(0.7, neg_count * 0.2))
|
104 |
+
base_scores['compound'] *= (1 - (neg_count * 0.15))
|
105 |
+
# Readjust neutral score
|
106 |
+
base_scores['neu'] = max(0, 1 - base_scores['neg'] - base_scores['pos'])
|
107 |
+
|
108 |
+
return base_scores
|
109 |
+
|
110 |
+
# Apply enhanced VADER scoring
|
111 |
+
vader_scores = self.df['review_text'].apply(get_enhanced_vader_scores)
|
112 |
+
self.df['vader_compound'] = vader_scores.apply(lambda x: x['compound'])
|
113 |
+
self.df['vader_negative'] = vader_scores.apply(lambda x: x['neg'])
|
114 |
+
self.df['vader_positive'] = vader_scores.apply(lambda x: x['pos'])
|
115 |
+
self.df['vader_neutral'] = vader_scores.apply(lambda x: x['neu'])
|
116 |
+
|
117 |
+
# Calculate sentiment extremity
|
118 |
+
self.df['sentiment_extremity'] = self.df['vader_compound'].abs()
|
119 |
+
|
120 |
+
return self.df
|
121 |
+
|
122 |
+
def detect_anomalies(self) -> pd.DataFrame:
|
123 |
+
"""Detect anomalous reviews using Isolation Forest with BERT features"""
|
124 |
+
self.logger.info("Detecting anomalies...")
|
125 |
+
|
126 |
+
# Prepare features for anomaly detection
|
127 |
+
features = [
|
128 |
+
'review_stars',
|
129 |
+
'textblob_polarity',
|
130 |
+
'vader_compound',
|
131 |
+
'sentiment_extremity',
|
132 |
+
'review_length',
|
133 |
+
'bert_embedding_mean',
|
134 |
+
'bert_embedding_std'
|
135 |
+
]
|
136 |
+
|
137 |
+
# Ensure all features exist
|
138 |
+
missing_features = [f for f in features if f not in self.df.columns]
|
139 |
+
if missing_features:
|
140 |
+
self.analyze_sentiment()
|
141 |
+
|
142 |
+
# Standardize features
|
143 |
+
scaler = StandardScaler()
|
144 |
+
X = scaler.fit_transform(self.df[features])
|
145 |
+
|
146 |
+
# Apply Isolation Forest
|
147 |
+
iso_forest = IsolationForest(
|
148 |
+
contamination=0.1,
|
149 |
+
random_state=42,
|
150 |
+
n_jobs=-1
|
151 |
+
)
|
152 |
+
|
153 |
+
# Fit and predict
|
154 |
+
self.df['is_anomaly'] = iso_forest.fit_predict(X)
|
155 |
+
self.df['anomaly_score'] = iso_forest.score_samples(X)
|
156 |
+
|
157 |
+
return self.df
|
158 |
+
|
159 |
+
def detect_ai_generated_text(self) -> pd.DataFrame:
|
160 |
+
"""Estimate likelihood of AI-generated content"""
|
161 |
+
self.logger.info("Detecting AI-generated content...")
|
162 |
+
|
163 |
+
# Ensure sentiment analysis has been run
|
164 |
+
if 'textblob_subjectivity' not in self.df.columns:
|
165 |
+
self.analyze_sentiment()
|
166 |
+
|
167 |
+
# Use detoxify model to get toxicity scores
|
168 |
+
texts = self.df['review_text'].fillna('').tolist()
|
169 |
+
toxic_scores = self.toxic_model.predict(texts)
|
170 |
+
|
171 |
+
# Add scores to DataFrame
|
172 |
+
toxic_score_types = ['toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
|
173 |
+
'insult', 'threat', 'sexual_explicit']
|
174 |
+
for score_type in toxic_score_types:
|
175 |
+
if score_type in toxic_scores:
|
176 |
+
self.df[f'toxic_{score_type}'] = toxic_scores[score_type]
|
177 |
+
|
178 |
+
# Calculate AI generation likelihood based on various factors
|
179 |
+
self.df['ai_generated_likelihood'] = (
|
180 |
+
(self.df['textblob_subjectivity'] < 0.3) & # Low subjectivity
|
181 |
+
(self.df['sentiment_extremity'] > 0.8) & # Extreme sentiment
|
182 |
+
(self.df['review_length'] > self.df['review_length'].quantile(0.95)) & # Unusually long
|
183 |
+
(self.df['bert_embedding_std'] < self.df['bert_embedding_std'].quantile(0.25)) # Unusual language patterns
|
184 |
+
).astype(int)
|
185 |
+
|
186 |
+
# Add additional AI detection features
|
187 |
+
self.df['ai_detection_score'] = (
|
188 |
+
(self.df['textblob_subjectivity'] * -1) + # Lower subjectivity increases score
|
189 |
+
(self.df['sentiment_extremity'] * 0.5) + # Extreme sentiment contributes somewhat
|
190 |
+
(self.df['bert_embedding_std'] * -0.5) # Lower variation in embeddings increases score
|
191 |
+
).clip(0, 1) # Normalize between 0 and 1
|
192 |
+
|
193 |
+
return self.df
|
194 |
+
|
195 |
+
def analyze_business_categories(self) -> Dict:
|
196 |
+
"""Analyze trends and patterns specific to business categories"""
|
197 |
+
self.logger.info("Analyzing business categories...")
|
198 |
+
|
199 |
+
# Extract categories
|
200 |
+
categories = self.df['categories'].fillna('').str.split(', ')
|
201 |
+
all_categories = [cat for cats in categories if isinstance(cats, list) for cat in cats]
|
202 |
+
category_counts = Counter(all_categories)
|
203 |
+
|
204 |
+
# Analyze reviews by category
|
205 |
+
category_analysis = {}
|
206 |
+
for category in set(all_categories):
|
207 |
+
category_reviews = self.df[self.df['categories'].str.contains(category, na=False, regex=False)]  # regex=False: category names may contain regex metacharacters
|
208 |
+
|
209 |
+
category_analysis[category] = {
|
210 |
+
'review_count': len(category_reviews),
|
211 |
+
'avg_rating': category_reviews['review_stars'].mean() if not category_reviews.empty else None,
|
212 |
+
'avg_sentiment': category_reviews['vader_compound'].mean() if 'vader_compound' in self.df.columns and not category_reviews.empty else None,
|
213 |
+
'avg_subjectivity': category_reviews['textblob_subjectivity'].mean() if 'textblob_subjectivity' in self.df.columns and not category_reviews.empty else None
|
214 |
+
}
|
215 |
+
|
216 |
+
return category_analysis
|
217 |
+
|
218 |
+
def visualize_results(self, output_dir: str):
|
219 |
+
"""Create visualizations for analysis results"""
|
220 |
+
plt.figure(figsize=(15, 10))
|
221 |
+
|
222 |
+
# Sentiment Distribution
|
223 |
+
plt.subplot(2, 2, 1)
|
224 |
+
sns.histplot(data=self.df, x='vader_compound', bins=50)
|
225 |
+
plt.title('Sentiment Distribution')
|
226 |
+
|
227 |
+
# Review Volume Over Time
|
228 |
+
plt.subplot(2, 2, 2)
|
229 |
+
daily_reviews = self.df.groupby('review_date').size()
|
230 |
+
daily_reviews.plot()
|
231 |
+
plt.title('Review Volume Over Time')
|
232 |
+
|
233 |
+
# Anomaly Score Distribution
|
234 |
+
plt.subplot(2, 2, 3)
|
235 |
+
if 'anomaly_score' not in self.df.columns:
|
236 |
+
self.detect_anomalies()
|
237 |
+
sns.histplot(data=self.df, x='anomaly_score', bins=50)
|
238 |
+
plt.title('Anomaly Score Distribution')
|
239 |
+
|
240 |
+
# AI Generation Likelihood
|
241 |
+
plt.subplot(2, 2, 4)
|
242 |
+
if 'ai_generated_likelihood' not in self.df.columns:
|
243 |
+
self.detect_ai_generated_text()
|
244 |
+
sns.histplot(data=self.df, x='ai_generated_likelihood', bins=2)
|
245 |
+
plt.title('AI Generation Likelihood')
|
246 |
+
|
247 |
+
plt.tight_layout()
|
248 |
+
plt.savefig(f'{output_dir}/analysis_results.png')
|
249 |
+
plt.close()
|
250 |
+
|
251 |
+
def run_full_analysis(self, output_dir: str) -> Tuple[pd.DataFrame, Dict]:
|
252 |
+
"""Run complete analysis pipeline with detailed outputs"""
|
253 |
+
self.logger.info("Starting full analysis pipeline...")
|
254 |
+
|
255 |
+
# Create output directory if it doesn't exist
|
256 |
+
output_dir = Path(output_dir)
|
257 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
258 |
+
|
259 |
+
try:
|
260 |
+
# Run all analyses
|
261 |
+
self.analyze_sentiment()
|
262 |
+
self.detect_anomalies()
|
263 |
+
self.detect_ai_generated_text()
|
264 |
+
category_analysis = self.analyze_business_categories()
|
265 |
+
|
266 |
+
# Create visualizations
|
267 |
+
self.visualize_results(str(output_dir))
|
268 |
+
|
269 |
+
# Compile results
|
270 |
+
analysis_results = {
|
271 |
+
'category_analysis': category_analysis,
|
272 |
+
'sentiment_summary': {
|
273 |
+
'avg_sentiment': self.df['vader_compound'].mean(),
|
274 |
+
'positive_reviews': len(self.df[self.df['vader_compound'] > 0.5]),
|
275 |
+
'negative_reviews': len(self.df[self.df['vader_compound'] < -0.5]),
|
276 |
+
'neutral_reviews': len(self.df[abs(self.df['vader_compound']) <= 0.5])
|
277 |
+
},
|
278 |
+
'ai_detection_summary': {
|
279 |
+
'likely_ai_generated': len(self.df[self.df['ai_generated_likelihood'] == 1]),
|
280 |
+
'avg_ai_score': self.df['ai_detection_score'].mean()
|
281 |
+
},
|
282 |
+
'anomaly_summary': {
|
283 |
+
'anomalous_reviews': len(self.df[self.df['is_anomaly'] == -1]),
|
284 |
+
'avg_anomaly_score': self.df['anomaly_score'].mean()
|
285 |
+
}
|
286 |
+
}
|
287 |
+
|
288 |
+
# Save results
|
289 |
+
self.df.to_csv(output_dir / "analyzed_data.csv", index=False)
|
290 |
+
with open(output_dir / "analysis_results.json", 'w') as f:
|
291 |
+
json.dump(analysis_results, f, indent=4, default=str)  # default=str: summary values may be numpy scalars, which json cannot serialize natively
|
292 |
+
|
293 |
+
return self.df, analysis_results
|
294 |
+
|
295 |
+
except Exception as e:
|
296 |
+
self.logger.error(f"Error during analysis: {str(e)}")
|
297 |
+
raise
|
298 |
+
|
299 |
+
# For testing
|
300 |
+
if __name__ == "__main__":
|
301 |
+
# Set up logging
|
302 |
+
logging.basicConfig(level=logging.INFO)
|
303 |
+
logger = logging.getLogger(__name__)
|
304 |
+
|
305 |
+
try:
|
306 |
+
# Read test data
|
307 |
+
df = pd.read_csv("test_data.csv")
|
308 |
+
|
309 |
+
# Initialize analyzer
|
310 |
+
analyzer = AdvancedYelpAnalyzer(df)
|
311 |
+
|
312 |
+
# Run analysis
|
313 |
+
output_dir = "output"
|
314 |
+
analyzed_df, results = analyzer.run_full_analysis(output_dir)
|
315 |
+
|
316 |
+
logger.info("Analysis completed successfully!")
|
317 |
+
|
318 |
+
except Exception as e:
|
319 |
+
logger.error(f"Error during testing: {str(e)}")
|
320 |
+
raise
|
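The enhanced VADER scoring in analyze_sentiment floors the negative share and damps the compound score whenever service-complaint phrases appear. A standalone sketch of that adjustment, assuming vaderSentiment is installed (the shortened phrase list and function name are illustrative only):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

NEGATIVE_PHRASES = ["too long", "waiting", "took forever", "not worth", "disappointing"]

def adjusted_vader(text: str) -> dict:
    # Base VADER scores, then the heuristic adjustment mirroring get_enhanced_vader_scores above
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    neg_count = sum(phrase in text.lower() for phrase in NEGATIVE_PHRASES)
    if neg_count:
        scores["neg"] = max(scores["neg"], min(0.7, neg_count * 0.2))  # floor the negative share
        scores["compound"] *= (1 - neg_count * 0.15)                   # damp the compound score
        scores["neu"] = max(0, 1 - scores["neg"] - scores["pos"])      # renormalise the neutral share
    return scores

# "Fine food, but it took forever and was not worth the price" matches two phrases,
# so neg is floored at 0.4 and the compound score shrinks by 30% relative to plain VADER.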
src/clean_data.py
ADDED
@@ -0,0 +1,83 @@
# clean_yelp_data.py
from loguru import logger
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import json
from pathlib import Path
import logging
from scipy.stats import entropy
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
import os


class DataCleaner:
    def __init__(self, df, output_path, filename="preprocessed_cleaned.csv"):
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self):
        # Persist the cleaned DataFrame to <output_path>/<filename>
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        output_file = Path(self.output_path) / self.filename
        logger.info(f"Files saved in directory {output_file} as: {self.filename}")
        self.df.to_csv(output_file, index=False)

    def dropping_unnecessary_columns(self):
        # Drop identifier, free-text and compliment columns that are not used as model features
        columns_to_drop = [
            "review_text", "review_date", "business_name", "address", "city", "state",
            "postal_code", "categories", "user_name", "yelping_since", "checkin_date",
            "review_useful", "review_funny", "review_cool", "user_useful", "user_funny",
            "user_cool", "is_open", "compliment_hot", "compliment_more", "compliment_profile",
            "compliment_cute", "compliment_list", "compliment_note", "compliment_plain",
            "compliment_cool", "compliment_funny", "compliment_writer", "compliment_photos",
        ]
        self.df.drop(columns=columns_to_drop, inplace=True)

    def run_pipeline(self):
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unnecessary_columns()

        logger.info("Checking Again for NULL values in Columns")
        for col in self.df.columns:
            if self.df[col].isnull().sum() > 0:
                print(f" {col} has {self.df[col].isnull().sum()} null values")

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()
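A minimal sketch of how DataCleaner appears intended to be used on the feature table produced by the Preprocessor (the import path and file names are hypothetical):

import pandas as pd
from src.clean_data import DataCleaner  # assumed module path within this repo

features_df = pd.read_csv("data/preprocessed.csv")  # hypothetical feature table from preprocessing.py
DataCleaner(features_df, output_path="data", filename="preprocessed_cleaned.csv").run_pipeline()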
src/create_dataset.py
ADDED
@@ -0,0 +1,217 @@
1 |
+
import pandas as pd
|
2 |
+
import ujson as json
|
3 |
+
import gc
|
4 |
+
import numpy as np
|
5 |
+
from concurrent.futures import ProcessPoolExecutor
|
6 |
+
import multiprocessing as mp
|
7 |
+
from pymongo import MongoClient
|
8 |
+
from collections import defaultdict
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
# def read_json_parallel(file_path, num_workers=None):
|
12 |
+
# """Read JSON file using parallel processing"""
|
13 |
+
# if num_workers is None:
|
14 |
+
# num_workers = max(1, mp.cpu_count() - 1)
|
15 |
+
|
16 |
+
# print(f"Reading {file_path}...")
|
17 |
+
# # Read chunks and concatenate them into a single DataFrame
|
18 |
+
# df = pd.read_json(file_path, lines=True, dtype_backend="pyarrow", chunksize=100000)
|
19 |
+
# return next(df)
|
20 |
+
|
21 |
+
|
22 |
+
def read_data_mongo(file_path, num_workers=None):
|
23 |
+
"""Read JSON file using parallel processing"""
|
24 |
+
if num_workers is None:
|
25 |
+
num_workers = max(1, mp.cpu_count() - 1)
|
26 |
+
|
27 |
+
print(f"Reading {file_path}...")
|
28 |
+
conn_str = "mongodb://Mtalha:[email protected]/"
|
29 |
+
|
30 |
+
client = MongoClient(conn_str)
|
31 |
+
databases = client.list_database_names()
|
32 |
+
db_client=client["Yelp"]
|
33 |
+
|
34 |
+
# Read the entire file at once since chunksize isn't needed for parallel reading here
|
35 |
+
# Use 'records' orient if your JSON was saved with this format
|
36 |
+
try:
|
37 |
+
|
38 |
+
collection = db_client[file_path]
|
39 |
+
documents = collection.find({}, {"_id": 0})
|
40 |
+
data = list(documents)
|
41 |
+
final_dict=defaultdict(list)
|
42 |
+
|
43 |
+
for dictt in data:
|
44 |
+
for k,v in dictt.items():
|
45 |
+
final_dict[k].append(v)
|
46 |
+
df=pd.DataFrame(final_dict)
|
47 |
+
|
48 |
+
# df = pd.read_json(file_path, orient='records', dtype_backend="pyarrow")
|
49 |
+
except Exception as e:
|
50 |
+
# If 'records' doesn't work, try without specifying orient or with 'split'
|
51 |
+
# This is a fallback for different JSON structures
|
52 |
+
# df = pd.read_json(file_path, dtype_backend="pyarrow")
|
53 |
+
print("ERROR WHILE READING FILES FORM MONGODB AS : ",e)
|
54 |
+
print(f"Finished reading. DataFrame shape: {df.shape}")
|
55 |
+
return df
|
56 |
+
|
57 |
+
def process_datasets(output_path,filename):
|
58 |
+
# File paths
|
59 |
+
file_paths = {
|
60 |
+
'business': "yelp_academic_dataset_business",
|
61 |
+
'checkin': "yelp_academic_dataset_checkin",
|
62 |
+
'review': "yelp_academic_dataset_review",
|
63 |
+
'tip': "yelp_academic_dataset_tip",
|
64 |
+
'user': "yelp_academic_dataset_user",
|
65 |
+
'google': "google_review_dataset"
|
66 |
+
}
|
67 |
+
|
68 |
+
# Read datasets with progress tracking
|
69 |
+
print("Reading datasets...")
|
70 |
+
dfs = {}
|
71 |
+
for name, path in file_paths.items():
|
72 |
+
print(f"Processing {name} dataset...")
|
73 |
+
dfs[name] = read_data_mongo(path)
|
74 |
+
print(f"Finished reading {name} dataset. Shape: {dfs[name].shape}")
|
75 |
+
|
76 |
+
print("All files read. Starting column renaming...")
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
# Rename columns to avoid conflicts
|
87 |
+
# Reviews
|
88 |
+
dfs['review'] = dfs['review'].rename(columns={
|
89 |
+
'date': 'review_date',
|
90 |
+
'stars': 'review_stars',
|
91 |
+
'text': 'review_text',
|
92 |
+
'useful': 'review_useful',
|
93 |
+
'funny': 'review_funny',
|
94 |
+
'cool': 'review_cool'
|
95 |
+
})
|
96 |
+
# print("COLUMNS IN REVIEW DAFRA)
|
97 |
+
|
98 |
+
# Tips
|
99 |
+
dfs['tip'] = dfs['tip'].rename(columns={
|
100 |
+
'date': 'tip_date',
|
101 |
+
'text': 'tip_text',
|
102 |
+
'compliment_count': 'tip_compliment_count'
|
103 |
+
})
|
104 |
+
|
105 |
+
# Checkins
|
106 |
+
dfs['checkin'] = dfs['checkin'].rename(columns={
|
107 |
+
'date': 'checkin_date'
|
108 |
+
})
|
109 |
+
|
110 |
+
# Users
|
111 |
+
dfs['user'] = dfs['user'].rename(columns={
|
112 |
+
'name': 'user_name',
|
113 |
+
'review_count': 'user_review_count',
|
114 |
+
'useful': 'user_useful',
|
115 |
+
'funny': 'user_funny',
|
116 |
+
'cool': 'user_cool'
|
117 |
+
})
|
118 |
+
|
119 |
+
# Business
|
120 |
+
dfs['business'] = dfs['business'].rename(columns={
|
121 |
+
'name': 'business_name',
|
122 |
+
'stars': 'business_stars',
|
123 |
+
'review_count': 'business_review_count'
|
124 |
+
})
|
125 |
+
dfs['google'] = dfs['google'].rename(columns={
|
126 |
+
'name': 'business_name',
|
127 |
+
'stars': 'business_stars',
|
128 |
+
'review_count': 'business_review_count'
|
129 |
+
})
|
130 |
+
df_business_final= dfs['business']
|
131 |
+
df_google_final=dfs['google']
|
132 |
+
df_review_final=dfs['review']
|
133 |
+
df_tip_final=dfs['tip']
|
134 |
+
df_checkin_final=dfs['checkin']
|
135 |
+
df_user_final=dfs['user']
|
136 |
+
|
137 |
+
|
138 |
+
df_business_final=pd.concat([df_business_final,df_google_final],axis=0)
|
139 |
+
df_business_final.reset_index(drop=True,inplace=True)
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
print("Starting merge process...")
|
145 |
+
|
146 |
+
# Merge process with memory management
|
147 |
+
print("Step 1: Starting with reviews...")
|
148 |
+
merged_df = df_review_final
|
149 |
+
|
150 |
+
|
151 |
+
print("Step 2: Merging with business data...")
|
152 |
+
merged_df = merged_df.merge(
|
153 |
+
df_business_final,
|
154 |
+
on='business_id',
|
155 |
+
how='left'
|
156 |
+
)
|
157 |
+
|
158 |
+
|
159 |
+
print("Step 3: Merging with user data...")
|
160 |
+
merged_df = merged_df.merge(
|
161 |
+
df_user_final,
|
162 |
+
on='user_id',
|
163 |
+
how='left'
|
164 |
+
)
|
165 |
+
|
166 |
+
|
167 |
+
print("Step 4: Merging with checkin data...")
|
168 |
+
merged_df = merged_df.merge(
|
169 |
+
df_checkin_final,
|
170 |
+
on='business_id',
|
171 |
+
how='left'
|
172 |
+
)
|
173 |
+
|
174 |
+
|
175 |
+
print("Step 5: Aggregating and merging tip data...")
|
176 |
+
tip_agg = df_tip_final.groupby('business_id').agg({
|
177 |
+
'tip_compliment_count': 'sum',
|
178 |
+
'tip_text': 'count'
|
179 |
+
}).rename(columns={'tip_text': 'tip_count'})
|
180 |
+
|
181 |
+
merged_df = merged_df.merge(
|
182 |
+
tip_agg,
|
183 |
+
on='business_id',
|
184 |
+
how='left'
|
185 |
+
)
|
186 |
+
|
187 |
+
|
188 |
+
|
189 |
+
print("Filling NaN values...")
|
190 |
+
merged_df['tip_count'] = merged_df['tip_count'].fillna(0)
|
191 |
+
merged_df['tip_compliment_count'] = merged_df['tip_compliment_count'].fillna(0)
|
192 |
+
merged_df['checkin_date'] = merged_df['checkin_date'].fillna('')
|
193 |
+
merged_df["friends"].fillna(0,inplace=True)
|
194 |
+
|
195 |
+
for col in merged_df.columns:
|
196 |
+
if merged_df[col].isnull().sum()>0:
|
197 |
+
print(f" {col} has {merged_df[col].isnull().sum()} null values")
|
198 |
+
|
199 |
+
|
200 |
+
print("Shape of Merged Dataset is : ",merged_df.shape)
|
201 |
+
output_file = Path(output_path) / filename
|
202 |
+
print("COLUMNS BEFORE PREPROCESING")
|
203 |
+
print()
|
204 |
+
print(merged_df.info())
|
205 |
+
for col in merged_df.columns:
|
206 |
+
for v in merged_df[col]:
|
207 |
+
print(f"Type of values in {col} is {type(v)} and values are like : {v}")
|
208 |
+
break
|
209 |
+
merged_df.to_csv(output_file,index=False)
|
210 |
+
|
211 |
+
|
212 |
+
|
213 |
+
|
214 |
+
return merged_df
|
215 |
+
|
216 |
+
# if __name__ == "__main__":
|
217 |
+
# process_datasets()
|
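For reference, a minimal usage sketch for the dataset builder above; the ./data output directory and merged_dataset.csv filename are placeholders, and it assumes the MongoDB collections listed in file_paths are reachable:

from src.create_dataset import process_datasets

if __name__ == "__main__":
    # Reads the Yelp/Google collections from MongoDB, merges them, and writes one CSV.
    merged_df = process_datasets(output_path="./data", filename="merged_dataset.csv")
    print(merged_df.shape)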
src/feature_analyzer.py
ADDED
@@ -0,0 +1,212 @@
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import seaborn as sns
|
5 |
+
from pathlib import Path
|
6 |
+
from loguru import logger
|
7 |
+
|
8 |
+
class FeatureAnalyzer:
|
9 |
+
def __init__(self,df,output_path):
|
10 |
+
self.df=df
|
11 |
+
self.output_path=output_path
|
12 |
+
|
13 |
+
|
14 |
+
def plot_correlation_heatmap(self):
|
15 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
16 |
+
numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake')
|
17 |
+
correlation_matrix = self.df[numeric_cols].corr()
|
18 |
+
plt.figure(figsize=(14, 12))
|
19 |
+
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
|
20 |
+
plt.title('Correlation Heatmap of Numeric Features', fontsize=16)
|
21 |
+
plt.tight_layout()
|
22 |
+
output_file = Path(self.output_path) / 'correlation_heatmap.png'
|
23 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
24 |
+
plt.close()
|
25 |
+
logger.info(f"Saved correlation heatmap to {output_file}")
|
26 |
+
|
27 |
+
def plot_mean_by_fake_bar(self):
|
28 |
+
key_features = [
|
29 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
30 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
31 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
32 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
33 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
34 |
+
]
|
35 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
36 |
+
mean_by_fake = self.df.groupby('fake')[key_features].mean().T
|
37 |
+
mean_by_fake.columns = ['Genuine (0)', 'Fake (1)']
|
38 |
+
plt.figure(figsize=(12, 8))
|
39 |
+
mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
|
40 |
+
plt.title('Mean Feature Values by Fake Label', fontsize=16)
|
41 |
+
plt.xlabel('Features', fontsize=12)
|
42 |
+
plt.ylabel('Mean Value', fontsize=12)
|
43 |
+
plt.xticks(rotation=45, ha='right')
|
44 |
+
plt.legend(title='Fake Label')
|
45 |
+
plt.tight_layout()
|
46 |
+
output_file = Path(self.output_path) / 'mean_by_fake_bar.png'
|
47 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
48 |
+
plt.close()
|
49 |
+
logger.info(f"Saved mean by fake bar plot to {output_file}")
|
50 |
+
|
51 |
+
def plot_violin_plots(self):
|
52 |
+
key_features = [
|
53 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
54 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
55 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
56 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
57 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
58 |
+
]
|
59 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
60 |
+
plt.figure(figsize=(14, 10))
|
61 |
+
for i, feature in enumerate(key_features[:6], 1):
|
62 |
+
plt.subplot(2, 3, i)
|
63 |
+
sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
|
64 |
+
plt.title(f'{feature} Distribution', fontsize=12)
|
65 |
+
plt.xlabel('Fake (0/1)', fontsize=10)
|
66 |
+
plt.tight_layout()
|
67 |
+
output_file = Path(self.output_path) / 'violin_plots.png'
|
68 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
69 |
+
plt.close()
|
70 |
+
logger.info(f"Saved violin plots to {output_file}")
|
71 |
+
|
72 |
+
def plot_box_plots(self):
|
73 |
+
key_features = [
|
74 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
75 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
76 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
77 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
78 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
79 |
+
]
|
80 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
81 |
+
plt.figure(figsize=(14, 10))
|
82 |
+
for i, feature in enumerate(key_features[6:11], 1):
|
83 |
+
plt.subplot(2, 3, i)
|
84 |
+
sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
|
85 |
+
plt.title(f'{feature} Distribution', fontsize=12)
|
86 |
+
plt.xlabel('Fake (0/1)', fontsize=10)
|
87 |
+
plt.tight_layout()
|
88 |
+
output_file = Path(self.output_path) / 'box_plots.png'
|
89 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
90 |
+
plt.close()
|
91 |
+
logger.info(f"Saved box plots to {output_file}")
|
92 |
+
|
93 |
+
def plot_scatter_review_grammar(self):
|
94 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
95 |
+
plt.figure(figsize=(10, 6))
|
96 |
+
sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5)
|
97 |
+
plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
|
98 |
+
plt.xlabel('Review Stars', fontsize=12)
|
99 |
+
plt.ylabel('Grammar Error Score', fontsize=12)
|
100 |
+
plt.legend(title='Fake')
|
101 |
+
plt.tight_layout()
|
102 |
+
output_file = Path(self.output_path) / 'scatter_review_grammar.png'
|
103 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
104 |
+
plt.close()
|
105 |
+
logger.info(f"Saved scatter plot to {output_file}")
|
106 |
+
|
107 |
+
def plot_density_plots(self):
|
108 |
+
key_features = [
|
109 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
110 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
111 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
112 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
113 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
114 |
+
]
|
115 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
116 |
+
plt.figure(figsize=(14, 10))
|
117 |
+
for i, feature in enumerate(key_features[:4], 1):
|
118 |
+
plt.subplot(2, 2, i)
|
119 |
+
for label in [0, 1]:
|
120 |
+
subset = self.df[self.df['fake'] == label]
|
121 |
+
sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5)
|
122 |
+
plt.title(f'{feature} Density', fontsize=12)
|
123 |
+
plt.xlabel(feature, fontsize=10)
|
124 |
+
plt.legend()
|
125 |
+
plt.tight_layout()
|
126 |
+
output_file = Path(self.output_path) / 'density_plots.png'
|
127 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
128 |
+
plt.close()
|
129 |
+
logger.info(f"Saved density plots to {output_file}")
|
130 |
+
|
131 |
+
def plot_stacked_bar_similarity(self):
|
132 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
133 |
+
bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
|
134 |
+
stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0)
|
135 |
+
stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0)
|
136 |
+
plt.figure(figsize=(12, 8))
|
137 |
+
stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8)
|
138 |
+
plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
|
139 |
+
plt.xlabel('Similarity Bins', fontsize=12)
|
140 |
+
plt.ylabel('Proportion', fontsize=12)
|
141 |
+
plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label')
|
142 |
+
plt.xticks(rotation=45, ha='right')
|
143 |
+
plt.tight_layout()
|
144 |
+
output_file = Path(self.output_path) / 'stacked_bar_similarity.png'
|
145 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
146 |
+
plt.close()
|
147 |
+
logger.info(f"Saved stacked bar plot to {output_file}")
|
148 |
+
|
149 |
+
def plot_pie_fake_distribution(self):
|
150 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
151 |
+
fake_counts = self.df['fake'].value_counts()
|
152 |
+
plt.figure(figsize=(8, 8))
|
153 |
+
plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90)
|
154 |
+
plt.title('Distribution of Fake Labels', fontsize=16)
|
155 |
+
plt.axis('equal')
|
156 |
+
output_file = Path(self.output_path) / 'pie_fake_distribution.png'
|
157 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
158 |
+
plt.close()
|
159 |
+
logger.info(f"Saved pie chart to {output_file}")
|
160 |
+
|
161 |
+
def plot_count_code_switching(self):
|
162 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
163 |
+
plt.figure(figsize=(8, 6))
|
164 |
+
sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon'])
|
165 |
+
plt.title('Count of Fake by Code Switching Flag', fontsize=16)
|
166 |
+
plt.xlabel('Code Switching Flag (0/1)', fontsize=12)
|
167 |
+
plt.ylabel('Count', fontsize=12)
|
168 |
+
plt.legend(title='Fake Label')
|
169 |
+
plt.tight_layout()
|
170 |
+
output_file = Path(self.output_path) / 'count_code_switching.png'
|
171 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
172 |
+
plt.close()
|
173 |
+
logger.info(f"Saved count plot to {output_file}")
|
174 |
+
|
175 |
+
def plot_variance_by_fake_bar(self):
|
176 |
+
key_features = [
|
177 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
178 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
179 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
180 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
181 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
182 |
+
]
|
183 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
184 |
+
variance_by_fake = self.df.groupby('fake')[key_features].var().T
|
185 |
+
variance_by_fake.columns = ['Genuine (0)', 'Fake (1)']
|
186 |
+
plt.figure(figsize=(12, 8))
|
187 |
+
variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
|
188 |
+
plt.title('Feature Variance by Fake Label', fontsize=16)
|
189 |
+
plt.xlabel('Features', fontsize=12)
|
190 |
+
plt.ylabel('Variance', fontsize=12)
|
191 |
+
plt.xticks(rotation=45, ha='right')
|
192 |
+
plt.legend(title='Fake Label')
|
193 |
+
plt.tight_layout()
|
194 |
+
output_file = Path(self.output_path) / 'variance_by_fake_bar.png'
|
195 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
196 |
+
plt.close()
|
197 |
+
logger.info(f"Saved variance bar plot to {output_file}")
|
198 |
+
|
199 |
+
def run_pipeline(self):
|
200 |
+
|
201 |
+
sns.set(style="whitegrid")
|
202 |
+
plt.rcParams['figure.figsize'] = (12, 8)
|
203 |
+
self.plot_correlation_heatmap()
|
204 |
+
self.plot_mean_by_fake_bar()
|
205 |
+
self.plot_violin_plots()
|
206 |
+
self.plot_box_plots()
|
207 |
+
self.plot_scatter_review_grammar()
|
208 |
+
self.plot_density_plots()
|
209 |
+
self.plot_stacked_bar_similarity()
|
210 |
+
self.plot_pie_fake_distribution()
|
211 |
+
self.plot_count_code_switching()
|
212 |
+
self.plot_variance_by_fake_bar()
|
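A minimal sketch of how the FeatureAnalyzer above might be driven; the CSV path and plot directory are placeholders, and the DataFrame must contain the 'fake' label plus the engineered feature columns referenced in key_features:

import pandas as pd
from src.feature_analyzer import FeatureAnalyzer

# Load the merged/preprocessed dataset and write all EDA plots to ./plots.
df = pd.read_csv("./data/preprocessed_dataset.csv")
FeatureAnalyzer(df, output_path="./plots").run_pipeline()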
src/model.py
ADDED
@@ -0,0 +1,540 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torch_geometric.data import HeteroData
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
import networkx as nx
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import seaborn as sns
|
10 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
|
11 |
+
from sklearn.model_selection import train_test_split
|
12 |
+
from pathlib import Path
|
13 |
+
from datetime import datetime
|
14 |
+
from loguru import logger
|
15 |
+
|
16 |
+
# Temporal Edge Features Function
|
17 |
+
def create_temporal_edge_features(time_since_src, time_since_tgt, user_i, user_j):
|
18 |
+
delta_t = torch.abs(time_since_src - time_since_tgt).float()
|
19 |
+
hour_scale = torch.sin(delta_t / 3600)
|
20 |
+
day_scale = torch.sin(delta_t / (24 * 3600))
|
21 |
+
week_scale = torch.sin(delta_t / (7 * 24 * 3600))
|
22 |
+
same_user = (user_i == user_j).float()
|
23 |
+
burst_feature = same_user * torch.exp(-delta_t / (24 * 3600))
|
24 |
+
return torch.stack([hour_scale, day_scale, week_scale, burst_feature], dim=-1)
|
25 |
+
|
26 |
+
# Custom Multihead Attention (unchanged)
|
27 |
+
class CustomMultiheadAttention(nn.Module):
|
28 |
+
def __init__(self, embed_dim, num_heads):
|
29 |
+
super().__init__()
|
30 |
+
self.embed_dim = embed_dim
|
31 |
+
self.num_heads = num_heads
|
32 |
+
self.head_dim = embed_dim // num_heads
|
33 |
+
|
34 |
+
assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
|
35 |
+
|
36 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim)
|
37 |
+
self.k_proj = nn.Linear(embed_dim, embed_dim)
|
38 |
+
self.v_proj = nn.Linear(embed_dim, embed_dim)
|
39 |
+
self.out_proj = nn.Linear(embed_dim, embed_dim)
|
40 |
+
|
41 |
+
self.scale = self.head_dim ** -0.5
|
42 |
+
|
43 |
+
def forward(self, query, key, value, attn_bias=None):
|
44 |
+
batch_size, seq_len, embed_dim = query.size()
|
45 |
+
q = self.q_proj(query)
|
46 |
+
k = self.k_proj(key)
|
47 |
+
v = self.v_proj(value)
|
48 |
+
q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
49 |
+
k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
50 |
+
v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
51 |
+
scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
|
52 |
+
if attn_bias is not None:
|
53 |
+
scores = scores + attn_bias.unsqueeze(1)
|
54 |
+
attn = F.softmax(scores, dim=-1)
|
55 |
+
out = torch.matmul(attn, v)
|
56 |
+
out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
|
57 |
+
out = self.out_proj(out)
|
58 |
+
return out, attn
|
59 |
+
|
60 |
+
# HeteroGraphormer (unchanged)
|
61 |
+
class HeteroGraphormer(nn.Module):
|
62 |
+
def __init__(self, hidden_dim, output_dim, num_heads=4, edge_dim=4):
|
63 |
+
super().__init__()
|
64 |
+
self.hidden_dim = hidden_dim
|
65 |
+
|
66 |
+
self.embed_dict = nn.ModuleDict({
|
67 |
+
'user': nn.Linear(14, hidden_dim),
|
68 |
+
'business': nn.Linear(8, hidden_dim),
|
69 |
+
'review': nn.Linear(16, hidden_dim)
|
70 |
+
})
|
71 |
+
|
72 |
+
self.edge_proj = nn.Linear(edge_dim, hidden_dim)
|
73 |
+
|
74 |
+
self.gru_user = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
|
75 |
+
self.gru_business = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
|
76 |
+
self.gru_review = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
|
77 |
+
|
78 |
+
self.attention1 = CustomMultiheadAttention(hidden_dim, num_heads)
|
79 |
+
self.attention2 = CustomMultiheadAttention(hidden_dim, num_heads)
|
80 |
+
|
81 |
+
self.ffn1 = nn.Sequential(
|
82 |
+
nn.Linear(hidden_dim, hidden_dim * 4),
|
83 |
+
nn.ReLU(),
|
84 |
+
nn.Dropout(0.1),
|
85 |
+
nn.Linear(hidden_dim * 4, hidden_dim)
|
86 |
+
)
|
87 |
+
self.ffn2 = nn.Sequential(
|
88 |
+
nn.Linear(hidden_dim, hidden_dim * 4),
|
89 |
+
nn.ReLU(),
|
90 |
+
nn.Dropout(0.1),
|
91 |
+
nn.Linear(hidden_dim * 4, hidden_dim)
|
92 |
+
)
|
93 |
+
|
94 |
+
self.norm1 = nn.LayerNorm(hidden_dim)
|
95 |
+
self.norm2 = nn.LayerNorm(hidden_dim)
|
96 |
+
self.norm3 = nn.LayerNorm(hidden_dim)
|
97 |
+
self.norm4 = nn.LayerNorm(hidden_dim)
|
98 |
+
|
99 |
+
self.centrality_proj = nn.Linear(1, hidden_dim)
|
100 |
+
|
101 |
+
self.classifier = nn.Sequential(
|
102 |
+
nn.Linear(hidden_dim * 3, hidden_dim),
|
103 |
+
nn.ReLU(),
|
104 |
+
nn.Dropout(0.1),
|
105 |
+
nn.Linear(hidden_dim, 1)
|
106 |
+
)
|
107 |
+
|
108 |
+
self.dropout = nn.Dropout(0.1)
|
109 |
+
|
110 |
+
def time_aware_aggregation(self, x, time_since, decay_rate=0.1):
|
111 |
+
weights = torch.exp(-decay_rate * time_since.unsqueeze(-1))
|
112 |
+
return x * weights
|
113 |
+
|
114 |
+
def forward(self, data, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict):
|
115 |
+
x_dict = {}
|
116 |
+
for node_type in data.x_dict:
|
117 |
+
x = self.embed_dict[node_type](data[node_type].x)
|
118 |
+
if node_type in time_since_dict:
|
119 |
+
x = self.time_aware_aggregation(x, time_since_dict[node_type])
|
120 |
+
x_dict[node_type] = x
|
121 |
+
|
122 |
+
x = torch.cat([x_dict['user'], x_dict['business'], x_dict['review']], dim=0)
|
123 |
+
|
124 |
+
centrality = self.centrality_proj(centrality_encoding)
|
125 |
+
x = x + centrality
|
126 |
+
|
127 |
+
x = x.unsqueeze(0)
|
128 |
+
|
129 |
+
x_user = x[:, :data['user'].x.size(0), :]
|
130 |
+
x_business = x[:, data['user'].x.size(0):data['user'].x.size(0) + data['business'].x.size(0), :]
|
131 |
+
x_review = x[:, data['user'].x.size(0) + data['business'].x.size(0):, :]
|
132 |
+
|
133 |
+
x_user, _ = self.gru_user(x_user)
|
134 |
+
x_business, _ = self.gru_business(x_business)
|
135 |
+
x_review, _ = self.gru_review(x_review)
|
136 |
+
|
137 |
+
x = torch.cat([x_user, x_business, x_review], dim=1)
|
138 |
+
|
139 |
+
total_nodes = x.size(1)
|
140 |
+
attn_bias = torch.zeros(1, total_nodes, total_nodes, device=x.device)
|
141 |
+
attn_bias[0] = -spatial_encoding
|
142 |
+
|
143 |
+
for edge_type in edge_features_dict:
|
144 |
+
edge_index = data[edge_type].edge_index
|
145 |
+
edge_feats = self.edge_proj(edge_features_dict[edge_type])
|
146 |
+
for i, (src, tgt) in enumerate(edge_index.t()):
|
147 |
+
attn_bias[0, src, tgt] += edge_feats[i].sum()
|
148 |
+
|
149 |
+
residual = x
|
150 |
+
x, _ = self.attention1(x, x, x, attn_bias=attn_bias)
|
151 |
+
x = self.norm1(x + residual)
|
152 |
+
x = self.dropout(x)
|
153 |
+
|
154 |
+
residual = x
|
155 |
+
x = self.ffn1(x)
|
156 |
+
x = self.norm2(x + residual)
|
157 |
+
x = self.dropout(x)
|
158 |
+
|
159 |
+
residual = x
|
160 |
+
x, _ = self.attention2(x, x, x, attn_bias=attn_bias)
|
161 |
+
x = self.norm3(x + residual)
|
162 |
+
x = self.dropout(x)
|
163 |
+
|
164 |
+
residual = x
|
165 |
+
x = self.ffn2(x)
|
166 |
+
x = self.norm4(x + residual)
|
167 |
+
x = self.dropout(x)
|
168 |
+
|
169 |
+
x = x.squeeze(0)
|
170 |
+
|
171 |
+
user_start = 0
|
172 |
+
business_start = data['user'].x.size(0)
|
173 |
+
review_start = business_start + data['business'].x.size(0)
|
174 |
+
|
175 |
+
h_user = x[user_start:business_start]
|
176 |
+
h_business = x[business_start:review_start]
|
177 |
+
h_review = x[review_start:]
|
178 |
+
|
179 |
+
user_indices = data['user', 'writes', 'review'].edge_index[0]
|
180 |
+
business_indices = data['review', 'about', 'business'].edge_index[1]
|
181 |
+
review_indices = data['user', 'writes', 'review'].edge_index[1]
|
182 |
+
|
183 |
+
h_user_mapped = h_user[user_indices]
|
184 |
+
h_business_mapped = h_business[business_indices]
|
185 |
+
h_review_mapped = h_review[review_indices]
|
186 |
+
|
187 |
+
combined = torch.cat([h_review_mapped, h_user_mapped, h_business_mapped], dim=-1)
|
188 |
+
|
189 |
+
logits = self.classifier(combined)
|
190 |
+
return torch.sigmoid(logits)
|
191 |
+
|
192 |
+
# Updated GraphformerModel with Plotting
|
193 |
+
class GraphformerModel:
|
194 |
+
def __init__(self, df, output_path, epochs, test_size=0.3):
|
195 |
+
self.df_whole = df
|
196 |
+
self.output_path = output_path
|
197 |
+
self.output_path = Path(self.output_path) / "GraphformerModel"
|
198 |
+
self.epochs = epochs
|
199 |
+
self.df, self.test_df = train_test_split(self.df_whole, test_size=test_size, random_state=42)
|
200 |
+
|
201 |
+
torch.manual_seed(42)
|
202 |
+
np.random.seed(42)
|
203 |
+
|
204 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
205 |
+
|
206 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
207 |
+
self.model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(self.device)
|
208 |
+
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005)
|
209 |
+
self.criterion = nn.BCELoss()
|
210 |
+
|
211 |
+
def compute_graph_encodings(self, data):
|
212 |
+
G = nx.DiGraph()
|
213 |
+
node_offset = 0
|
214 |
+
node_type_map = {}
|
215 |
+
|
216 |
+
for node_type in ['user', 'business', 'review']:
|
217 |
+
num_nodes = data[node_type].x.size(0)
|
218 |
+
for i in range(num_nodes):
|
219 |
+
G.add_node(node_offset + i)
|
220 |
+
node_type_map[node_offset + i] = node_type
|
221 |
+
node_offset += num_nodes
|
222 |
+
|
223 |
+
edge_types = [('user', 'writes', 'review'), ('review', 'about', 'business')]
|
224 |
+
for src_type, rel, tgt_type in edge_types:
|
225 |
+
edge_index = data[src_type, rel, tgt_type].edge_index
|
226 |
+
src_nodes = edge_index[0].tolist()
|
227 |
+
tgt_nodes = edge_index[1].tolist()
|
228 |
+
src_offset = 0 if src_type == 'user' else (self.num_users if src_type == 'business' else self.num_users + self.num_businesses)
|
229 |
+
tgt_offset = 0 if tgt_type == 'user' else (self.num_users if tgt_type == 'business' else self.num_users + self.num_businesses)
|
230 |
+
for src, tgt in zip(src_nodes, tgt_nodes):
|
231 |
+
G.add_edge(src + src_offset, tgt + tgt_offset)
|
232 |
+
|
233 |
+
num_nodes = G.number_of_nodes()
|
234 |
+
spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=self.device)
|
235 |
+
for i in range(num_nodes):
|
236 |
+
for j in range(num_nodes):
|
237 |
+
if i == j:
|
238 |
+
spatial_encoding[i, j] = 0
|
239 |
+
elif nx.has_path(G, i, j):
|
240 |
+
spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)
|
241 |
+
|
242 |
+
centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=self.device).view(-1, 1)
|
243 |
+
|
244 |
+
return spatial_encoding, centrality_encoding, node_type_map
|
245 |
+
|
246 |
+
def compute_metrics(self, y_true, y_pred, y_prob, prefix=""):
|
247 |
+
metrics = {}
|
248 |
+
metrics[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
|
249 |
+
metrics[f"{prefix}precision"] = precision_score(y_true, y_pred, zero_division=0)
|
250 |
+
metrics[f"{prefix}recall"] = recall_score(y_true, y_pred, zero_division=0)
|
251 |
+
metrics[f"{prefix}f1"] = f1_score(y_true, y_pred, zero_division=0)
|
252 |
+
metrics[f"{prefix}auc_roc"] = roc_auc_score(y_true, y_prob)
|
253 |
+
metrics[f"{prefix}conf_matrix"] = confusion_matrix(y_true, y_pred)
|
254 |
+
metrics[f"{prefix}class_report"] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
|
255 |
+
return metrics
|
256 |
+
|
257 |
+
def run_model(self):
|
258 |
+
features = torch.tensor(self.df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
|
259 |
+
y = torch.tensor(self.df['fake'].values, dtype=torch.float, device=self.device)
|
260 |
+
time_since_user = torch.tensor(self.df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
|
261 |
+
time_since_business = torch.tensor(self.df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
|
262 |
+
num_rows = len(self.df)
|
263 |
+
|
264 |
+
graph = HeteroData()
|
265 |
+
|
266 |
+
self.num_users = len(self.df['user_id'].unique())
|
267 |
+
self.num_businesses = len(self.df['business_id'].unique())
|
268 |
+
|
269 |
+
user_indices = torch.tensor(self.df['user_id'].map({uid: i for i, uid in enumerate(self.df['user_id'].unique())}).values, dtype=torch.long, device=self.device)
|
270 |
+
business_indices = torch.tensor(self.df['business_id'].map({bid: i for i, bid in enumerate(self.df['business_id'].unique())}).values, dtype=torch.long, device=self.device)
|
271 |
+
review_indices = torch.arange(num_rows, dtype=torch.long, device=self.device)
|
272 |
+
|
273 |
+
user_feats = torch.zeros(self.num_users, 14, device=self.device)
|
274 |
+
business_feats = torch.zeros(self.num_businesses, 8, device=self.device)
|
275 |
+
review_feats = torch.zeros(num_rows, 16, device=self.device)
|
276 |
+
|
277 |
+
user_cols = ['hours', 'user_review_count', 'elite', 'friends', 'fans', 'average_stars',
|
278 |
+
'time_since_last_review_user', 'user_account_age', 'user_degree',
|
279 |
+
'user_review_burst_count', 'review_like_ratio', 'latest_checkin_hours',
|
280 |
+
'user_useful_funny_cool', 'rating_variance_user']
|
281 |
+
business_cols = ['latitude', 'longitude', 'business_stars', 'business_review_count',
|
282 |
+
'time_since_last_review_business', 'business_degree',
|
283 |
+
'business_review_burst_count', 'rating_deviation_from_business_average']
|
284 |
+
review_cols = ['review_stars', 'tip_compliment_count', 'tip_count', 'average_time_between_reviews',
|
285 |
+
'temporal_similarity', 'pronoun_density', 'avg_sentence_length',
|
286 |
+
'excessive_punctuation_count', 'sentiment_polarity', 'good_severity',
|
287 |
+
'bad_severity', 'code_switching_flag', 'grammar_error_score',
|
288 |
+
'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool']
|
289 |
+
|
290 |
+
for i in range(len(self.df)):
|
291 |
+
user_idx = user_indices[i]
|
292 |
+
business_idx = business_indices[i]
|
293 |
+
user_feats[user_idx] += features[i, :14]
|
294 |
+
business_feats[business_idx] += features[i, 14:22]
|
295 |
+
review_feats = features[:, 22:38]
|
296 |
+
|
297 |
+
graph['user'].x = user_feats
|
298 |
+
graph['business'].x = business_feats
|
299 |
+
graph['review'].x = review_feats
|
300 |
+
graph['review'].y = y
|
301 |
+
|
302 |
+
graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
|
303 |
+
graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
|
304 |
+
|
305 |
+
edge_features_dict = {}
|
306 |
+
user_writes_edge = graph['user', 'writes', 'review'].edge_index
|
307 |
+
review_about_edge = graph['review', 'about', 'business'].edge_index
|
308 |
+
|
309 |
+
src_users = user_indices[user_writes_edge[0]]
|
310 |
+
tgt_reviews = review_indices[user_writes_edge[1]]
|
311 |
+
edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
|
312 |
+
time_since_user[src_users], time_since_user[tgt_reviews], src_users, src_users
|
313 |
+
)
|
314 |
+
|
315 |
+
src_reviews = review_indices[review_about_edge[0]]
|
316 |
+
tgt_businesses = business_indices[review_about_edge[1]]
|
317 |
+
edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
|
318 |
+
time_since_business[src_reviews], time_since_business[tgt_businesses],
|
319 |
+
torch.zeros_like(src_reviews), torch.zeros_like(src_reviews)
|
320 |
+
)
|
321 |
+
|
322 |
+
user_time_since = self.df.groupby('user_id')['time_since_last_review_user'].min().reindex(
|
323 |
+
self.df['user_id'].unique(), fill_value=0).values
|
324 |
+
time_since_dict = {
|
325 |
+
'user': torch.tensor(user_time_since, dtype=torch.float, device=self.device)
|
326 |
+
}
|
327 |
+
|
328 |
+
spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
|
329 |
+
|
330 |
+
# Training with metrics history
|
331 |
+
self.model.train()
|
332 |
+
train_metrics_history = []
|
333 |
+
for epoch in range(self.epochs):
|
334 |
+
self.optimizer.zero_grad()
|
335 |
+
out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
|
336 |
+
loss = self.criterion(out.squeeze(), y)
|
337 |
+
loss.backward()
|
338 |
+
self.optimizer.step()
|
339 |
+
|
340 |
+
pred_labels = (out.squeeze() > 0.5).float()
|
341 |
+
|
342 |
+
probs = out.squeeze().detach().cpu().numpy()
|
343 |
+
train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels.cpu().numpy(), probs, prefix="train_")
|
344 |
+
train_metrics['loss'] = loss.item()
|
345 |
+
train_metrics_history.append(train_metrics)
|
346 |
+
|
347 |
+
if epoch % 10 == 0:
|
348 |
+
logger.info(f"Epoch {epoch}, Loss: {loss.item():.4f}, Accuracy: {train_metrics['train_accuracy']:.4f}, F1: {train_metrics['train_f1']:.4f}")
|
349 |
+
|
350 |
+
# Save model
|
351 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
352 |
+
model_save_path = Path(self.output_path) / "model_GraphformerModel_latest.pth"
|
353 |
+
torch.save(self.model.state_dict(), model_save_path)
|
354 |
+
|
355 |
+
# Testing
|
356 |
+
if self.test_df is not None:
|
357 |
+
test_features = torch.tensor(self.test_df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
|
358 |
+
test_y = torch.tensor(self.test_df['fake'].values, dtype=torch.float, device=self.device)
|
359 |
+
test_time_since_user = torch.tensor(self.test_df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
|
360 |
+
test_time_since_business = torch.tensor(self.test_df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
|
361 |
+
num_test_rows = len(self.test_df)
|
362 |
+
|
363 |
+
new_user_unique = self.test_df['user_id'].unique()
|
364 |
+
new_business_unique = self.test_df['business_id'].unique()
|
365 |
+
|
366 |
+
existing_user_ids = list(self.df['user_id'].unique())
|
367 |
+
user_mapping = {uid: i for i, uid in enumerate(existing_user_ids)}
|
368 |
+
total_users = self.num_users
|
369 |
+
for uid in new_user_unique:
|
370 |
+
if uid not in user_mapping:
|
371 |
+
user_mapping[uid] = total_users
|
372 |
+
total_users += 1
|
373 |
+
|
374 |
+
existing_business_ids = list(self.df['business_id'].unique())
|
375 |
+
business_mapping = {bid: i for i, bid in enumerate(existing_business_ids)}
|
376 |
+
total_businesses = self.num_businesses
|
377 |
+
for bid in new_business_unique:
|
378 |
+
if bid not in business_mapping:
|
379 |
+
business_mapping[bid] = total_businesses
|
380 |
+
total_businesses += 1
|
381 |
+
|
382 |
+
new_user_indices = torch.tensor([user_mapping[uid] for uid in self.test_df['user_id']], dtype=torch.long, device=self.device)
|
383 |
+
new_business_indices = torch.tensor([business_mapping[bid] for bid in self.test_df['business_id']], dtype=torch.long, device=self.device)
|
384 |
+
new_review_indices = torch.arange(num_rows, num_rows + num_test_rows, device=self.device)
|
385 |
+
|
386 |
+
if total_users > self.num_users:
|
387 |
+
additional_user_feats = torch.zeros(total_users - self.num_users, 14, device=self.device)
|
388 |
+
graph['user'].x = torch.cat([graph['user'].x, additional_user_feats], dim=0)
|
389 |
+
if total_businesses > self.num_businesses:
|
390 |
+
additional_business_feats = torch.zeros(total_businesses - self.num_businesses, 8, device=self.device)
|
391 |
+
graph['business'].x = torch.cat([graph['business'].x, additional_business_feats], dim=0)
|
392 |
+
|
393 |
+
for i in range(num_test_rows):
|
394 |
+
user_idx = new_user_indices[i]
|
395 |
+
business_idx = new_business_indices[i]
|
396 |
+
if user_idx < graph['user'].x.size(0):
|
397 |
+
graph['user'].x[user_idx] += test_features[i, :14]
|
398 |
+
if business_idx < graph['business'].x.size(0):
|
399 |
+
graph['business'].x[business_idx] += test_features[i, 14:22]
|
400 |
+
graph['review'].x = torch.cat([graph['review'].x, test_features[:, 22:38]], dim=0)
|
401 |
+
graph['review'].y = torch.cat([graph['review'].y, test_y], dim=0)
|
402 |
+
|
403 |
+
graph['user', 'writes', 'review'].edge_index = torch.cat([
|
404 |
+
graph['user', 'writes', 'review'].edge_index,
|
405 |
+
torch.stack([new_user_indices, new_review_indices], dim=0)], dim=1)
|
406 |
+
graph['review', 'about', 'business'].edge_index = torch.cat([
|
407 |
+
graph['review', 'about', 'business'].edge_index,
|
408 |
+
torch.stack([new_review_indices, new_business_indices], dim=0)], dim=1)
|
409 |
+
|
410 |
+
all_time_since_user = torch.cat([time_since_user, test_time_since_user])
|
411 |
+
all_time_since_business = torch.cat([time_since_business, test_time_since_business])
|
412 |
+
all_user_indices = torch.cat([user_indices, new_user_indices])
|
413 |
+
all_business_indices = torch.cat([business_indices, new_business_indices])
|
414 |
+
all_review_indices = torch.cat([review_indices, new_review_indices])
|
415 |
+
|
416 |
+
user_writes_edge = graph['user', 'writes', 'review'].edge_index
|
417 |
+
review_about_edge = graph['review', 'about', 'business'].edge_index
|
418 |
+
|
419 |
+
edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
|
420 |
+
all_time_since_user[user_writes_edge[0]], all_time_since_user[user_writes_edge[1]],
|
421 |
+
all_user_indices[user_writes_edge[0]], all_user_indices[user_writes_edge[0]]
|
422 |
+
)
|
423 |
+
edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
|
424 |
+
all_time_since_business[review_about_edge[0]], all_time_since_business[review_about_edge[1]],
|
425 |
+
torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
|
426 |
+
)
|
427 |
+
|
428 |
+
self.num_users = total_users
|
429 |
+
self.num_businesses = total_businesses
|
430 |
+
|
431 |
+
test_user_time_since = self.test_df.groupby('user_id')['time_since_last_review_user'].min().reindex(
|
432 |
+
pd.Index(list(self.df['user_id'].unique()) + list(self.test_df['user_id'].unique())), fill_value=0).values
|
433 |
+
time_since_dict['user'] = torch.tensor(test_user_time_since[:total_users], dtype=torch.float, device=self.device)
|
434 |
+
|
435 |
+
spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
|
436 |
+
|
437 |
+
self.model.eval()
|
438 |
+
with torch.no_grad():
|
439 |
+
out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
|
440 |
+
pred_labels = (out.squeeze() > 0.5).float()
|
441 |
+
probs = out.squeeze().detach().cpu().numpy()
|
442 |
+
test_metrics = self.compute_metrics(graph['review'].y[-num_test_rows:].cpu().numpy(), pred_labels[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:], prefix="test_")
|
443 |
+
train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels[:num_rows].cpu().numpy(), probs[:num_rows], prefix="train_")
|
444 |
+
logger.info(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1: {test_metrics['test_f1']:.4f}, AUC-ROC: {test_metrics['test_auc_roc']:.4f}")
|
445 |
+
|
446 |
+
# Save metrics to file
|
447 |
+
metrics_file = Path(self.output_path) / f"metrics_{timestamp}.txt"
|
448 |
+
with open(metrics_file, 'w') as f:
|
449 |
+
f.write("Training Metrics (Final Epoch):\n")
|
450 |
+
for k, v in train_metrics.items():
|
451 |
+
f.write(f"{k}: {v}\n")
|
452 |
+
f.write("\nTest Metrics:\n")
|
453 |
+
for k, v in test_metrics.items():
|
454 |
+
f.write(f"{k}: {v}\n")
|
455 |
+
|
456 |
+
# Plotting and saving to output_path
|
457 |
+
plt.figure(figsize=(12, 8))
|
458 |
+
plt.plot([m['loss'] for m in train_metrics_history], label='Training Loss')
|
459 |
+
plt.xlabel('Epoch')
|
460 |
+
plt.ylabel('Loss')
|
461 |
+
plt.title('Training Loss Curve')
|
462 |
+
plt.legend()
|
463 |
+
plt.grid(True)
|
464 |
+
plt.savefig(Path(self.output_path) / f"loss_curve_{timestamp}.png")
|
465 |
+
plt.close()
|
466 |
+
|
467 |
+
plt.figure(figsize=(12, 8))
|
468 |
+
plt.plot([m['train_accuracy'] for m in train_metrics_history], label='Training Accuracy')
|
469 |
+
plt.xlabel('Epoch')
|
470 |
+
plt.ylabel('Accuracy')
|
471 |
+
plt.title('Training Accuracy Curve')
|
472 |
+
plt.legend()
|
473 |
+
plt.grid(True)
|
474 |
+
plt.savefig(Path(self.output_path) / f"accuracy_curve_{timestamp}.png")
|
475 |
+
plt.close()
|
476 |
+
|
477 |
+
plt.figure(figsize=(12, 8))
|
478 |
+
plt.plot([m['train_precision'] for m in train_metrics_history], label='Training Precision')
|
479 |
+
plt.plot([m['train_recall'] for m in train_metrics_history], label='Training Recall')
|
480 |
+
plt.plot([m['train_f1'] for m in train_metrics_history], label='Training F1-Score')
|
481 |
+
plt.xlabel('Epoch')
|
482 |
+
plt.ylabel('Score')
|
483 |
+
plt.title('Training Precision, Recall, and F1-Score Curves')
|
484 |
+
plt.legend()
|
485 |
+
plt.grid(True)
|
486 |
+
plt.savefig(Path(self.output_path) / f"prf1_curves_{timestamp}.png")
|
487 |
+
plt.close()
|
488 |
+
|
489 |
+
plt.figure(figsize=(12, 8))
|
490 |
+
plt.plot([m['train_auc_roc'] for m in train_metrics_history], label='Training AUC-ROC')
|
491 |
+
plt.xlabel('Epoch')
|
492 |
+
plt.ylabel('AUC-ROC')
|
493 |
+
plt.title('Training AUC-ROC Curve')
|
494 |
+
plt.legend()
|
495 |
+
plt.grid(True)
|
496 |
+
plt.savefig(Path(self.output_path) / f"auc_roc_curve_train_{timestamp}.png")
|
497 |
+
plt.close()
|
498 |
+
|
499 |
+
plt.figure(figsize=(8, 6))
|
500 |
+
sns.heatmap(test_metrics['test_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
|
501 |
+
plt.xlabel('Predicted')
|
502 |
+
plt.ylabel('True')
|
503 |
+
plt.title('Test Confusion Matrix')
|
504 |
+
plt.savefig(Path(self.output_path) / f"confusion_matrix_test_{timestamp}.png")
|
505 |
+
plt.close()
|
506 |
+
|
507 |
+
fpr, tpr, _ = roc_curve(graph['review'].y[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:])
|
508 |
+
plt.figure(figsize=(10, 6))
|
509 |
+
plt.plot(fpr, tpr, label=f'Test ROC Curve (AUC = {test_metrics["test_auc_roc"]:.4f})')
|
510 |
+
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
|
511 |
+
plt.xlabel('False Positive Rate')
|
512 |
+
plt.ylabel('True Positive Rate')
|
513 |
+
plt.title('Test ROC Curve')
|
514 |
+
plt.legend()
|
515 |
+
plt.grid(True)
|
516 |
+
plt.savefig(Path(self.output_path) / f"roc_curve_test_{timestamp}.png")
|
517 |
+
plt.close()
|
518 |
+
|
519 |
+
plt.figure(figsize=(8, 6))
|
520 |
+
sns.heatmap(train_metrics['train_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
|
521 |
+
plt.xlabel('Predicted')
|
522 |
+
plt.ylabel('True')
|
523 |
+
plt.title('Training Confusion Matrix (Final Epoch)')
|
524 |
+
plt.savefig(Path(self.output_path) / f"confusion_matrix_train_{timestamp}.png")
|
525 |
+
plt.close()
|
526 |
+
|
527 |
+
fpr_train, tpr_train, _ = roc_curve(graph['review'].y[:num_rows].cpu().numpy(), probs[:num_rows])
|
528 |
+
plt.figure(figsize=(10, 6))
|
529 |
+
plt.plot(fpr_train, tpr_train, label=f'Training ROC Curve (AUC = {train_metrics["train_auc_roc"]:.4f})')
|
530 |
+
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
|
531 |
+
plt.xlabel('False Positive Rate')
|
532 |
+
plt.ylabel('True Positive Rate')
|
533 |
+
plt.title('Training ROC Curve (Final Epoch)')
|
534 |
+
plt.legend()
|
535 |
+
plt.grid(True)
|
536 |
+
plt.savefig(Path(self.output_path) / f"roc_curve_train_{timestamp}.png")
|
537 |
+
plt.close()
|
538 |
+
|
539 |
+
logger.info(f"All metrics, plots, and model saved to {self.output_path}")
|
540 |
+
|
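A small, self-contained illustration of the temporal edge features used by the model above; the timestamp and user-id tensors below are made up and only demonstrate the expected [num_edges, 4] output shape:

import torch
from src.model import create_temporal_edge_features

# Hypothetical "time since last review" values for the source and target node of
# three edges, plus the user ids on each side of those edges.
time_src = torch.tensor([3600.0, 86400.0, 120.0])
time_tgt = torch.tensor([7200.0, 90000.0, 60.0])
user_i = torch.tensor([0, 1, 1])
user_j = torch.tensor([0, 2, 1])

edge_feats = create_temporal_edge_features(time_src, time_tgt, user_i, user_j)
print(edge_feats.shape)  # torch.Size([3, 4]): hour, day, week, and same-user burst terms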
src/model_trainer.py
ADDED
@@ -0,0 +1,35 @@
1 |
+
from src.model import GraphformerModel
|
2 |
+
from pathlib import Path
|
3 |
+
from loguru import logger
|
4 |
+
|
5 |
+
|
6 |
+
class ModelTrainer:
|
7 |
+
def __init__(self, df, output_path, epochs=100,test_size=0.3):
|
8 |
+
self.df = df
|
9 |
+
self.output_path = output_path
|
10 |
+
self.epochs = epochs
|
11 |
+
self.test_size=test_size
|
12 |
+
|
13 |
+
# Create output directory
|
14 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
15 |
+
|
16 |
+
# Initialize the HeteroGraphormerModel
|
17 |
+
|
18 |
+
self.model = GraphformerModel(df=self.df, output_path=self.output_path, epochs=self.epochs,test_size=self.test_size)
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
logger.info(f"Initialized ModelTrainer with output_path: {self.output_path} and epochs: {self.epochs}")
|
23 |
+
|
24 |
+
|
25 |
+
def train_and_evaluate(self):
|
26 |
+
|
27 |
+
try:
|
28 |
+
logger.info("Starting model training and evaluation")
|
29 |
+
self.model.run_model()
|
30 |
+
logger.info("GraphformerModel training and evaluation completed successfully")
|
31 |
+
except Exception as e:
|
32 |
+
logger.error(f"Error during GraphformerModel training and evaluation: {e}")
|
33 |
+
raise
|
34 |
+
|
35 |
+
|
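A minimal usage sketch for the trainer wrapper above; the CSV path, output directory, and epoch count are placeholders, and the DataFrame is expected to hold the preprocessed numeric features plus the user_id, business_id, review_id, and fake columns that GraphformerModel drops or uses as labels:

import pandas as pd
from src.model_trainer import ModelTrainer

df = pd.read_csv("./data/preprocessed_dataset.csv")
trainer = ModelTrainer(df, output_path="./outputs", epochs=100, test_size=0.3)
trainer.train_and_evaluate()  # trains, evaluates on the held-out split, saves plots and metrics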
src/preprocessing.py
ADDED
@@ -0,0 +1,832 @@
1 |
+
from loguru import logger
|
2 |
+
import pandas as pd
|
3 |
+
import json
|
4 |
+
from datetime import datetime
|
5 |
+
import ast
|
6 |
+
import numpy as np
|
7 |
+
from pymongo import MongoClient
|
8 |
+
from collections import defaultdict
|
9 |
+
|
10 |
+
from tqdm import tqdm
|
11 |
+
import time
|
12 |
+
|
13 |
+
import requests
|
14 |
+
import json
|
15 |
+
import os
|
16 |
+
import pandas as pd
|
17 |
+
import nltk
|
18 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
19 |
+
from nltk.corpus import stopwords
|
20 |
+
from textblob import TextBlob
|
21 |
+
import re
|
22 |
+
from transformers import BertTokenizer, BertModel
|
23 |
+
from transformers import RobertaTokenizer, RobertaModel
|
24 |
+
import torch
|
25 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
26 |
+
import numpy as np
|
27 |
+
|
28 |
+
# Download NLTK resources
|
29 |
+
nltk.download('punkt')
|
30 |
+
nltk.download('averaged_perceptron_tagger')
|
31 |
+
nltk.download('stopwords')
|
32 |
+
nltk.download('punkt_tab')
|
33 |
+
nltk.download('averaged_perceptron_tagger_eng')
|
34 |
+
|
35 |
+
class Preprocessor:
|
36 |
+
def __init__(self,df):
|
37 |
+
self.df=df
|
38 |
+
self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
|
39 |
+
self.model = RobertaModel.from_pretrained('roberta-base')
|
40 |
+
self.stop_words = set(stopwords.words('english'))
|
41 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Add this line
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
def get_bert_embedding(self, text):
|
46 |
+
inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
|
47 |
+
with torch.no_grad():
|
48 |
+
outputs = self.model(**inputs)
|
49 |
+
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
50 |
+
|
51 |
+
def preprocess_text(self,text):
|
52 |
+
return text if pd.notna(text) else ""
|
53 |
+
|
54 |
+
|
55 |
+
def calculate_duration(self, time_range):
|
56 |
+
if not isinstance(time_range, str) or "-" not in time_range:
|
57 |
+
return None
|
58 |
+
start_str, end_str = time_range.split('-')
|
59 |
+
start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
|
60 |
+
end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
|
61 |
+
try:
|
62 |
+
start = datetime.strptime(start_str, '%H:%M')
|
63 |
+
end = datetime.strptime(end_str, '%H:%M')
|
64 |
+
duration = (end - start).total_seconds() / 3600
|
65 |
+
return duration if duration >= 0 else duration + 24
|
66 |
+
except ValueError:
|
67 |
+
return None
|
68 |
+
def calculate_sentiment_severity(self, text):
|
69 |
+
if pd.isna(text) or not text.strip():
|
70 |
+
return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})
|
71 |
+
|
72 |
+
# Get sentiment polarity (-1 to 1)
|
73 |
+
blob = TextBlob(text)
|
74 |
+
polarity = blob.sentiment.polarity
|
75 |
+
|
76 |
+
# Define severity weights
|
77 |
+
good_weight = 0.7
|
78 |
+
bad_weight = 0.3
|
79 |
+
|
80 |
+
if polarity > 0:
|
81 |
+
good_severity = good_weight * polarity
|
82 |
+
bad_severity = 0.0
|
83 |
+
elif polarity < 0:
|
84 |
+
good_severity = 0.0
|
85 |
+
bad_severity = bad_weight * abs(polarity)
|
86 |
+
else: # Neutral (polarity = 0)
|
87 |
+
good_severity = 0.0
|
88 |
+
bad_severity = 0.0
|
89 |
+
|
90 |
+
return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})
|
91 |
+
|
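# Worked example of the severity weighting above, using hypothetical polarity values:
# a review with TextBlob polarity +0.8 gets good_severity = 0.7 * 0.8 = 0.56 and
# bad_severity = 0.0; a polarity of -0.5 gives good_severity = 0.0 and
# bad_severity = 0.3 * 0.5 = 0.15; neutral text yields (0.0, 0.0).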
92 |
+
|
93 |
+
def get_avg_duration(self, hours_str):
|
94 |
+
if pd.isna(hours_str) or not isinstance(hours_str, str):
|
95 |
+
return pd.NA
|
96 |
+
try:
|
97 |
+
hours_dict = ast.literal_eval(hours_str)
|
98 |
+
if not hours_dict:
|
99 |
+
return pd.NA
|
100 |
+
durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
|
101 |
+
valid_durations = [d for d in durations if d is not None]
|
102 |
+
return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
|
103 |
+
except (ValueError, SyntaxError, ZeroDivisionError):
|
104 |
+
return pd.NA
|
105 |
+
|
106 |
+
|
107 |
+
def calculate_time_since_last_review(self):
|
108 |
+
present_date = datetime.now()
|
109 |
+
user_latest_timestamp = {}
|
110 |
+
|
111 |
+
# Convert review_date to datetime
|
112 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
113 |
+
|
114 |
+
# Calculate hours difference for each user's latest review
|
115 |
+
for user_id in self.df["user_id"].unique():
|
116 |
+
latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()
|
117 |
+
|
118 |
+
if not isinstance(latest_date, datetime):
|
119 |
+
latest_date = latest_date.to_pydatetime()
|
120 |
+
|
121 |
+
hours_difference = (present_date - latest_date).total_seconds() / 3600
|
122 |
+
user_latest_timestamp[user_id] = hours_difference
|
123 |
+
|
124 |
+
# Map the hours difference to a new column
|
125 |
+
self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)
|
126 |
+
|
127 |
+
def calculate_time_since_last_review_business(self):
|
128 |
+
present_date = datetime.now()
|
129 |
+
|
130 |
+
# Ensure review_date is in datetime format
|
131 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
132 |
+
|
133 |
+
# Initialize dictionary to store hours since last review for each business
|
134 |
+
business_latest_timestamp = {}
|
135 |
+
|
136 |
+
# Iterate over unique business_ids
|
137 |
+
for business_id in self.df["business_id"].unique():
|
138 |
+
# Get the latest review date for this business
|
139 |
+
latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()
|
140 |
+
|
141 |
+
# Convert to datetime object if needed
|
142 |
+
if not isinstance(latest_date, datetime):
|
143 |
+
latest_date = latest_date.to_pydatetime()
|
144 |
+
|
145 |
+
# Calculate hours difference (already in hours)
|
146 |
+
hours_difference = (present_date - latest_date).total_seconds() / 3600
|
147 |
+
business_latest_timestamp[business_id] = hours_difference
|
148 |
+
|
149 |
+
# Map the hours difference to the new column
|
150 |
+
self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
def calculate_user_account_age(self):
|
155 |
+
present_date = datetime.now()
|
156 |
+
|
157 |
+
# Convert yelping_since to datetime
|
158 |
+
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
|
159 |
+
|
160 |
+
# Calculate user account age in days
|
161 |
+
self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
|
162 |
+
|
163 |
+
|
164 |
+
    def calculate_avg_time_between_reviews(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by user_id and review_date to ensure chronological order
        self.df = self.df.sort_values(["user_id", "review_date"])

        # Helper to calculate the average time (in hours) between consecutive reviews
        def calculate_avg_time(group):
            if len(group) == 1:
                return 0  # If there is only one review, assign 0
            # Differences in hours between consecutive reviews
            diffs = group["review_date"].diff().dt.total_seconds() / 3600
            # Drop the first NaN (from diff) and compute the mean
            return diffs.dropna().mean()

        # Apply the helper to each user_id group and create a mapping
        avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)

        # Map the average time back to the original DataFrame
        self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)

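    # Worked example for the helper above (hypothetical dates, not part of the original file):
    # reviews at 2021-01-01 00:00, 2021-01-02 00:00 and 2021-01-04 00:00 give gaps of 24 h and
    # 48 h, so average_time_between_reviews = 36.0 for that user.
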
    def calculate_user_degree(self):
        # Number of unique businesses reviewed per user
        user_business_counts = self.df.groupby("user_id")["business_id"].nunique()

        # Map the counts back to the original DataFrame
        self.df["user_degree"] = self.df["user_id"].map(user_business_counts)

    def calculate_business_degree(self):
        # Number of unique users per business
        business_user_counts = self.df.groupby("business_id")["user_id"].nunique()

        # Map the counts back to the original DataFrame
        self.df["business_degree"] = self.df["business_id"].map(business_user_counts)

    def calculate_rating_variance_user(self):
        # Calculate the mode (most frequent rating) per user; note that despite the column
        # name, the value stored is the mode, not a variance
        user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])

        # Map the most frequent rating back to the original DataFrame
        self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)

    def calculate_user_review_burst_count(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by user_id and review_date for chronological order
        self.df = self.df.sort_values(["user_id", "review_date"])

        # Max number of reviews by one user inside any 20-day window
        def calculate_burst_count(group):
            if len(group) <= 1:
                return 0  # No burst if 1 or fewer reviews

            dates = group["review_date"]

            # Count the reviews that fall within 20 days after each review
            burst_counts = []
            for i, date in enumerate(dates):
                window_end = date + pd.Timedelta(days=20)
                count = ((dates >= date) & (dates <= window_end)).sum()
                burst_counts.append(count)

            # Return the maximum burst count for this user
            return max(burst_counts)

        # Calculate the burst count per user
        user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)

        # Map the burst count back to the original DataFrame
        self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)

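    # Illustrative alternative (a sketch, not part of the original file): the same forward-looking
    # 20-day window count can be computed without the inner Python loop by using numpy.searchsorted
    # on the sorted timestamps, e.g.
    #
    #     ts = np.sort(group["review_date"].values)
    #     counts = np.searchsorted(ts, ts + np.timedelta64(20, "D"), side="right") - np.arange(len(ts))
    #     return int(counts.max()) if len(ts) > 1 else 0
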
    def calculate_business_review_burst_count(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by business_id and review_date for chronological order
        self.df = self.df.sort_values(["business_id", "review_date"])

        # Max number of reviews for one business inside any 10-day window
        def calculate_burst_count(group):
            if len(group) <= 1:
                return 0  # No burst if 1 or fewer reviews

            dates = group["review_date"]

            # Count the reviews that fall within 10 days after each review
            burst_counts = []
            for i, date in enumerate(dates):
                window_end = date + pd.Timedelta(days=10)
                count = ((dates >= date) & (dates <= window_end)).sum()
                burst_counts.append(count)

            # Return the maximum burst count for this business
            return max(burst_counts)

        # Calculate the burst count per business
        business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)

        # Map the burst count back to the original DataFrame
        self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)

    def calculate_temporal_similarity(self):
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Extract the day of the week (0 = Monday, 6 = Sunday)
        self.df["day_of_week"] = self.df["review_date"].dt.dayofweek

        # Average hours between reviews posted on the user's most frequent day(s) of the week
        def calculate_avg_hours_on_frequent_days(group):
            frequent_days = group["day_of_week"].mode().tolist()

            if len(group) <= 1:
                return 0

            frequent_reviews = group[group["day_of_week"].isin(frequent_days)]

            if len(frequent_reviews) <= 1:
                return 0

            frequent_reviews = frequent_reviews.sort_values("review_date")
            diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600

            return diffs.dropna().mean()

        # Calculate the average hours for each user
        avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)

        # Map the average hours to the new column
        self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)

        # Drop the temporary column
        self.df = self.df.drop(columns=["day_of_week"])

    def calculate_rating_deviation_from_business_average(self):
        # Average rating per business
        business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()

        # Map the average rating to each row
        self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)

        # Deviation of each review from the business average
        self.df["rating_deviation_from_business_average"] = (
            self.df["review_stars"] - self.df["business_avg_rating"]
        )

        # Drop the temporary column
        self.df = self.df.drop(columns=["business_avg_rating"])

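    # Worked example (hypothetical values, not part of the original file): if a business
    # averages 4.2 stars and a review gives 1 star, the deviation is 1 - 4.2 = -3.2, which
    # later trips the |deviation| > 2 check in compute_fake_score.
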
    def calculate_review_like_ratio(self):
        # Binary column for liked reviews (stars >= 4)
        self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)

        # Like ratio per user
        user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()

        # Map the like ratio back to the DataFrame
        self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)

        # Drop the temporary column
        self.df = self.df.drop(columns=["is_liked"])

    def calculate_latest_checkin_hours(self):
        self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])

        # Get the latest check-in date from a comma-separated list of date strings
        def get_latest_checkin(checkin_list):
            if not checkin_list or pd.isna(checkin_list):  # Handle empty or NaN
                return None
            if isinstance(checkin_list, str):
                checkin_dates = checkin_list.split(", ")
            else:
                checkin_dates = checkin_list
            return pd.to_datetime(checkin_dates).max()

        # Latest check-in date per row
        self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)

        # Hours between the latest check-in and yelping_since
        self.df["latest_checkin_hours"] = (
            (self.df["latest_checkin_date"] - self.df["yelping_since"])
            .dt.total_seconds() / 3600
        )

        # Drop the temporary column and fill rows without check-ins
        self.df = self.df.drop(columns=["latest_checkin_date"])
        self.df["latest_checkin_hours"] = self.df["latest_checkin_hours"].fillna(0)

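    # Example of the parsing above (hypothetical value, not part of the original file):
    #     get_latest_checkin("2018-01-02 19:04:00, 2019-03-11 21:00:00")
    #     # -> Timestamp("2019-03-11 21:00:00"), i.e. the most recent check-in wins.
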
    def compute_pronoun_density(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0
        words = word_tokenize(text.lower())
        pos_tags = nltk.pos_tag(words)
        pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
        return pronouns / len(words) if words else 0

    def compute_avg_sentence_length(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0
        sentences = sent_tokenize(text)
        return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0

    def compute_excessive_punctuation(self, text):
        text = self.preprocess_text(text)
        return len(re.findall(r'[!?.]{2,}', text))

    def compute_sentiment_polarity(self, text):
        text = self.preprocess_text(text)
        return TextBlob(text).sentiment.polarity if text else 0

    def compute_code_switching_flag(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0

        tokens = self.tokenizer.tokenize(text.lower())
        if not tokens:
            return 0

        english_words = self.stop_words  # Use self.stop_words from __init__
        token_set = set(tokens)
        english_count = sum(1 for token in tokens if token in english_words)

        non_english_pattern = re.compile(r'[^\x00-\x7F]')
        has_non_ascii = 1 if non_english_pattern.search(text) else 0

        english_ratio = english_count / len(tokens) if tokens else 0

        non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)

        # Flag as code-switching if:
        # 1. Mixed English presence (ratio between 0.1 and 0.9)
        # 2. Non-ASCII characters present OR some non-English subword tokens
        if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
            return 1
        return 0

    def batch_tokenize(self, texts, batch_size=32, max_length=512):
        tokenized_outputs = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
            batch_texts = texts[i:i + batch_size]
            valid_texts = [self.preprocess_text(t) for t in batch_texts]
            # Tokenize with a fixed max_length to ensure consistent tensor sizes
            inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
            tokenized_outputs.append(inputs['input_ids'].to(self.device))  # Move to GPU
        # Concatenate on GPU with consistent sizes
        return torch.cat(tokenized_outputs, dim=0)

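    # Illustrative usage (assumed calling pattern, not part of the original file):
    #     ids = self.batch_tokenize(self.df["review_text"].tolist())
    #     ids.shape  # -> (number of reviews, 512): one padded row of token ids per review
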
    def compute_grammar_error_score(self, texts, tokenized_ids):
        print("Computing grammar error scores...")
        error_scores = np.zeros(len(texts), dtype=float)

        vocab_set = set(self.tokenizer.get_vocab().keys())
        for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
            if input_ids.sum() == 0:  # Empty input
                continue
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
            unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
            total_count = len([t for t in tokens if t not in self.stop_words])
            error_scores[i] = unknown_count / total_count if total_count > 0 else 0

        return error_scores

    def compute_repetitive_words_count(self, texts, tokenized_ids):
        print("Computing repetitive words counts...")
        rep_counts = np.zeros(len(texts), dtype=int)

        for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
            if input_ids.sum() == 0:  # Empty input
                continue
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
            valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
            if valid_tokens:
                token_counts = {}
                for token in valid_tokens:
                    token_counts[token] = token_counts.get(token, 0) + 1
                rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)

        return rep_counts

    def preprocess_text_for_similarity(self, text):
        if pd.isna(text) or not text.strip():
            return []
        return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]

    def batch_encode_words(self, texts, batch_size=32, max_length=512):
        word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
        vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}

        encoded_batches = []
        for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
            batch_words = word_lists[i:i + batch_size]
            encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
            for j, words in enumerate(batch_words):
                if words:
                    word_ids = [vocab.get(w, 0) for w in words][:max_length]
                    encoded[j, :len(word_ids)] = word_ids
            encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
            encoded_batches.append(encoded_tensor)

        return torch.cat(encoded_batches, dim=0), vocab

    def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
        all_texts = self.df["review_text"].tolist()
        all_users = self.df["user_id"].tolist()
        all_review_ids = self.df["review_id"].tolist()

        encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)

        similarity_scores = {rid: 0.0 for rid in all_review_ids}  # Default scores
        for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
            if pd.isna(review_id) or pd.isna(user_id):
                continue

            current_words = encoded_words[i]
            if current_words.sum() == 0:
                continue

            other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
                                         dtype=torch.long).to(self.device)
            if not other_indices.numel():
                continue

            other_words = encoded_words[other_indices]
            current_set = torch.unique(current_words[current_words > 0])
            other_flat = other_words[other_words > 0]

            if other_flat.numel() == 0:
                continue

            other_set = torch.unique(other_flat)
            intersection = torch.sum(torch.isin(current_set, other_set)).float()
            union = torch.unique(torch.cat([current_set, other_set])).numel()
            # Jaccard similarity over unique word ids; guard against an empty union
            similarity = (intersection / union).item() if union > 0 else 0.0

            similarity_scores[review_id] = similarity

        return pd.Series(similarity_scores, index=all_review_ids)

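    # The score above is a Jaccard similarity over unique word ids: |A ∩ B| / |A ∪ B|.
    # A minimal illustration (hypothetical token sets, not part of the original file):
    #     a = {"great", "food", "service"}
    #     b = {"great", "service", "price"}
    #     len(a & b) / len(a | b)  # -> 2 / 4 = 0.5
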
    def calculate_friend_count(self):
        friends = []
        for v in self.df["friends"]:
            if isinstance(v, str):
                friends.append(len(v.split(",")))
            else:
                # Missing friend lists (NaN or numeric placeholders) count as zero friends
                friends.append(0)
        self.df["friends"] = friends

    def count_elite_years(self, elite):
        if pd.isna(elite):
            return 0
        return len(str(elite).split(","))

    def transform_elite_status(self):
        # A user is treated as "elite" only if they have held elite status for more than one year
        self.df["elite"] = self.df["elite"].apply(lambda x: self.count_elite_years(x) > 1)
        self.df["elite"] = self.df["elite"].astype(int)

    def calculate_review_useful_funny_cool(self):
        self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
        self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
        self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
        self.df["review_useful_funny_cool"] = (
            self.df["review_useful"] +
            self.df["review_funny"] +
            self.df["review_cool"]
        )
        self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)

    def calculate_user_useful_funny_cool(self):
        self.df["user_useful_funny_cool"] = (
            self.df["user_useful"] +
            self.df["user_funny"] +
            self.df["user_cool"]
        )
        self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)

    def compute_fake_score(self, row):
        suspicion_points = 0

        # Linguistic features
        if row["pronoun_density"] < 0.01:  # Low personal engagement
            suspicion_points += 1
        if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30:  # Extreme lengths
            suspicion_points += 1
        if row["grammar_error_score"] > 5:  # Many errors
            suspicion_points += 1
        if row["repetitive_words_count"] > 5:  # High repetition
            suspicion_points += 1
        if row["code_switching_flag"] == 1:  # Language mixing
            suspicion_points += 1
        if row["excessive_punctuation_count"] > 3:  # Overuse of punctuation
            suspicion_points += 1
        if abs(row["sentiment_polarity"]) > 0.8:  # Extreme sentiment
            suspicion_points += 1

        # Review patterns
        if row["similarity_to_other_reviews"] > 0.8:  # High duplication
            suspicion_points += 1
        if row["user_review_burst_count"] > 5:  # Spammy bursts
            suspicion_points += 1
        if row["business_review_burst_count"] > 5:  # Targeted bursts
            suspicion_points += 1
        if abs(row["rating_deviation_from_business_average"]) > 2:  # Large rating deviation
            suspicion_points += 1
        if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1:  # Extreme like ratio
            suspicion_points += 1

        # User behavior
        if row["user_account_age"] < 30:  # Very new account (days)
            suspicion_points += 1
        if row["average_time_between_reviews"] < 24:  # Rapid reviews (hours)
            suspicion_points += 1
        if row["user_degree"] < 2:  # Low business interaction
            suspicion_points += 1
        if row["time_since_last_review_user"] < 24:  # Recent burst (hours)
            suspicion_points += 1

        # Threshold: 3 or more points = fake
        return 1 if suspicion_points >= 3 else 0

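    # Worked example of the heuristic above (hypothetical row, not part of the original file):
    # a 10-day-old account (user_account_age < 30, +1), reviews posted roughly every 2 hours
    # (average_time_between_reviews < 24, +1) and a review text with similarity 0.9 to other
    # reviews (> 0.8, +1) already reach the 3-point threshold, so the row is labelled fake (1).
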
    def run_pipeline(self):
        logger.info("FINALIZING HOURS COLUMN ...")
        self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
        self.df["hours"] = self.df["hours"].fillna(0)
        print(self.df["hours"][:10])
        print(self.df["hours"].isnull().sum())

        logger.info("FINALIZING ATTRIBUTES COLUMN ...")
        self.df.drop("attributes", axis=1, inplace=True)

        logger.info("CREATING time_since_last_review_user COLUMN ...")
        self.calculate_time_since_last_review()
        print(np.unique(self.df["time_since_last_review_user"]))

        logger.info("CREATING time_since_last_review_business COLUMN ...")
        self.calculate_time_since_last_review_business()
        print(np.unique(self.df["time_since_last_review_business"]))

        logger.info("CREATING user_account_age COLUMN ...")
        self.calculate_user_account_age()
        print(np.unique(self.df["user_account_age"]))

        logger.info("CREATING average_time_between_reviews COLUMN ...")
        self.calculate_avg_time_between_reviews()
        print(np.unique(self.df["average_time_between_reviews"]))

        logger.info("CREATING user_degree COLUMN ...")
        self.calculate_user_degree()
        print(np.unique(self.df["user_degree"]))

        logger.info("CREATING business_degree COLUMN ...")
        self.calculate_business_degree()
        print(np.unique(self.df["business_degree"]))

        logger.info("CREATING rating_variance_user COLUMN ...")
        self.calculate_rating_variance_user()
        print(np.unique(self.df["rating_variance_user"]))

        logger.info("CREATING user_review_burst_count COLUMN ...")
        self.calculate_user_review_burst_count()
        print(np.unique(self.df["user_review_burst_count"]))

        logger.info("CREATING business_review_burst_count COLUMN ...")
        self.calculate_business_review_burst_count()
        print(np.unique(self.df["business_review_burst_count"]))

        logger.info("CREATING temporal_similarity COLUMN ...")
        self.calculate_temporal_similarity()
        print(np.unique(self.df["temporal_similarity"]))

        logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
        self.calculate_rating_deviation_from_business_average()
        print(np.unique(self.df["rating_deviation_from_business_average"]))

        logger.info("CREATING review_like_ratio COLUMN ...")
        self.calculate_review_like_ratio()
        print(np.unique(self.df["review_like_ratio"]))

        logger.info("CREATING latest_checkin_hours COLUMN ...")
        self.calculate_latest_checkin_hours()
        print(np.unique(self.df["latest_checkin_hours"]))

logger.info("CREATING pronoun_density COLUMN ...")
|
696 |
+
self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
|
697 |
+
print(np.unique(self.df["pronoun_density"] ))
|
698 |
+
|
699 |
+
logger.info("CREATING avg_sentence_length COLUMN ...")
|
700 |
+
self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
|
701 |
+
print(np.unique(self.df["avg_sentence_length"] ))
|
702 |
+
|
703 |
+
logger.info("CREATING excessive_punctuation_count COLUMN ...")
|
704 |
+
self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
|
705 |
+
print(np.unique(self.df["excessive_punctuation_count"] ))
|
706 |
+
|
707 |
+
logger.info("CREATING sentiment_polarity COLUMN ...")
|
708 |
+
self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
|
709 |
+
print(np.unique(self.df["sentiment_polarity"] ))
|
710 |
+
|
711 |
+
logger.info("CREATING good_severity and bad_severity COLUMNS ...")
|
712 |
+
severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
|
713 |
+
self.df[["good_severity", "bad_severity"]] = severity_scores
|
714 |
+
print(np.unique(self.df["good_severity"] ))
|
715 |
+
print(np.unique(self.df["bad_severity"] ))
|
716 |
+
|
717 |
+
|
718 |
+
logger.info("CREATING code_switching_flag COLUMN ...")
|
719 |
+
self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
|
720 |
+
print(np.unique(self.df["code_switching_flag"] ))
|
721 |
+
|
722 |
+
|
723 |
+
all_texts = self.df["review_text"].tolist()
|
724 |
+
tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
|
725 |
+
|
726 |
+
logger.info("CREATING grammar_error_score COLUMN ...")
|
727 |
+
self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
|
728 |
+
print(np.unique(self.df["grammar_error_score"] ))
|
729 |
+
|
730 |
+
|
731 |
+
logger.info("CREATING repetitive_words_count COLUMN ...")
|
732 |
+
self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
|
733 |
+
print(np.unique(self.df["repetitive_words_count"] ))
|
734 |
+
|
735 |
+
|
736 |
+
|
737 |
+
logger.info("CREATING similarity_to_other_reviews COLUMN ...")
|
738 |
+
similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
|
739 |
+
self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
|
740 |
+
|
741 |
+
print(np.unique(self.df["similarity_to_other_reviews"] ))
|
742 |
+
|
743 |
+
|
744 |
+
|
745 |
+
logger.info("CREATING friends COLUMN ...")
|
746 |
+
self.calculate_friend_count()
|
747 |
+
print(self.df["friends"].value_counts())
|
748 |
+
|
749 |
+
logger.info("CREATING elite COLUMN ...")
|
750 |
+
self.transform_elite_status()
|
751 |
+
print(self.df["elite"].value_counts())
|
752 |
+
|
753 |
+
|
754 |
+
logger.info("CREATING review_useful_funny_cool COLUMN ...")
|
755 |
+
self.calculate_review_useful_funny_cool()
|
756 |
+
print(self.df["review_useful_funny_cool"].value_counts())
|
757 |
+
|
758 |
+
|
759 |
+
logger.info("CREATING user_useful_funny_cool COLUMN ...")
|
760 |
+
self.calculate_user_useful_funny_cool()
|
761 |
+
print(self.df["user_useful_funny_cool"].value_counts())
|
762 |
+
|
763 |
+
|
764 |
+
logger.info("CREATING LABEL COLUMN ...")
|
765 |
+
self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
|
766 |
+
print(self.df["fake"].value_counts())
|
767 |
+
|
768 |
+
|
769 |
+
logger.info("SEEING NULL VALUES IN FINAL COLUMNS.....")
|
770 |
+
print(set(self.df.isnull().sum().values))
|
771 |
+
|
772 |
+
|
773 |
+
|
774 |
+
|
775 |
+
return self.df
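
# Hypothetical driver code (illustrative only; the class name and constructor signature are
# assumptions, since the class definition appears earlier in this file, outside this excerpt):
#     pre = Preprocessor(df)            # df: merged review/user/business DataFrame
#     features_df = pre.run_pipeline()  # adds the engineered columns and the "fake" label
#     features_df.to_csv("preprocessed_reviews.csv", index=False)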