OVH committed on
Commit 67b1c6c · 1 Parent(s): 12ab6cd

Added all the files

Files changed (41)
  1. .ipynb_checkpoints/Dockerfile-checkpoint +0 -0
  2. .ipynb_checkpoints/app-checkpoint.py +281 -0
  3. .ipynb_checkpoints/main-checkpoint.py +132 -0
  4. Dockerfile +0 -0
  5. app.py +281 -0
  6. main.py +132 -0
  7. requirements.txt +19 -0
  8. src/.ipynb_checkpoints/analyze_yelp_data-checkpoint.py +320 -0
  9. src/.ipynb_checkpoints/clean_data-checkpoint.py +77 -0
  10. src/.ipynb_checkpoints/create_dataset-checkpoint.py +217 -0
  11. src/.ipynb_checkpoints/feature_analyzer-checkpoint.py +212 -0
  12. src/.ipynb_checkpoints/model-checkpoint.py +541 -0
  13. src/.ipynb_checkpoints/model_trainer-checkpoint.py +35 -0
  14. src/.ipynb_checkpoints/preprocessing-checkpoint.py +831 -0
  15. src/__pycache__/analyze_yelp_data.cpython-311.pyc +0 -0
  16. src/__pycache__/clean_data.cpython-311.pyc +0 -0
  17. src/__pycache__/clean_data.cpython-39.pyc +0 -0
  18. src/__pycache__/create_dataset.cpython-311.pyc +0 -0
  19. src/__pycache__/create_dataset.cpython-39.pyc +0 -0
  20. src/__pycache__/data_balancing.cpython-311.pyc +0 -0
  21. src/__pycache__/feature_analyzer.cpython-311.pyc +0 -0
  22. src/__pycache__/feature_analyzer.cpython-39.pyc +0 -0
  23. src/__pycache__/feature_importance.cpython-311.pyc +0 -0
  24. src/__pycache__/model.cpython-311.pyc +0 -0
  25. src/__pycache__/model.cpython-39.pyc +0 -0
  26. src/__pycache__/model1.cpython-311.pyc +0 -0
  27. src/__pycache__/model1.cpython-39.pyc +0 -0
  28. src/__pycache__/model3.cpython-311.pyc +0 -0
  29. src/__pycache__/model3.cpython-39.pyc +0 -0
  30. src/__pycache__/model_trainer.cpython-311.pyc +0 -0
  31. src/__pycache__/model_trainer.cpython-39.pyc +0 -0
  32. src/__pycache__/models.cpython-311.pyc +0 -0
  33. src/__pycache__/preprocessing.cpython-311.pyc +0 -0
  34. src/__pycache__/preprocessing.cpython-39.pyc +0 -0
  35. src/analyze_yelp_data.py +320 -0
  36. src/clean_data.py +83 -0
  37. src/create_dataset.py +217 -0
  38. src/feature_analyzer.py +212 -0
  39. src/model.py +540 -0
  40. src/model_trainer.py +35 -0
  41. src/preprocessing.py +832 -0
.ipynb_checkpoints/Dockerfile-checkpoint ADDED
File without changes
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,281 @@
1
+ from flask import Flask, request, jsonify
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch_geometric.data import HeteroData
6
+ import numpy as np
7
+ import pandas as pd
8
+ import networkx as nx
9
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
10
+ from sklearn.model_selection import train_test_split
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ from loguru import logger
14
+ from huggingface_hub import hf_hub_download
15
+ import json
16
+ from preprocessing_test import Preprocessor
17
+ from src.model import *
18
+ from main import start_pipelines
19
+
20
+ app = Flask(__name__)
21
+
22
+ # Define default values for each column
23
+ default_values = {
24
+ 'review_id': 'KU_O5udG6zpxOg-VcAEodg',
25
+ 'user_id': 'mh_-eMZ6K5RLWhZyISBhwA',
26
+ 'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw',
27
+ 'review_stars': 0,
28
+ 'review_useful': 0,
29
+ 'review_funny': 0,
30
+ 'review_cool': 0,
31
+ 'review_text': 'It was a moderate experience',
32
+ 'review_date': 1531001351000,
33
+ 'business_name': 'Coffee at LA',
34
+ 'address': '1460 LA',
35
+ 'city': 'LA',
36
+ 'state': 'CA',
37
+ 'postal_code': '00000',
38
+ 'latitude': 0.0,
39
+ 'longitude': 0.0,
40
+ 'business_stars': 0.0,
41
+ 'business_review_count': 0,
42
+ 'is_open': 0,
43
+ 'attributes': '{}',
44
+ 'categories': 'Restaurants',
45
+ 'hours': '{"Monday": "7:0-20:0", "Tuesday": "7:0-20:0", "Wednesday": "7:0-20:0", "Thursday": "7:0-20:0", "Friday": "7:0-21:0", "Saturday": "7:0-21:0", "Sunday": "7:0-21:0"}',
46
+ 'user_name': 'default_user',
47
+ 'user_review_count': 0,
48
+ 'yelping_since': '2023-01-01 00:00:00',
49
+ 'user_useful': 0,
50
+ 'user_funny': 0,
51
+ 'user_cool': 0,
52
+ 'elite': '2024,2025',
53
+ 'friends': '',
54
+ 'fans': 0,
55
+ 'average_stars': 0.0,
56
+ 'compliment_hot': 0,
57
+ 'compliment_more': 0,
58
+ 'compliment_profile': 0,
59
+ 'compliment_cute': 0,
60
+ 'compliment_list': 0,
61
+ 'compliment_note': 0,
62
+ 'compliment_plain': 0,
63
+ 'compliment_cool': 0,
64
+ 'compliment_funny': 0,
65
+ 'compliment_writer': 0,
66
+ 'compliment_photos': 0,
67
+ 'checkin_date': '2023-01-01 00:00:00',
68
+ 'tip_compliment_count': 0.0,
69
+ 'tip_count': 0.0
70
+ }
71
+
72
+ # Expected types for validation
73
+ expected_types = {
74
+ 'review_id': str,
75
+ 'user_id': str,
76
+ 'business_id': str,
77
+ 'review_stars': int,
78
+ 'review_useful': int,
79
+ 'review_funny': int,
80
+ 'review_cool': int,
81
+ 'review_text': str,
82
+ 'review_date': int,
83
+ 'business_name': str,
84
+ 'address': str,
85
+ 'city': str,
86
+ 'state': str,
87
+ 'postal_code': str,
88
+ 'latitude': float,
89
+ 'longitude': float,
90
+ 'business_stars': float,
91
+ 'business_review_count': int,
92
+ 'is_open': int,
93
+ 'attributes': dict, # Assuming string representation of dict
94
+ 'categories': str,
95
+ 'hours': dict, # Assuming string representation of dict
96
+ 'user_name': str,
97
+ 'user_review_count': int,
98
+ 'yelping_since': str,
99
+ 'user_useful': int,
100
+ 'user_funny': int,
101
+ 'user_cool': int,
102
+ 'elite': str,
103
+ 'friends': str,
104
+ 'fans': int,
105
+ 'average_stars': float,
106
+ 'compliment_hot': int,
107
+ 'compliment_more': int,
108
+ 'compliment_profile': int,
109
+ 'compliment_cute': int,
110
+ 'compliment_list': int,
111
+ 'compliment_note': int,
112
+ 'compliment_plain': int,
113
+ 'compliment_cool': int,
114
+ 'compliment_funny': int,
115
+ 'compliment_writer': int,
116
+ 'compliment_photos': int,
117
+ 'checkin_date': str,
118
+ 'tip_compliment_count': float,
119
+ 'tip_count': float
120
+ }
121
+
122
+ @app.route('/predict', methods=['POST'])
123
+ def predict():
124
+ try:
125
+ # Check if request contains JSON data
126
+ if not request.json:
127
+ return jsonify({'error': 'Request must contain JSON data'}), 400
128
+
129
+ data = request.json
130
+
131
+ # Extract train, test, and test_size with defaults
132
+ train = data.get('train', False)
133
+ test = data.get('test', False)
134
+ test_size = float(data.get('test_size', 0.1))
135
+
136
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
137
+
138
+ # Handle training mode
139
+ if train in (True, 'true', 'True'):
140
+ start_pipelines(train_size=test_size)  # main.start_pipelines takes a train_size keyword
141
+ logger.info("PIPELINES FINISHED SUCCESSFULLY")
142
+ return jsonify({
143
+ 'message': 'Training pipelines executed successfully',
144
+ 'test_size': test_size
145
+ }), 200
146
+
147
+ # Handle testing/inference mode
148
+ elif test in (True, 'true', 'True'):
149
+ REPO_ID = "Askhedi/graphformermodel"
150
+ MODEL_FILENAME = "model_GraphformerModel_latest.pth"
151
+ model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
152
+
153
+ # Load model
154
+ model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(device)
155
+ model.load_state_dict(torch.load(model_path, map_location=device))
156
+ model.eval()
157
+
158
+ # Process input data from JSON
159
+ row = {}
160
+ warnings = []
161
+ for col, expected_type in expected_types.items():
162
+ value = data.get(col, default_values[col])
163
+ try:
164
+ if value == "" or value is None:
165
+ row[col] = default_values[col]
166
+ elif col in ['attributes', 'hours']:
167
+ # Expect a valid JSON string that parses to a dict
168
+ if isinstance(value, str):
169
+ parsed = json.loads(value)
170
+ if not isinstance(parsed, dict):
171
+ raise ValueError
172
+ row[col] = value # Keep as string for Preprocessor
173
+ else:
174
+ raise ValueError
175
+ else:
176
+ row[col] = expected_type(value)
177
+ except (ValueError, TypeError, json.JSONDecodeError):
178
+ row[col] = default_values[col]
179
+ warnings.append(f"Invalid input for '{col}' (expected {expected_type.__name__}), using default value: {default_values[col]}")
180
+
181
+ # Convert dictionaries to strings before passing to DataFrame
182
+ for col in ['attributes', 'hours']:
183
+ if isinstance(row[col], dict):
184
+ row[col] = json.dumps(row[col])
185
+
186
+ # Create DataFrame from input
187
+ input_df = pd.DataFrame([row])
188
+
189
+ # Preprocess using Preprocessor
190
+ preprocessor = Preprocessor(input_df)
191
+ processed_df = preprocessor.run_pipeline()
192
+ logger.info(f"PREPROCESSING COMPLETED VALUES ARE {processed_df}")
193
+
194
+ # Build standalone graph from processed data
195
+ num_users = 1
196
+ num_businesses = 1
197
+ num_rows = 1
198
+
199
+ graph = HeteroData()
200
+ features = torch.tensor(processed_df.drop(columns=['user_id', 'review_id', 'business_id']).values, dtype=torch.float, device=device)
201
+ time_since_user = torch.tensor(processed_df['time_since_last_review_user'].values, dtype=torch.float, device=device)
202
+ time_since_business = torch.tensor(processed_df['time_since_last_review_business'].values, dtype=torch.float, device=device)
203
+
204
+ user_indices = torch.tensor([0], dtype=torch.long, device=device)
205
+ business_indices = torch.tensor([0], dtype=torch.long, device=device)
206
+ review_indices = torch.tensor([0], dtype=torch.long, device=device)
207
+
208
+ user_feats = torch.zeros(num_users, 14, device=device)
209
+ business_feats = torch.zeros(num_businesses, 8, device=device)
210
+ review_feats = torch.zeros(num_rows, 16, device=device)
211
+
212
+ user_feats[0] = features[0, :14]
213
+ business_feats[0] = features[0, 14:22]
214
+ review_feats[0] = features[0, 22:38]
215
+
216
+ graph['user'].x = user_feats
217
+ graph['business'].x = business_feats
218
+ graph['review'].x = review_feats
219
+
220
+ graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
221
+ graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
222
+
223
+ # Compute encodings
224
+ G = nx.DiGraph()
225
+ node_type_map = {0: 'user', 1: 'business', 2: 'review'}
226
+ G.add_nodes_from([0, 1, 2])
227
+ G.add_edge(0, 2) # user -> review
228
+ G.add_edge(2, 1) # review -> business
229
+
230
+ num_nodes = 3
231
+ spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=device)
232
+ for i in range(num_nodes):
233
+ for j in range(num_nodes):
234
+ if i == j:
235
+ spatial_encoding[i, j] = 0
236
+ elif nx.has_path(G, i, j):
237
+ spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)
238
+
239
+ centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=device).view(-1, 1)
240
+
241
+ edge_features_dict = {}
242
+ user_writes_edge = graph['user', 'writes', 'review'].edge_index
243
+ review_about_edge = graph['review', 'about', 'business'].edge_index
244
+
245
+ edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
246
+ time_since_user[user_writes_edge[0]], time_since_user[user_writes_edge[1]],
247
+ user_indices[user_writes_edge[0]], user_indices[user_writes_edge[0]]
248
+ )
249
+ edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
250
+ time_since_business[review_about_edge[0]], time_since_business[review_about_edge[1]],
251
+ torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
252
+ )
253
+
254
+ time_since_dict = {
255
+ 'user': torch.tensor([time_since_user[0]], dtype=torch.float, device=device)
256
+ }
257
+
258
+ # Inference
259
+ with torch.no_grad():
260
+ out = model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
261
+ pred_label = 1 if out.squeeze().item() > 0.5 else 0
262
+ prob = out.squeeze().item()
263
+
264
+ # Combine warnings and result
265
+ result = {
266
+ 'warnings': warnings,
267
+ 'prediction': 'Fake' if pred_label == 1 else 'Not Fake',
268
+ 'probability': float(prob)
269
+ }
270
+ return jsonify(result), 200
271
+
272
+ else:
273
+ return jsonify({
274
+ 'error': 'Either "train" or "test" must be set to true'
275
+ }), 400
276
+
277
+ except Exception as e:
278
+ return jsonify({'error': str(e)}), 500
279
+
280
+ if __name__ == '__main__':
281
+ app.run(debug=True, host='0.0.0.0', port=5000)
.ipynb_checkpoints/main-checkpoint.py ADDED
@@ -0,0 +1,132 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ import logging
5
+ import sys
6
+ from datetime import datetime
7
+ import warnings
8
+ import gc
9
+ import json
10
+
11
+ from loguru import logger
12
+ from src.create_dataset import process_datasets
13
+ from src.preprocessing import Preprocessor
14
+ from src.clean_data import DataCleaner
15
+ from src.feature_analyzer import FeatureAnalyzer
16
+ from src.model_trainer import ModelTrainer
17
+ from pathlib import Path
18
+
19
+
20
+ def create_directories():
21
+ """Create all necessary directories for the pipeline"""
22
+ directories = {
23
+ 'combined_data': Path('output_files/combined_data'),
24
+ 'preprocessed': Path('output_files/cleaned_preprocessed_data'),
25
+ 'feature_analyzer': Path('output_files/feature_analysis'),
26
+ 'model_outputs': Path('output_files/model_outputs'),
27
+
28
+ }
29
+
30
+ for dir_path in directories.values():
31
+ dir_path.mkdir(parents=True, exist_ok=True)
32
+
33
+ return directories
34
+
35
+ def handle_memory():
36
+ """Handle memory management"""
37
+ gc.collect()
38
+ warnings.filterwarnings('ignore')
39
+
40
+ def save_pipeline_metrics(metrics: dict, filepath: Path):
41
+ """Save pipeline metrics to JSON file"""
42
+ with open(filepath, 'w') as f:
43
+ json.dump(metrics, f, indent=4, default=str)
44
+
45
+ def start_pipelines(train_size=0.25):
46
+ # Setup logging
47
+ logger.info("STARTING YELP DATA ANALYSIS PIPELINES...")
48
+ dirs = create_directories()
49
+ logger.info("Created necessary directories")
50
+
51
+
52
+
53
+
54
+
55
+ logger.info("Pipeline 1: Creating initial dataset...")
56
+ try:
57
+ filename="combined_merged_full.csv"
58
+ df = process_datasets(output_path=dirs['combined_data'],filename=filename)
59
+
60
+ logger.info(f"Dataset created successfully with shape: {df.shape}")
61
+ except Exception as e:
62
+ logger.error(f"Error in dataset creation: {str(e)}")
63
+
64
+
65
+
66
+
67
+ try:
68
+ logger.info("Pipeline 2: Preprocessing and Feature Engineering....")
69
+ output_before_preprocess=Path(str(dirs['combined_data']) )/ "combined_merged_full.csv"
70
+ df = pd.read_csv(output_before_preprocess)
71
+ prep=Preprocessor(df)
72
+ feature_engineered_df=prep.run_pipeline()
73
+
74
+ except Exception as e:
75
+ logger.error(f"Error in Pipeline 2 (Preprocessing and Feature Engineering): {e}")
76
+
77
+
78
+ try:
79
+ logger.info("Pipeline 3: Cleaning data...")
80
+ filename="preprocessed_cleaned.csv"
81
+
82
+ cleaner = DataCleaner(df=feature_engineered_df,output_path=str(dirs['preprocessed']),filename=filename)
83
+ cleaner.run_pipeline()
84
+ clean_output_file_path = Path(str(dirs['preprocessed']) )/ filename
85
+ print("Preprocessed and cleaned data saved in", clean_output_file_path)
86
+
87
+
88
+
89
+
90
+ except Exception as e:
91
+ logger.error(f"Error in Pipeline 3 Cleaning Data : {str(e)}")
92
+
93
+
94
+
95
+ try:
96
+ logger.info("Pipeline 4: Analyzing features...")
97
+ filename="preprocessed_cleaned.csv"
98
+ preprocessed_clean_output_file=Path(str(dirs['preprocessed']) )/ filename
99
+ preprocessed_clean_df=pd.read_csv(preprocessed_clean_output_file)
100
+
101
+ analyzer = FeatureAnalyzer(df=preprocessed_clean_df,output_path=str(dirs['feature_analyzer']))
102
+ analyzer.run_pipeline()
103
+ except Exception as e:
104
+ logger.error(f"Error in Feature analysis: {str(e)}")
105
+ raise
106
+
107
+
108
+ try:
109
+ logger.info("Pipeline 5 : Training and Evaluating Models...")
110
+ filename="preprocessed_cleaned.csv"
111
+ preprocessed_clean_output_file=Path(str(dirs['preprocessed']) )/ filename
112
+ preprocessed_clean_df=pd.read_csv(preprocessed_clean_output_file)
113
+ preprocessed_clean_df = preprocessed_clean_df.sample(frac=1, random_state=42).reset_index(drop=True)
114
+ size=int(train_size*len(preprocessed_clean_df))
115
+
116
+ preprocessed_clean_df=preprocessed_clean_df.iloc[:size,:]
117
+
118
+
119
+
120
+ trainer = ModelTrainer(df=preprocessed_clean_df,output_path=str(dirs['model_outputs']), epochs=50,test_size=0.3)
121
+ trainer.train_and_evaluate()
122
+
123
+ logger.info("Model training completed")
124
+ except Exception as e:
125
+ logger.error(f"Error in Model Trainer: {str(e)}")
126
+
127
+
128
+
129
+
130
+
131
+
132
+
Dockerfile ADDED
File without changes
app.py ADDED
@@ -0,0 +1,281 @@
1
+ from flask import Flask, request, jsonify
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from torch_geometric.data import HeteroData
6
+ import numpy as np
7
+ import pandas as pd
8
+ import networkx as nx
9
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
10
+ from sklearn.model_selection import train_test_split
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ from loguru import logger
14
+ from huggingface_hub import hf_hub_download
15
+ import json
16
+ from preprocessing_test import Preprocessor
17
+ from src.model import *
18
+ from main import start_pipelines
19
+
20
+ app = Flask(__name__)
21
+
22
+ # Define default values for each column
23
+ default_values = {
24
+ 'review_id': 'KU_O5udG6zpxOg-VcAEodg',
25
+ 'user_id': 'mh_-eMZ6K5RLWhZyISBhwA',
26
+ 'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw',
27
+ 'review_stars': 0,
28
+ 'review_useful': 0,
29
+ 'review_funny': 0,
30
+ 'review_cool': 0,
31
+ 'review_text': 'It was a moderate experience',
32
+ 'review_date': 1531001351000,
33
+ 'business_name': 'Coffee at LA',
34
+ 'address': '1460 LA',
35
+ 'city': 'LA',
36
+ 'state': 'CA',
37
+ 'postal_code': '00000',
38
+ 'latitude': 0.0,
39
+ 'longitude': 0.0,
40
+ 'business_stars': 0.0,
41
+ 'business_review_count': 0,
42
+ 'is_open': 0,
43
+ 'attributes': '{}',
44
+ 'categories': 'Restaurants',
45
+ 'hours': '{"Monday": "7:0-20:0", "Tuesday": "7:0-20:0", "Wednesday": "7:0-20:0", "Thursday": "7:0-20:0", "Friday": "7:0-21:0", "Saturday": "7:0-21:0", "Sunday": "7:0-21:0"}',
46
+ 'user_name': 'default_user',
47
+ 'user_review_count': 0,
48
+ 'yelping_since': '2023-01-01 00:00:00',
49
+ 'user_useful': 0,
50
+ 'user_funny': 0,
51
+ 'user_cool': 0,
52
+ 'elite': '2024,2025',
53
+ 'friends': '',
54
+ 'fans': 0,
55
+ 'average_stars': 0.0,
56
+ 'compliment_hot': 0,
57
+ 'compliment_more': 0,
58
+ 'compliment_profile': 0,
59
+ 'compliment_cute': 0,
60
+ 'compliment_list': 0,
61
+ 'compliment_note': 0,
62
+ 'compliment_plain': 0,
63
+ 'compliment_cool': 0,
64
+ 'compliment_funny': 0,
65
+ 'compliment_writer': 0,
66
+ 'compliment_photos': 0,
67
+ 'checkin_date': '2023-01-01 00:00:00',
68
+ 'tip_compliment_count': 0.0,
69
+ 'tip_count': 0.0
70
+ }
71
+
72
+ # Expected types for validation
73
+ expected_types = {
74
+ 'review_id': str,
75
+ 'user_id': str,
76
+ 'business_id': str,
77
+ 'review_stars': int,
78
+ 'review_useful': int,
79
+ 'review_funny': int,
80
+ 'review_cool': int,
81
+ 'review_text': str,
82
+ 'review_date': int,
83
+ 'business_name': str,
84
+ 'address': str,
85
+ 'city': str,
86
+ 'state': str,
87
+ 'postal_code': str,
88
+ 'latitude': float,
89
+ 'longitude': float,
90
+ 'business_stars': float,
91
+ 'business_review_count': int,
92
+ 'is_open': int,
93
+ 'attributes': dict, # Assuming string representation of dict
94
+ 'categories': str,
95
+ 'hours': dict, # Assuming string representation of dict
96
+ 'user_name': str,
97
+ 'user_review_count': int,
98
+ 'yelping_since': str,
99
+ 'user_useful': int,
100
+ 'user_funny': int,
101
+ 'user_cool': int,
102
+ 'elite': str,
103
+ 'friends': str,
104
+ 'fans': int,
105
+ 'average_stars': float,
106
+ 'compliment_hot': int,
107
+ 'compliment_more': int,
108
+ 'compliment_profile': int,
109
+ 'compliment_cute': int,
110
+ 'compliment_list': int,
111
+ 'compliment_note': int,
112
+ 'compliment_plain': int,
113
+ 'compliment_cool': int,
114
+ 'compliment_funny': int,
115
+ 'compliment_writer': int,
116
+ 'compliment_photos': int,
117
+ 'checkin_date': str,
118
+ 'tip_compliment_count': float,
119
+ 'tip_count': float
120
+ }
121
+
122
+ @app.route('/predict', methods=['POST'])
123
+ def predict():
124
+ try:
125
+ # Check if request contains JSON data
126
+ if not request.json:
127
+ return jsonify({'error': 'Request must contain JSON data'}), 400
128
+
129
+ data = request.json
130
+
131
+ # Extract train, test, and test_size with defaults
132
+ train = data.get('train', False)
133
+ test = data.get('test', False)
134
+ test_size = float(data.get('test_size', 0.1))
135
+
136
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
137
+
138
+ # Handle training mode
139
+ if train in (True, 'true', 'True'):
140
+ start_pipelines(train_size=test_size)  # main.start_pipelines takes a train_size keyword
141
+ logger.info("PIPELINES FINISHED SUCCESSFULLY")
142
+ return jsonify({
143
+ 'message': 'Training pipelines executed successfully',
144
+ 'test_size': test_size
145
+ }), 200
146
+
147
+ # Handle testing/inference mode
148
+ elif test in (True, 'true', 'True'):
149
+ REPO_ID = "Askhedi/graphformermodel"
150
+ MODEL_FILENAME = "model_GraphformerModel_latest.pth"
151
+ model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
152
+
153
+ # Load model
154
+ model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(device)
155
+ model.load_state_dict(torch.load(model_path, map_location=device))
156
+ model.eval()
157
+
158
+ # Process input data from JSON
159
+ row = {}
160
+ warnings = []
161
+ for col, expected_type in expected_types.items():
162
+ value = data.get(col, default_values[col])
163
+ try:
164
+ if value == "" or value is None:
165
+ row[col] = default_values[col]
166
+ elif col in ['attributes', 'hours']:
167
+ # Expect a valid JSON string that parses to a dict
168
+ if isinstance(value, str):
169
+ parsed = json.loads(value)
170
+ if not isinstance(parsed, dict):
171
+ raise ValueError
172
+ row[col] = value # Keep as string for Preprocessor
173
+ else:
174
+ raise ValueError
175
+ else:
176
+ row[col] = expected_type(value)
177
+ except (ValueError, TypeError, json.JSONDecodeError):
178
+ row[col] = default_values[col]
179
+ warnings.append(f"Invalid input for '{col}' (expected {expected_type.__name__}), using default value: {default_values[col]}")
180
+
181
+ # Convert dictionaries to strings before passing to DataFrame
182
+ for col in ['attributes', 'hours']:
183
+ if isinstance(row[col], dict):
184
+ row[col] = json.dumps(row[col])
185
+
186
+ # Create DataFrame from input
187
+ input_df = pd.DataFrame([row])
188
+
189
+ # Preprocess using Preprocessor
190
+ preprocessor = Preprocessor(input_df)
191
+ processed_df = preprocessor.run_pipeline()
192
+ logger.info(f"PREPROCESSING COMPLETED VALUES ARE {processed_df}")
193
+
194
+ # Build standalone graph from processed data
195
+ num_users = 1
196
+ num_businesses = 1
197
+ num_rows = 1
198
+
199
+ graph = HeteroData()
200
+ features = torch.tensor(processed_df.drop(columns=['user_id', 'review_id', 'business_id']).values, dtype=torch.float, device=device)
201
+ time_since_user = torch.tensor(processed_df['time_since_last_review_user'].values, dtype=torch.float, device=device)
202
+ time_since_business = torch.tensor(processed_df['time_since_last_review_business'].values, dtype=torch.float, device=device)
203
+
204
+ user_indices = torch.tensor([0], dtype=torch.long, device=device)
205
+ business_indices = torch.tensor([0], dtype=torch.long, device=device)
206
+ review_indices = torch.tensor([0], dtype=torch.long, device=device)
207
+
208
+ user_feats = torch.zeros(num_users, 14, device=device)
209
+ business_feats = torch.zeros(num_businesses, 8, device=device)
210
+ review_feats = torch.zeros(num_rows, 16, device=device)
211
+
212
+ user_feats[0] = features[0, :14]
213
+ business_feats[0] = features[0, 14:22]
214
+ review_feats[0] = features[0, 22:38]
215
+
216
+ graph['user'].x = user_feats
217
+ graph['business'].x = business_feats
218
+ graph['review'].x = review_feats
219
+
220
+ graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
221
+ graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
222
+
223
+ # Compute encodings
224
+ G = nx.DiGraph()
225
+ node_type_map = {0: 'user', 1: 'business', 2: 'review'}
226
+ G.add_nodes_from([0, 1, 2])
227
+ G.add_edge(0, 2) # user -> review
228
+ G.add_edge(2, 1) # review -> business
229
+
230
+ num_nodes = 3
231
+ spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=device)
232
+ for i in range(num_nodes):
233
+ for j in range(num_nodes):
234
+ if i == j:
235
+ spatial_encoding[i, j] = 0
236
+ elif nx.has_path(G, i, j):
237
+ spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)
238
+
239
+ centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=device).view(-1, 1)
240
+
241
+ edge_features_dict = {}
242
+ user_writes_edge = graph['user', 'writes', 'review'].edge_index
243
+ review_about_edge = graph['review', 'about', 'business'].edge_index
244
+
245
+ edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
246
+ time_since_user[user_writes_edge[0]], time_since_user[user_writes_edge[1]],
247
+ user_indices[user_writes_edge[0]], user_indices[user_writes_edge[0]]
248
+ )
249
+ edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
250
+ time_since_business[review_about_edge[0]], time_since_business[review_about_edge[1]],
251
+ torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
252
+ )
253
+
254
+ time_since_dict = {
255
+ 'user': torch.tensor([time_since_user[0]], dtype=torch.float, device=device)
256
+ }
257
+
258
+ # Inference
259
+ with torch.no_grad():
260
+ out = model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
261
+ pred_label = 1 if out.squeeze().item() > 0.5 else 0
262
+ prob = out.squeeze().item()
263
+
264
+ # Combine warnings and result
265
+ result = {
266
+ 'warnings': warnings,
267
+ 'prediction': 'Fake' if pred_label == 1 else 'Not Fake',
268
+ 'probability': float(prob)
269
+ }
270
+ return jsonify(result), 200
271
+
272
+ else:
273
+ return jsonify({
274
+ 'error': 'Either "train" or "test" must be set to true'
275
+ }), 400
276
+
277
+ except Exception as e:
278
+ return jsonify({'error': str(e)}), 500
279
+
280
+ if __name__ == '__main__':
281
+ app.run(debug=True, host='0.0.0.0', port=5000)
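
A minimal client sketch for the /predict endpoint defined above. It assumes the service is running locally on port 5000 (as configured in app.run); the URL, field values, and timeout shown here are placeholders, not part of the repository:

    import requests

    # Inference request: any field omitted here falls back to the defaults defined in app.py.
    payload = {
        "test": True,
        "review_text": "Great food, friendly staff, would come again.",
        "review_stars": 5,
        "user_id": "mh_-eMZ6K5RLWhZyISBhwA",
        "business_id": "XQfwVwDr-v0ZS3_CbbE5Xw",
    }
    resp = requests.post("http://localhost:5000/predict", json=payload, timeout=600)
    print(resp.status_code, resp.json())  # expected keys: warnings, prediction, probability

    # Training request: triggers the full pipeline via main.start_pipelines.
    resp = requests.post("http://localhost:5000/predict", json={"train": True, "test_size": 0.1})
    print(resp.json())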
main.py ADDED
@@ -0,0 +1,132 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ import logging
5
+ import sys
6
+ from datetime import datetime
7
+ import warnings
8
+ import gc
9
+ import json
10
+
11
+ from loguru import logger
12
+ from src.create_dataset import process_datasets
13
+ from src.preprocessing import Preprocessor
14
+ from src.clean_data import DataCleaner
15
+ from src.feature_analyzer import FeatureAnalyzer
16
+ from src.model_trainer import ModelTrainer
17
+ from pathlib import Path
18
+
19
+
20
+ def create_directories():
21
+ """Create all necessary directories for the pipeline"""
22
+ directories = {
23
+ 'combined_data': Path('output_files/combined_data'),
24
+ 'preprocessed': Path('output_files/cleaned_preprocessed_data'),
25
+ 'feature_analyzer': Path('output_files/feature_analysis'),
26
+ 'model_outputs': Path('output_files/model_outputs'),
27
+
28
+ }
29
+
30
+ for dir_path in directories.values():
31
+ dir_path.mkdir(parents=True, exist_ok=True)
32
+
33
+ return directories
34
+
35
+ def handle_memory():
36
+ """Handle memory management"""
37
+ gc.collect()
38
+ warnings.filterwarnings('ignore')
39
+
40
+ def save_pipeline_metrics(metrics: dict, filepath: Path):
41
+ """Save pipeline metrics to JSON file"""
42
+ with open(filepath, 'w') as f:
43
+ json.dump(metrics, f, indent=4, default=str)
44
+
45
+ def start_pipelines(train_size=0.25):
46
+ # Setup logging
47
+ logger.info("STARTING YELP DATA ANALYSIS PIPELINES...")
48
+ dirs = create_directories()
49
+ logger.info("Created necessary directories")
50
+
51
+
52
+
53
+
54
+
55
+ logger.info("Pipeline 1: Creating initial dataset...")
56
+ try:
57
+ filename="combined_merged_full.csv"
58
+ df = process_datasets(output_path=dirs['combined_data'],filename=filename)
59
+
60
+ logger.info(f"Dataset created successfully with shape: {df.shape}")
61
+ except Exception as e:
62
+ logger.error(f"Error in dataset creation: {str(e)}")
63
+
64
+
65
+
66
+
67
+ try:
68
+ logger.info("Pipeline 2: Preprocessing and Feature Engineering....")
69
+ output_before_preprocess=Path(str(dirs['combined_data']) )/ "combined_merged_full.csv"
70
+ df = pd.read_csv(output_before_preprocess)
71
+ prep=Preprocessor(df)
72
+ feature_engineered_df=prep.run_pipeline()
73
+
74
+ except Exception as e:
75
+ logger.error(f"Error in Pipeline 2 (Preprocessing and Feature Engineering): {e}")
76
+
77
+
78
+ try:
79
+ logger.info("Pipeline 3: Cleaning data...")
80
+ filename="preprocessed_cleaned.csv"
81
+
82
+ cleaner = DataCleaner(df=feature_engineered_df,output_path=str(dirs['preprocessed']),filename=filename)
83
+ cleaner.run_pipeline()
84
+ clean_output_file_path = Path(str(dirs['preprocessed']) )/ filename
85
+ print("Preprocessed and cleaned data saved in", clean_output_file_path)
86
+
87
+
88
+
89
+
90
+ except Exception as e:
91
+ logger.error(f"Error in Pipeline 3 Cleaning Data : {str(e)}")
92
+
93
+
94
+
95
+ try:
96
+ logger.info("Pipeline 4: Analyzing features...")
97
+ filename="preprocessed_cleaned.csv"
98
+ preprocessed_clean_output_file=Path(str(dirs['preprocessed']) )/ filename
99
+ preprocessed_clean_df=pd.read_csv(preprocessed_clean_output_file)
100
+
101
+ analyzer = FeatureAnalyzer(df=preprocessed_clean_df,output_path=str(dirs['feature_analyzer']))
102
+ analyzer.run_pipeline()
103
+ except Exception as e:
104
+ logger.error(f"Error in Feature analysis: {str(e)}")
105
+ raise
106
+
107
+
108
+ try:
109
+ logger.info("Pipeline 5 : Training and Evaluating Models...")
110
+ filename="preprocessed_cleaned.csv"
111
+ preprocessed_clean_output_file=Path(str(dirs['preprocessed']) )/ filename
112
+ preprocessed_clean_df=pd.read_csv(preprocessed_clean_output_file)
113
+ preprocessed_clean_df = preprocessed_clean_df.sample(frac=1, random_state=42).reset_index(drop=True)
114
+ size=int(train_size*len(preprocessed_clean_df))
115
+
116
+ preprocessed_clean_df=preprocessed_clean_df.iloc[:size,:]
117
+
118
+
119
+
120
+ trainer = ModelTrainer(df=preprocessed_clean_df,output_path=str(dirs['model_outputs']), epochs=50,test_size=0.3)
121
+ trainer.train_and_evaluate()
122
+
123
+ logger.info("Model training completed")
124
+ except Exception as e:
125
+ logger.error(f"Error in Model Trainer: {str(e)}")
126
+
127
+
128
+
129
+
130
+
131
+
132
+
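
For reference, a minimal sketch of driving the pipeline directly rather than through the Flask endpoint; train_size is the fraction of the shuffled, preprocessed data kept for model training, as in the code above:

    from main import start_pipelines

    # Runs pipelines 1-5 (dataset creation, preprocessing, cleaning, feature analysis, model training).
    # Outputs are written under output_files/, as created by create_directories().
    if __name__ == "__main__":
        start_pipelines(train_size=0.25)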
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ ujson
2
+ imblearn
3
+ scikit-learn==1.5.2
4
+ loguru
5
+ astropy
6
+ textblob
7
+ nltk
8
+ transformers
9
+ pandas
10
+ numpy
11
+ tqdm
12
+ pymongo
13
+ scikit-learn
14
+ torch
15
+ pathlib
16
+ torch-geometric
17
+ huggingface-hub
18
+ matplotlib
19
+ seaborn
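
Note that the modules added in this commit also import a few packages that are not listed here; based on the imports in app.py, src/clean_data.py and src/analyze_yelp_data.py, additions along these lines are likely needed (left unpinned; treat this as an assumption, not a tested set), and scikit-learn currently appears twice above:

    flask
    networkx
    scipy
    vaderSentiment
    detoxify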
src/.ipynb_checkpoints/analyze_yelp_data-checkpoint.py ADDED
@@ -0,0 +1,320 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from transformers import AutoTokenizer, AutoModel
4
+ import torch
5
+ from sklearn.ensemble import IsolationForest
6
+ from sklearn.preprocessing import StandardScaler
7
+ from textblob import TextBlob
8
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
9
+ from sklearn.feature_extraction.text import CountVectorizer
10
+ from sklearn.decomposition import PCA
11
+ import warnings
12
+ from typing import Dict, List, Tuple
13
+ import logging
14
+ from collections import Counter
15
+ from detoxify import Detoxify
16
+ import re
17
+ from datetime import datetime
18
+ import seaborn as sns
19
+ import matplotlib.pyplot as plt
20
+ from pathlib import Path
21
+ import json
22
+
23
+ class AdvancedYelpAnalyzer:
24
+ def __init__(self, df: pd.DataFrame):
25
+ """Initialize the analyzer with necessary models and configurations"""
26
+ self.df = df.copy()
27
+ self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
28
+ self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
29
+ self.vader = SentimentIntensityAnalyzer()
30
+ self.toxic_model = Detoxify('original')
31
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
32
+ self.bert_model.to(self.device)
33
+
34
+ # Configure logging
35
+ logging.basicConfig(level=logging.INFO)
36
+ self.logger = logging.getLogger(__name__)
37
+
38
+ def get_bert_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
39
+ """Generate BERT embeddings for text"""
40
+ embeddings = []
41
+
42
+ for i in range(0, len(texts), batch_size):
43
+ batch_texts = texts[i:i + batch_size]
44
+ encoded = self.bert_tokenizer(batch_texts,
45
+ padding=True,
46
+ truncation=True,
47
+ max_length=512,
48
+ return_tensors='pt')
49
+
50
+ with torch.no_grad():
51
+ encoded = {k: v.to(self.device) for k, v in encoded.items()}
52
+ outputs = self.bert_model(**encoded)
53
+ batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
54
+ embeddings.append(batch_embeddings)
55
+
56
+ return np.vstack(embeddings)
57
+
58
+ def analyze_sentiment(self) -> pd.DataFrame:
59
+ """Perform comprehensive sentiment analysis using multiple tools"""
60
+ self.logger.info("Starting sentiment analysis...")
61
+
62
+ # Calculate BERT embeddings for reviews
63
+ self.logger.info("Calculating BERT embeddings...")
64
+ review_texts = self.df['review_text'].fillna('').tolist()
65
+ bert_embeddings = self.get_bert_embeddings(review_texts)
66
+
67
+ # Calculate review length using BERT tokenizer
68
+ self.logger.info("Calculating tokenized lengths...")
69
+ self.df['review_length'] = self.df['review_text'].apply(
70
+ lambda x: len(self.bert_tokenizer.encode(str(x)))
71
+ )
72
+
73
+ # Store BERT embeddings mean and std as features
74
+ self.df['bert_embedding_mean'] = np.mean(bert_embeddings, axis=1)
75
+ self.df['bert_embedding_std'] = np.std(bert_embeddings, axis=1)
76
+
77
+ # TextBlob sentiment and subjectivity
78
+ self.df['textblob_polarity'] = self.df['review_text'].apply(
79
+ lambda x: TextBlob(str(x)).sentiment.polarity
80
+ )
81
+ self.df['textblob_subjectivity'] = self.df['review_text'].apply(
82
+ lambda x: TextBlob(str(x)).sentiment.subjectivity
83
+ )
84
+
85
+ # VADER sentiment with custom negative phrase handling
86
+ def get_enhanced_vader_scores(text):
87
+ # Custom negative phrases
88
+ negative_phrases = [
89
+ 'too long', 'way too long', 'waiting', 'changed our minds',
90
+ 'too many', 'took forever', 'took too long', 'waste of time',
91
+ 'not worth', 'disappointing', 'mediocre', 'needs improvement'
92
+ ]
93
+
94
+ # Get base VADER scores
95
+ base_scores = self.vader.polarity_scores(str(text))
96
+
97
+ # Check for negative phrases
98
+ text_lower = str(text).lower()
99
+ neg_count = sum(1 for phrase in negative_phrases if phrase in text_lower)
100
+
101
+ # Adjust scores if negative phrases are found
102
+ if neg_count > 0:
103
+ base_scores['neg'] = max(base_scores['neg'], min(0.7, neg_count * 0.2))
104
+ base_scores['compound'] *= (1 - (neg_count * 0.15))
105
+ # Readjust neutral score
106
+ base_scores['neu'] = max(0, 1 - base_scores['neg'] - base_scores['pos'])
107
+
108
+ return base_scores
109
+
110
+ # Apply enhanced VADER scoring
111
+ vader_scores = self.df['review_text'].apply(get_enhanced_vader_scores)
112
+ self.df['vader_compound'] = vader_scores.apply(lambda x: x['compound'])
113
+ self.df['vader_negative'] = vader_scores.apply(lambda x: x['neg'])
114
+ self.df['vader_positive'] = vader_scores.apply(lambda x: x['pos'])
115
+ self.df['vader_neutral'] = vader_scores.apply(lambda x: x['neu'])
116
+
117
+ # Calculate sentiment extremity
118
+ self.df['sentiment_extremity'] = self.df['vader_compound'].abs()
119
+
120
+ return self.df
121
+
122
+ def detect_anomalies(self) -> pd.DataFrame:
123
+ """Detect anomalous reviews using Isolation Forest with BERT features"""
124
+ self.logger.info("Detecting anomalies...")
125
+
126
+ # Prepare features for anomaly detection
127
+ features = [
128
+ 'review_stars',
129
+ 'textblob_polarity',
130
+ 'vader_compound',
131
+ 'sentiment_extremity',
132
+ 'review_length',
133
+ 'bert_embedding_mean',
134
+ 'bert_embedding_std'
135
+ ]
136
+
137
+ # Ensure all features exist
138
+ missing_features = [f for f in features if f not in self.df.columns]
139
+ if missing_features:
140
+ self.analyze_sentiment()
141
+
142
+ # Standardize features
143
+ scaler = StandardScaler()
144
+ X = scaler.fit_transform(self.df[features])
145
+
146
+ # Apply Isolation Forest
147
+ iso_forest = IsolationForest(
148
+ contamination=0.1,
149
+ random_state=42,
150
+ n_jobs=-1
151
+ )
152
+
153
+ # Fit and predict
154
+ self.df['is_anomaly'] = iso_forest.fit_predict(X)
155
+ self.df['anomaly_score'] = iso_forest.score_samples(X)
156
+
157
+ return self.df
158
+
159
+ def detect_ai_generated_text(self) -> pd.DataFrame:
160
+ """Estimate likelihood of AI-generated content"""
161
+ self.logger.info("Detecting AI-generated content...")
162
+
163
+ # Ensure sentiment analysis has been run
164
+ if 'textblob_subjectivity' not in self.df.columns:
165
+ self.analyze_sentiment()
166
+
167
+ # Use detoxify model to get toxicity scores
168
+ texts = self.df['review_text'].fillna('').tolist()
169
+ toxic_scores = self.toxic_model.predict(texts)
170
+
171
+ # Add scores to DataFrame
172
+ toxic_score_types = ['toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
173
+ 'insult', 'threat', 'sexual_explicit']
174
+ for score_type in toxic_score_types:
175
+ if score_type in toxic_scores:
176
+ self.df[f'toxic_{score_type}'] = toxic_scores[score_type]
177
+
178
+ # Calculate AI generation likelihood based on various factors
179
+ self.df['ai_generated_likelihood'] = (
180
+ (self.df['textblob_subjectivity'] < 0.3) & # Low subjectivity
181
+ (self.df['sentiment_extremity'] > 0.8) & # Extreme sentiment
182
+ (self.df['review_length'] > self.df['review_length'].quantile(0.95)) & # Unusually long
183
+ (self.df['bert_embedding_std'] < self.df['bert_embedding_std'].quantile(0.25)) # Unusual language patterns
184
+ ).astype(int)
185
+
186
+ # Add additional AI detection features
187
+ self.df['ai_detection_score'] = (
188
+ (self.df['textblob_subjectivity'] * -1) + # Lower subjectivity increases score
189
+ (self.df['sentiment_extremity'] * 0.5) + # Extreme sentiment contributes somewhat
190
+ (self.df['bert_embedding_std'] * -0.5) # Lower variation in embeddings increases score
191
+ ).clip(0, 1) # Normalize between 0 and 1
192
+
193
+ return self.df
194
+
195
+ def analyze_business_categories(self) -> Dict:
196
+ """Analyze trends and patterns specific to business categories"""
197
+ self.logger.info("Analyzing business categories...")
198
+
199
+ # Extract categories
200
+ categories = self.df['categories'].fillna('').str.split(', ')
201
+ all_categories = [cat for cats in categories if isinstance(cats, list) for cat in cats]
202
+ category_counts = Counter(all_categories)
203
+
204
+ # Analyze reviews by category
205
+ category_analysis = {}
206
+ for category in set(all_categories):
207
+ category_reviews = self.df[self.df['categories'].str.contains(category, na=False)]
208
+
209
+ category_analysis[category] = {
210
+ 'review_count': len(category_reviews),
211
+ 'avg_rating': category_reviews['review_stars'].mean() if not category_reviews.empty else None,
212
+ 'avg_sentiment': category_reviews['vader_compound'].mean() if 'vader_compound' in self.df.columns and not category_reviews.empty else None,
213
+ 'avg_subjectivity': category_reviews['textblob_subjectivity'].mean() if 'textblob_subjectivity' in self.df.columns and not category_reviews.empty else None
214
+ }
215
+
216
+ return category_analysis
217
+
218
+ def visualize_results(self, output_dir: str):
219
+ """Create visualizations for analysis results"""
220
+ plt.figure(figsize=(15, 10))
221
+
222
+ # Sentiment Distribution
223
+ plt.subplot(2, 2, 1)
224
+ sns.histplot(data=self.df, x='vader_compound', bins=50)
225
+ plt.title('Sentiment Distribution')
226
+
227
+ # Review Volume Over Time
228
+ plt.subplot(2, 2, 2)
229
+ daily_reviews = self.df.groupby('review_date').size()
230
+ daily_reviews.plot()
231
+ plt.title('Review Volume Over Time')
232
+
233
+ # Anomaly Score Distribution
234
+ plt.subplot(2, 2, 3)
235
+ if 'anomaly_score' not in self.df.columns:
236
+ self.detect_anomalies()
237
+ sns.histplot(data=self.df, x='anomaly_score', bins=50)
238
+ plt.title('Anomaly Score Distribution')
239
+
240
+ # AI Generation Likelihood
241
+ plt.subplot(2, 2, 4)
242
+ if 'ai_generated_likelihood' not in self.df.columns:
243
+ self.detect_ai_generated_text()
244
+ sns.histplot(data=self.df, x='ai_generated_likelihood', bins=2)
245
+ plt.title('AI Generation Likelihood')
246
+
247
+ plt.tight_layout()
248
+ plt.savefig(f'{output_dir}/analysis_results.png')
249
+ plt.close()
250
+
251
+ def run_full_analysis(self, output_dir: str) -> Tuple[pd.DataFrame, Dict]:
252
+ """Run complete analysis pipeline with detailed outputs"""
253
+ self.logger.info("Starting full analysis pipeline...")
254
+
255
+ # Create output directory if it doesn't exist
256
+ output_dir = Path(output_dir)
257
+ output_dir.mkdir(parents=True, exist_ok=True)
258
+
259
+ try:
260
+ # Run all analyses
261
+ self.analyze_sentiment()
262
+ self.detect_anomalies()
263
+ self.detect_ai_generated_text()
264
+ category_analysis = self.analyze_business_categories()
265
+
266
+ # Create visualizations
267
+ self.visualize_results(str(output_dir))
268
+
269
+ # Compile results
270
+ analysis_results = {
271
+ 'category_analysis': category_analysis,
272
+ 'sentiment_summary': {
273
+ 'avg_sentiment': self.df['vader_compound'].mean(),
274
+ 'positive_reviews': len(self.df[self.df['vader_compound'] > 0.5]),
275
+ 'negative_reviews': len(self.df[self.df['vader_compound'] < -0.5]),
276
+ 'neutral_reviews': len(self.df[abs(self.df['vader_compound']) <= 0.5])
277
+ },
278
+ 'ai_detection_summary': {
279
+ 'likely_ai_generated': len(self.df[self.df['ai_generated_likelihood'] == 1]),
280
+ 'avg_ai_score': self.df['ai_detection_score'].mean()
281
+ },
282
+ 'anomaly_summary': {
283
+ 'anomalous_reviews': len(self.df[self.df['is_anomaly'] == -1]),
284
+ 'avg_anomaly_score': self.df['anomaly_score'].mean()
285
+ }
286
+ }
287
+
288
+ # Save results
289
+ self.df.to_csv(output_dir / "analyzed_data.csv", index=False)
290
+ with open(output_dir / "analysis_results.json", 'w') as f:
291
+ json.dump(analysis_results, f, indent=4)
292
+
293
+ return self.df, analysis_results
294
+
295
+ except Exception as e:
296
+ self.logger.error(f"Error during analysis: {str(e)}")
297
+ raise
298
+
299
+ # For testing
300
+ if __name__ == "__main__":
301
+ # Set up logging
302
+ logging.basicConfig(level=logging.INFO)
303
+ logger = logging.getLogger(__name__)
304
+
305
+ try:
306
+ # Read test data
307
+ df = pd.read_csv("test_data.csv")
308
+
309
+ # Initialize analyzer
310
+ analyzer = AdvancedYelpAnalyzer(df)
311
+
312
+ # Run analysis
313
+ output_dir = "output"
314
+ analyzed_df, results = analyzer.run_full_analysis(output_dir)
315
+
316
+ logger.info("Analysis completed successfully!")
317
+
318
+ except Exception as e:
319
+ logger.error(f"Error during testing: {str(e)}")
320
+ raise
src/.ipynb_checkpoints/clean_data-checkpoint.py ADDED
@@ -0,0 +1,77 @@
1
+ # clean_yelp_data.py
2
+ from loguru import logger
3
+ import pandas as pd
4
+ import numpy as np
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Optional, Tuple
7
+ import json
8
+ from pathlib import Path
9
+ import logging
10
+ from scipy.stats import entropy
11
+ import warnings
12
+ from datetime import datetime
13
+ import matplotlib.pyplot as plt
14
+ import seaborn as sns
15
+ import re
16
+ from textblob import TextBlob
17
+ import os
18
+ from pathlib import Path
19
+
20
+ class DataCleaner:
21
+ def __init__(self,df,output_path,filename="preprocessed_cleaned.csv"):
22
+ self.df=df
23
+ self.output_path=output_path
24
+ self.filename=filename
25
+ def saving_cleaned_preprocess(self):
26
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
27
+
28
+ output_file = Path(self.output_path) / self.filename
29
+ logger.info(f"Files saved in directory {output_file} as : { self.filename}")
30
+ self.df.to_csv(output_file, index=False)
31
+
32
+ def dropping_unncessary_columns(self):
33
+ self.df.drop("review_text", axis=1, inplace=True)
34
+ self.df.drop("review_date", axis=1, inplace=True)
35
+ self.df.drop("business_name", axis=1, inplace=True)
36
+ self.df.drop("city", axis=1, inplace=True)
37
+ self.df.drop("state", axis=1, inplace=True)
38
+ self.df.drop("postal_code", axis=1, inplace=True)
39
+ self.df.drop("categories", axis=1, inplace=True)
40
+ self.df.drop("user_name", axis=1, inplace=True)
41
+ self.df.drop("yelping_since", axis=1, inplace=True)
42
+ self.df.drop("checkin_date", axis=1, inplace=True)
43
+ self.df.drop("review_useful", axis=1, inplace=True)
44
+ self.df.drop("review_funny", axis=1, inplace=True)
45
+ self.df.drop("review_cool", axis=1, inplace=True)
46
+ self.df.drop("user_useful", axis=1, inplace=True)
47
+ self.df.drop("user_funny", axis=1, inplace=True)
48
+ self.df.drop("user_cool", axis=1, inplace=True)
49
+ self.df.drop("is_open", axis=1, inplace=True)
50
+ self.df.drop("compliment_hot", axis=1, inplace=True)
51
+ self.df.drop("compliment_more", axis=1, inplace=True)
52
+ self.df.drop("compliment_profile", axis=1, inplace=True)
53
+ self.df.drop("compliment_cute", axis=1, inplace=True)
54
+ self.df.drop("compliment_list", axis=1, inplace=True)
55
+ self.df.drop("compliment_note", axis=1, inplace=True)
56
+ self.df.drop("compliment_plain", axis=1, inplace=True)
57
+ self.df.drop("compliment_cool", axis=1, inplace=True)
58
+ self.df.drop("compliment_funny", axis=1, inplace=True)
59
+ self.df.drop("compliment_writer", axis=1, inplace=True)
60
+ self.df.drop("compliment_photos", axis=1, inplace=True)
61
+
62
+ def run_pipeline(self):
63
+ logger.info("Dropping Unnecessary Columns")
64
+ self.dropping_unncessary_columns()
65
+
66
+
67
+ logger.info("Saving Cleaned and Preprocessed Data")
68
+ self.saving_cleaned_preprocess()
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
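
The per-column drops in dropping_unncessary_columns can be collapsed into a single call; a small equivalent sketch over the same column list, kept in-place as above:

    def dropping_unnecessary_columns(self):
        columns_to_drop = [
            "review_text", "review_date", "business_name", "city", "state", "postal_code",
            "categories", "user_name", "yelping_since", "checkin_date",
            "review_useful", "review_funny", "review_cool",
            "user_useful", "user_funny", "user_cool", "is_open",
            "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute",
            "compliment_list", "compliment_note", "compliment_plain", "compliment_cool",
            "compliment_funny", "compliment_writer", "compliment_photos",
        ]
        # One drop over the full list, same behaviour as the 28 individual calls above.
        self.df.drop(columns=columns_to_drop, inplace=True)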
src/.ipynb_checkpoints/create_dataset-checkpoint.py ADDED
@@ -0,0 +1,217 @@
1
+ import pandas as pd
2
+ import ujson as json
3
+ import gc
4
+ import numpy as np
5
+ from concurrent.futures import ProcessPoolExecutor
6
+ import multiprocessing as mp
7
+ from pymongo import MongoClient
8
+ from collections import defaultdict
9
+ from pathlib import Path
10
+
11
+ # def read_json_parallel(file_path, num_workers=None):
12
+ # """Read JSON file using parallel processing"""
13
+ # if num_workers is None:
14
+ # num_workers = max(1, mp.cpu_count() - 1)
15
+
16
+ # print(f"Reading {file_path}...")
17
+ # # Read chunks and concatenate them into a single DataFrame
18
+ # df = pd.read_json(file_path, lines=True, dtype_backend="pyarrow", chunksize=100000)
19
+ # return next(df)
20
+
21
+
22
+ def read_data_mongo(file_path, num_workers=None):
23
+ """Read JSON file using parallel processing"""
24
+ if num_workers is None:
25
+ num_workers = max(1, mp.cpu_count() - 1)
26
+
27
+ print(f"Reading {file_path}...")
28
+ conn_str = "mongodb://Mtalha:[email protected]/"
29
+
30
+ client = MongoClient(conn_str)
31
+ databases = client.list_database_names()
32
+ db_client=client["Yelp"]
33
+
34
+ # Read the entire file at once since chunksize isn't needed for parallel reading here
35
+ # Use 'records' orient if your JSON was saved with this format
36
+ try:
37
+
38
+ collection = db_client[file_path]
39
+ documents = collection.find({}, {"_id": 0})
40
+ data = list(documents)
41
+ final_dict=defaultdict(list)
42
+
43
+ for dictt in data:
44
+ for k,v in dictt.items():
45
+ final_dict[k].append(v)
46
+ df=pd.DataFrame(final_dict)
47
+
48
+ # df = pd.read_json(file_path, orient='records', dtype_backend="pyarrow")
49
+ except Exception as e:
50
+ # If 'records' doesn't work, try without specifying orient or with 'split'
51
+ # This is a fallback for different JSON structures
52
+ # df = pd.read_json(file_path, dtype_backend="pyarrow")
53
+ print("ERROR WHILE READING FILES FROM MONGODB AS: ", e)
54
+ print(f"Finished reading. DataFrame shape: {df.shape}")
55
+ return df
56
+
57
+ def process_datasets(output_path,filename):
58
+ # File paths
59
+ file_paths = {
60
+ 'business': "yelp_academic_dataset_business",
61
+ 'checkin': "yelp_academic_dataset_checkin",
62
+ 'review': "yelp_academic_dataset_review",
63
+ 'tip': "yelp_academic_dataset_tip",
64
+ 'user': "yelp_academic_dataset_user",
65
+ 'google': "google_review_dataset"
66
+ }
67
+
68
+ # Read datasets with progress tracking
69
+ print("Reading datasets...")
70
+ dfs = {}
71
+ for name, path in file_paths.items():
72
+ print(f"Processing {name} dataset...")
73
+ dfs[name] = read_data_mongo(path)
74
+ print(f"Finished reading {name} dataset. Shape: {dfs[name].shape}")
75
+
76
+ print("All files read. Starting column renaming...")
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+ # Rename columns to avoid conflicts
87
+ # Reviews
88
+ dfs['review'] = dfs['review'].rename(columns={
89
+ 'date': 'review_date',
90
+ 'stars': 'review_stars',
91
+ 'text': 'review_text',
92
+ 'useful': 'review_useful',
93
+ 'funny': 'review_funny',
94
+ 'cool': 'review_cool'
95
+ })
96
+ # print("COLUMNS IN REVIEW DAFRA)
97
+
98
+ # Tips
99
+ dfs['tip'] = dfs['tip'].rename(columns={
100
+ 'date': 'tip_date',
101
+ 'text': 'tip_text',
102
+ 'compliment_count': 'tip_compliment_count'
103
+ })
104
+
105
+ # Checkins
106
+ dfs['checkin'] = dfs['checkin'].rename(columns={
107
+ 'date': 'checkin_date'
108
+ })
109
+
110
+ # Users
111
+ dfs['user'] = dfs['user'].rename(columns={
112
+ 'name': 'user_name',
113
+ 'review_count': 'user_review_count',
114
+ 'useful': 'user_useful',
115
+ 'funny': 'user_funny',
116
+ 'cool': 'user_cool'
117
+ })
118
+
119
+ # Business
120
+ dfs['business'] = dfs['business'].rename(columns={
121
+ 'name': 'business_name',
122
+ 'stars': 'business_stars',
123
+ 'review_count': 'business_review_count'
124
+ })
125
+ dfs['google'] = dfs['google'].rename(columns={
126
+ 'name': 'business_name',
127
+ 'stars': 'business_stars',
128
+ 'review_count': 'business_review_count'
129
+ })
130
+ df_business_final= dfs['business']
131
+ df_google_final=dfs['google']
132
+ df_review_final=dfs['review']
133
+ df_tip_final=dfs['tip']
134
+ df_checkin_final=dfs['checkin']
135
+ df_user_final=dfs['user']
136
+
137
+
138
+ df_business_final=pd.concat([df_business_final,df_google_final],axis=0)
139
+ df_business_final.reset_index(drop=True,inplace=True)
140
+
141
+
142
+
143
+
144
+ print("Starting merge process...")
145
+
146
+ # Merge process with memory management
147
+ print("Step 1: Starting with reviews...")
148
+ merged_df = df_review_final
149
+
150
+
151
+ print("Step 2: Merging with business data...")
152
+ merged_df = merged_df.merge(
153
+ df_business_final,
154
+ on='business_id',
155
+ how='left'
156
+ )
157
+
158
+
159
+ print("Step 3: Merging with user data...")
160
+ merged_df = merged_df.merge(
161
+ df_user_final,
162
+ on='user_id',
163
+ how='left'
164
+ )
165
+
166
+
167
+ print("Step 4: Merging with checkin data...")
168
+ merged_df = merged_df.merge(
169
+ df_checkin_final,
170
+ on='business_id',
171
+ how='left'
172
+ )
173
+
174
+
175
+ print("Step 5: Aggregating and merging tip data...")
176
+ tip_agg = df_tip_final.groupby('business_id').agg({
177
+ 'tip_compliment_count': 'sum',
178
+ 'tip_text': 'count'
179
+ }).rename(columns={'tip_text': 'tip_count'})
180
+
181
+ merged_df = merged_df.merge(
182
+ tip_agg,
183
+ on='business_id',
184
+ how='left'
185
+ )
186
+
187
+
188
+
189
+ print("Filling NaN values...")
190
+ merged_df['tip_count'] = merged_df['tip_count'].fillna(0)
191
+ merged_df['tip_compliment_count'] = merged_df['tip_compliment_count'].fillna(0)
192
+ merged_df['checkin_date'] = merged_df['checkin_date'].fillna('')
193
+ merged_df["friends"].fillna(0,inplace=True)
194
+
195
+ for col in merged_df.columns:
196
+ if merged_df[col].isnull().sum()>0:
197
+ print(f" {col} has {merged_df[col].isnull().sum()} null values")
198
+
199
+
200
+ print("Shape of Merged Dataset is : ",merged_df.shape)
201
+ output_file = Path(output_path) / filename
202
+ print("COLUMNS BEFORE PREPROCESING")
203
+ print()
204
+ print(merged_df.info())
205
+ for col in merged_df.columns:
206
+ for v in merged_df[col]:
207
+ print(f"Type of values in {col} is {type(v)} and values are like : {v}")
208
+ break
209
+ merged_df.to_csv(output_file,index=False)
210
+
211
+
212
+
213
+
214
+ return merged_df
215
+
216
+ # if __name__ == "__main__":
217
+ # process_datasets()
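
A small aside on read_data_mongo above: because the query already excludes _id, the defaultdict accumulation can be replaced by building the DataFrame straight from the list of documents; a sketch under that assumption (connection details omitted):

    import pandas as pd
    from pymongo import MongoClient

    def read_collection(client: MongoClient, collection_name: str) -> pd.DataFrame:
        # Same projection as read_data_mongo: every document, _id excluded.
        docs = list(client["Yelp"][collection_name].find({}, {"_id": 0}))
        # pandas aligns dict keys into columns and fills missing keys with NaN.
        return pd.DataFrame(docs)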
src/.ipynb_checkpoints/feature_analyzer-checkpoint.py ADDED
@@ -0,0 +1,212 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ from pathlib import Path
6
+ from loguru import logger
7
+
8
+ class FeatureAnalyzer:
9
+ def __init__(self,df,output_path):
10
+ self.df=df
11
+ self.output_path=output_path
12
+
13
+
14
+ def plot_correlation_heatmap(self):
15
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
16
+ numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake')
17
+ correlation_matrix = self.df[numeric_cols].corr()
18
+ plt.figure(figsize=(14, 12))
19
+ sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
20
+ plt.title('Correlation Heatmap of Numeric Features', fontsize=16)
21
+ plt.tight_layout()
22
+ output_file = Path(self.output_path) / 'correlation_heatmap.png'
23
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
24
+ plt.close()
25
+ logger.info(f"Saved correlation heatmap to {output_file}")
26
+
27
+ def plot_mean_by_fake_bar(self):
28
+ key_features = [
29
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
30
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
31
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
32
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
33
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
34
+ ]
35
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
36
+ mean_by_fake = self.df.groupby('fake')[key_features].mean().T
37
+ mean_by_fake.columns = ['Genuine (0)', 'Fake (1)']
38
+ fig, ax = plt.subplots(figsize=(12, 8))
39
+ mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8, ax=ax)
40
+ plt.title('Mean Feature Values by Fake Label', fontsize=16)
41
+ plt.xlabel('Features', fontsize=12)
42
+ plt.ylabel('Mean Value', fontsize=12)
43
+ plt.xticks(rotation=45, ha='right')
44
+ plt.legend(title='Fake Label')
45
+ plt.tight_layout()
46
+ output_file = Path(self.output_path) / 'mean_by_fake_bar.png'
47
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
48
+ plt.close()
49
+ logger.info(f"Saved mean by fake bar plot to {output_file}")
50
+
51
+ def plot_violin_plots(self):
52
+ key_features = [
53
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
54
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
55
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
56
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
57
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
58
+ ]
59
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
60
+ plt.figure(figsize=(14, 10))
61
+ for i, feature in enumerate(key_features[:6], 1):
62
+ plt.subplot(2, 3, i)
63
+ sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
64
+ plt.title(f'{feature} Distribution', fontsize=12)
65
+ plt.xlabel('Fake (0/1)', fontsize=10)
66
+ plt.tight_layout()
67
+ output_file = Path(self.output_path) / 'violin_plots.png'
68
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
69
+ plt.close()
70
+ logger.info(f"Saved violin plots to {output_file}")
71
+
72
+ def plot_box_plots(self):
73
+ key_features = [
74
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
75
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
76
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
77
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
78
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
79
+ ]
80
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
81
+ plt.figure(figsize=(14, 10))
82
+ for i, feature in enumerate(key_features[6:11], 1):
83
+ plt.subplot(2, 3, i)
84
+ sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
85
+ plt.title(f'{feature} Distribution', fontsize=12)
86
+ plt.xlabel('Fake (0/1)', fontsize=10)
87
+ plt.tight_layout()
88
+ output_file = Path(self.output_path) / 'box_plots.png'
89
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
90
+ plt.close()
91
+ logger.info(f"Saved box plots to {output_file}")
92
+
93
+ def plot_scatter_review_grammar(self):
94
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
95
+ plt.figure(figsize=(10, 6))
96
+ sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5)
97
+ plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
98
+ plt.xlabel('Review Stars', fontsize=12)
99
+ plt.ylabel('Grammar Error Score', fontsize=12)
100
+ plt.legend(title='Fake')
101
+ plt.tight_layout()
102
+ output_file = Path(self.output_path) / 'scatter_review_grammar.png'
103
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
104
+ plt.close()
105
+ logger.info(f"Saved scatter plot to {output_file}")
106
+
107
+ def plot_density_plots(self):
108
+ key_features = [
109
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
110
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
111
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
112
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
113
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
114
+ ]
115
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
116
+ plt.figure(figsize=(14, 10))
117
+ for i, feature in enumerate(key_features[:4], 1):
118
+ plt.subplot(2, 2, i)
119
+ for label in [0, 1]:
120
+ subset = self.df[self.df['fake'] == label]
121
+ sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5)
122
+ plt.title(f'{feature} Density', fontsize=12)
123
+ plt.xlabel(feature, fontsize=10)
124
+ plt.legend()
125
+ plt.tight_layout()
126
+ output_file = Path(self.output_path) / 'density_plots.png'
127
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
128
+ plt.close()
129
+ logger.info(f"Saved density plots to {output_file}")
130
+
131
+ def plot_stacked_bar_similarity(self):
132
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
133
+ bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
134
+ stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0)
135
+ stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0)
136
+ fig, ax = plt.subplots(figsize=(12, 8))
137
+ stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8, ax=ax)
138
+ plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
139
+ plt.xlabel('Similarity Bins', fontsize=12)
140
+ plt.ylabel('Proportion', fontsize=12)
141
+ plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label')
142
+ plt.xticks(rotation=45, ha='right')
143
+ plt.tight_layout()
144
+ output_file = Path(self.output_path) / 'stacked_bar_similarity.png'
145
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
146
+ plt.close()
147
+ logger.info(f"Saved stacked bar plot to {output_file}")
148
+
149
+ def plot_pie_fake_distribution(self):
150
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
151
+ fake_counts = self.df['fake'].value_counts()
152
+ plt.figure(figsize=(8, 8))
153
+ plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90)
154
+ plt.title('Distribution of Fake Labels', fontsize=16)
155
+ plt.axis('equal')
156
+ output_file = Path(self.output_path) / 'pie_fake_distribution.png'
157
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
158
+ plt.close()
159
+ logger.info(f"Saved pie chart to {output_file}")
160
+
161
+ def plot_count_code_switching(self):
162
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
163
+ plt.figure(figsize=(8, 6))
164
+ sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon'])
165
+ plt.title('Count of Fake by Code Switching Flag', fontsize=16)
166
+ plt.xlabel('Code Switching Flag (0/1)', fontsize=12)
167
+ plt.ylabel('Count', fontsize=12)
168
+ plt.legend(title='Fake Label')
169
+ plt.tight_layout()
170
+ output_file = Path(self.output_path) / 'count_code_switching.png'
171
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
172
+ plt.close()
173
+ logger.info(f"Saved count plot to {output_file}")
174
+
175
+ def plot_variance_by_fake_bar(self):
176
+ key_features = [
177
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
178
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
179
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
180
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
181
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
182
+ ]
183
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
184
+ variance_by_fake = self.df.groupby('fake')[key_features].var().T
185
+ variance_by_fake.columns = ['Genuine (0)', 'Fake (1)']
186
+ fig, ax = plt.subplots(figsize=(12, 8))
187
+ variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8, ax=ax)
188
+ plt.title('Feature Variance by Fake Label', fontsize=16)
189
+ plt.xlabel('Features', fontsize=12)
190
+ plt.ylabel('Variance', fontsize=12)
191
+ plt.xticks(rotation=45, ha='right')
192
+ plt.legend(title='Fake Label')
193
+ plt.tight_layout()
194
+ output_file = Path(self.output_path) / 'variance_by_fake_bar.png'
195
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
196
+ plt.close()
197
+ logger.info(f"Saved variance bar plot to {output_file}")
198
+
199
+ def run_pipeline(self):
200
+
201
+ sns.set(style="whitegrid")
202
+ plt.rcParams['figure.figsize'] = (12, 8)
203
+ self.plot_correlation_heatmap()
204
+ self.plot_mean_by_fake_bar()
205
+ self.plot_violin_plots()
206
+ self.plot_box_plots()
207
+ self.plot_scatter_review_grammar()
208
+ self.plot_density_plots()
209
+ self.plot_stacked_bar_similarity()
210
+ self.plot_pie_fake_distribution()
211
+ self.plot_count_code_switching()
212
+ self.plot_variance_by_fake_bar()
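Note: a minimal usage sketch of FeatureAnalyzer (the CSV path and output directory are hypothetical; the frame must contain the binary 'fake' column and the numeric feature columns listed in key_features):

    import pandas as pd

    df = pd.read_csv('data/merged_dataset.csv')        # hypothetical path
    analyzer = FeatureAnalyzer(df, output_path='plots/eda')
    analyzer.run_pipeline()                            # writes the PNG plots above to plots/eda/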
src/.ipynb_checkpoints/model-checkpoint.py ADDED
@@ -0,0 +1,541 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch_geometric.data import HeteroData
5
+ import numpy as np
6
+ import pandas as pd
7
+ import networkx as nx
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
11
+ from sklearn.model_selection import train_test_split
12
+ from pathlib import Path
13
+ from datetime import datetime
14
+ from loguru import logger
15
+
16
+ # Temporal Edge Features Function
17
+ def create_temporal_edge_features(time_since_src, time_since_tgt, user_i, user_j):
18
+ delta_t = torch.abs(time_since_src - time_since_tgt).float()
19
+ hour_scale = torch.sin(delta_t / 3600)
20
+ day_scale = torch.sin(delta_t / (24 * 3600))
21
+ week_scale = torch.sin(delta_t / (7 * 24 * 3600))
22
+ same_user = (user_i == user_j).float()
23
+ burst_feature = same_user * torch.exp(-delta_t / (24 * 3600))
24
+ return torch.stack([hour_scale, day_scale, week_scale, burst_feature], dim=-1)
25
+
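Note (worked example, assuming the time_since inputs are in seconds): for two nodes whose values differ by one hour (delta_t = 3600) and that share the same user, hour_scale = sin(1) ≈ 0.84, day_scale ≈ 0.042, week_scale ≈ 0.006, and burst_feature = exp(-3600/86400) ≈ 0.96; when the users differ, burst_feature is 0.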
26
+ # Custom Multihead Attention (unchanged)
27
+ class CustomMultiheadAttention(nn.Module):
28
+ def __init__(self, embed_dim, num_heads):
29
+ super().__init__()
30
+ self.embed_dim = embed_dim
31
+ self.num_heads = num_heads
32
+ self.head_dim = embed_dim // num_heads
33
+
34
+ assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
35
+
36
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
37
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
38
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
39
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
40
+
41
+ self.scale = self.head_dim ** -0.5
42
+
43
+ def forward(self, query, key, value, attn_bias=None):
44
+ batch_size, seq_len, embed_dim = query.size()
45
+ q = self.q_proj(query)
46
+ k = self.k_proj(key)
47
+ v = self.v_proj(value)
48
+ q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
49
+ k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
50
+ v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
51
+ scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
52
+ if attn_bias is not None:
53
+ scores = scores + attn_bias.unsqueeze(1)
54
+ attn = F.softmax(scores, dim=-1)
55
+ out = torch.matmul(attn, v)
56
+ out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
57
+ out = self.out_proj(out)
58
+ return out, attn
59
+
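Note: a hypothetical smoke test of the attention block above, showing the expected tensor shapes:

    import torch

    attn = CustomMultiheadAttention(embed_dim=64, num_heads=4)
    x = torch.randn(1, 10, 64)            # (batch, seq_len, embed_dim)
    bias = torch.zeros(1, 10, 10)         # additive bias, broadcast across heads
    out, weights = attn(x, x, x, attn_bias=bias)
    print(out.shape, weights.shape)       # torch.Size([1, 10, 64]) torch.Size([1, 4, 10, 10])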
60
+ # HeteroGraphormer (unchanged)
61
+ class HeteroGraphormer(nn.Module):
62
+ def __init__(self, hidden_dim, output_dim, num_heads=4, edge_dim=4):
63
+ super().__init__()
64
+ self.hidden_dim = hidden_dim
65
+
66
+ self.embed_dict = nn.ModuleDict({
67
+ 'user': nn.Linear(14, hidden_dim),
68
+ 'business': nn.Linear(8, hidden_dim),
69
+ 'review': nn.Linear(16, hidden_dim)
70
+ })
71
+
72
+ self.edge_proj = nn.Linear(edge_dim, hidden_dim)
73
+
74
+ self.gru_user = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
75
+ self.gru_business = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
76
+ self.gru_review = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
77
+
78
+ self.attention1 = CustomMultiheadAttention(hidden_dim, num_heads)
79
+ self.attention2 = CustomMultiheadAttention(hidden_dim, num_heads)
80
+
81
+ self.ffn1 = nn.Sequential(
82
+ nn.Linear(hidden_dim, hidden_dim * 4),
83
+ nn.ReLU(),
84
+ nn.Dropout(0.1),
85
+ nn.Linear(hidden_dim * 4, hidden_dim)
86
+ )
87
+ self.ffn2 = nn.Sequential(
88
+ nn.Linear(hidden_dim, hidden_dim * 4),
89
+ nn.ReLU(),
90
+ nn.Dropout(0.1),
91
+ nn.Linear(hidden_dim * 4, hidden_dim)
92
+ )
93
+
94
+ self.norm1 = nn.LayerNorm(hidden_dim)
95
+ self.norm2 = nn.LayerNorm(hidden_dim)
96
+ self.norm3 = nn.LayerNorm(hidden_dim)
97
+ self.norm4 = nn.LayerNorm(hidden_dim)
98
+
99
+ self.centrality_proj = nn.Linear(1, hidden_dim)
100
+
101
+ self.classifier = nn.Sequential(
102
+ nn.Linear(hidden_dim * 3, hidden_dim),
103
+ nn.ReLU(),
104
+ nn.Dropout(0.1),
105
+ nn.Linear(hidden_dim, 1)
106
+ )
107
+
108
+ self.dropout = nn.Dropout(0.1)
109
+
110
+ def time_aware_aggregation(self, x, time_since, decay_rate=0.1):
111
+ weights = torch.exp(-decay_rate * time_since.unsqueeze(-1))
112
+ return x * weights
113
+
114
+ def forward(self, data, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict):
115
+ x_dict = {}
116
+ for node_type in data.x_dict:
117
+ x = self.embed_dict[node_type](data[node_type].x)
118
+ if node_type in time_since_dict:
119
+ x = self.time_aware_aggregation(x, time_since_dict[node_type])
120
+ x_dict[node_type] = x
121
+
122
+ x = torch.cat([x_dict['user'], x_dict['business'], x_dict['review']], dim=0)
123
+
124
+ centrality = self.centrality_proj(centrality_encoding)
125
+ x = x + centrality
126
+
127
+ x = x.unsqueeze(0)
128
+
129
+ x_user = x[:, :data['user'].x.size(0), :]
130
+ x_business = x[:, data['user'].x.size(0):data['user'].x.size(0) + data['business'].x.size(0), :]
131
+ x_review = x[:, data['user'].x.size(0) + data['business'].x.size(0):, :]
132
+
133
+ x_user, _ = self.gru_user(x_user)
134
+ x_business, _ = self.gru_business(x_business)
135
+ x_review, _ = self.gru_review(x_review)
136
+
137
+ x = torch.cat([x_user, x_business, x_review], dim=1)
138
+
139
+ total_nodes = x.size(1)
140
+ attn_bias = torch.zeros(1, total_nodes, total_nodes, device=x.device)
141
+ attn_bias[0] = -spatial_encoding
142
+
143
+ for edge_type in edge_features_dict:
144
+ edge_index = data[edge_type].edge_index
145
+ edge_feats = self.edge_proj(edge_features_dict[edge_type])
146
+ for i, (src, tgt) in enumerate(edge_index.t()):
147
+ attn_bias[0, src, tgt] += edge_feats[i].sum()
148
+
149
+ residual = x
150
+ x, _ = self.attention1(x, x, x, attn_bias=attn_bias)
151
+ x = self.norm1(x + residual)
152
+ x = self.dropout(x)
153
+
154
+ residual = x
155
+ x = self.ffn1(x)
156
+ x = self.norm2(x + residual)
157
+ x = self.dropout(x)
158
+
159
+ residual = x
160
+ x, _ = self.attention2(x, x, x, attn_bias=attn_bias)
161
+ x = self.norm3(x + residual)
162
+ x = self.dropout(x)
163
+
164
+ residual = x
165
+ x = self.ffn2(x)
166
+ x = self.norm4(x + residual)
167
+ x = self.dropout(x)
168
+
169
+ x = x.squeeze(0)
170
+
171
+ user_start = 0
172
+ business_start = data['user'].x.size(0)
173
+ review_start = business_start + data['business'].x.size(0)
174
+
175
+ h_user = x[user_start:business_start]
176
+ h_business = x[business_start:review_start]
177
+ h_review = x[review_start:]
178
+
179
+ user_indices = data['user', 'writes', 'review'].edge_index[0]
180
+ business_indices = data['review', 'about', 'business'].edge_index[1]
181
+ review_indices = data['user', 'writes', 'review'].edge_index[1]
182
+
183
+ h_user_mapped = h_user[user_indices]
184
+ h_business_mapped = h_business[business_indices]
185
+ h_review_mapped = h_review[review_indices]
186
+
187
+ combined = torch.cat([h_review_mapped, h_user_mapped, h_business_mapped], dim=-1)
188
+
189
+ logits = self.classifier(combined)
190
+ return torch.sigmoid(logits)
191
+
192
+ # Updated GraphformerModel with Plotting
193
+ class GraphformerModel:
194
+ def __init__(self, df, output_path, epochs, test_size=0.3):
195
+ self.df_whole = df
196
+ self.output_path = output_path
197
+ self.output_path = Path(self.output_path) / "GraphformerModel"
198
+ self.epochs = epochs
199
+ self.df, self.test_df = train_test_split(self.df_whole, test_size=test_size, random_state=42)
200
+
201
+ torch.manual_seed(42)
202
+ np.random.seed(42)
203
+
204
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
205
+
206
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
207
+ self.model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(self.device)
208
+ self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005)
209
+ self.criterion = nn.BCELoss()
210
+
211
+ def compute_graph_encodings(self, data):
212
+ G = nx.DiGraph()
213
+ node_offset = 0
214
+ node_type_map = {}
215
+
216
+ for node_type in ['user', 'business', 'review']:
217
+ num_nodes = data[node_type].x.size(0)
218
+ for i in range(num_nodes):
219
+ G.add_node(node_offset + i)
220
+ node_type_map[node_offset + i] = node_type
221
+ node_offset += num_nodes
222
+
223
+ edge_types = [('user', 'writes', 'review'), ('review', 'about', 'business')]
224
+ for src_type, rel, tgt_type in edge_types:
225
+ edge_index = data[src_type, rel, tgt_type].edge_index
226
+ src_nodes = edge_index[0].tolist()
227
+ tgt_nodes = edge_index[1].tolist()
228
+ src_offset = 0 if src_type == 'user' else (self.num_users if src_type == 'business' else self.num_users + self.num_businesses)
229
+ tgt_offset = 0 if tgt_type == 'user' else (self.num_users if tgt_type == 'business' else self.num_users + self.num_businesses)
230
+ for src, tgt in zip(src_nodes, tgt_nodes):
231
+ G.add_edge(src + src_offset, tgt + tgt_offset)
232
+
233
+ num_nodes = G.number_of_nodes()
234
+ spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=self.device)
235
+ for i in range(num_nodes):
236
+ for j in range(num_nodes):
237
+ if i == j:
238
+ spatial_encoding[i, j] = 0
239
+ elif nx.has_path(G, i, j):
240
+ spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)
241
+
242
+ centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=self.device).view(-1, 1)
243
+
244
+ return spatial_encoding, centrality_encoding, node_type_map
245
+
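Note (worked example): for a single training row the directed graph is user -> review -> business, so spatial_encoding[user][review] = 1, spatial_encoding[user][business] = 2, unreachable pairs stay at infinity, and centrality_encoding holds each node's total degree (2 for the review node, 1 each for the user and the business).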
246
+ def compute_metrics(self, y_true, y_pred, y_prob, prefix=""):
247
+ metrics = {}
248
+ metrics[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
249
+ metrics[f"{prefix}precision"] = precision_score(y_true, y_pred, zero_division=0)
250
+ metrics[f"{prefix}recall"] = recall_score(y_true, y_pred, zero_division=0)
251
+ metrics[f"{prefix}f1"] = f1_score(y_true, y_pred, zero_division=0)
252
+ metrics[f"{prefix}auc_roc"] = roc_auc_score(y_true, y_prob)
253
+ metrics[f"{prefix}conf_matrix"] = confusion_matrix(y_true, y_pred)
254
+ metrics[f"{prefix}class_report"] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
255
+ return metrics
256
+
257
+ def run_model(self):
258
+ features = torch.tensor(self.df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
259
+ y = torch.tensor(self.df['fake'].values, dtype=torch.float, device=self.device)
260
+ time_since_user = torch.tensor(self.df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
261
+ time_since_business = torch.tensor(self.df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
262
+ num_rows = len(self.df)
263
+
264
+ graph = HeteroData()
265
+
266
+ self.num_users = len(self.df['user_id'].unique())
267
+ self.num_businesses = len(self.df['business_id'].unique())
268
+
269
+ user_indices = torch.tensor(self.df['user_id'].map({uid: i for i, uid in enumerate(self.df['user_id'].unique())}).values, dtype=torch.long, device=self.device)
270
+ business_indices = torch.tensor(self.df['business_id'].map({bid: i for i, bid in enumerate(self.df['business_id'].unique())}).values, dtype=torch.long, device=self.device)
271
+ review_indices = torch.arange(num_rows, dtype=torch.long, device=self.device)
272
+
273
+ user_feats = torch.zeros(self.num_users, 14, device=self.device)
274
+ business_feats = torch.zeros(self.num_businesses, 8, device=self.device)
275
+ review_feats = torch.zeros(num_rows, 16, device=self.device)
276
+
277
+ user_cols = ['hours', 'user_review_count', 'elite', 'friends', 'fans', 'average_stars',
278
+ 'time_since_last_review_user', 'user_account_age', 'user_degree',
279
+ 'user_review_burst_count', 'review_like_ratio', 'latest_checkin_hours',
280
+ 'user_useful_funny_cool', 'rating_variance_user']
281
+ business_cols = ['latitude', 'longitude', 'business_stars', 'business_review_count',
282
+ 'time_since_last_review_business', 'business_degree',
283
+ 'business_review_burst_count', 'rating_deviation_from_business_average']
284
+ review_cols = ['review_stars', 'tip_compliment_count', 'tip_count', 'average_time_between_reviews',
285
+ 'temporal_similarity', 'pronoun_density', 'avg_sentence_length',
286
+ 'excessive_punctuation_count', 'sentiment_polarity', 'good_severity',
287
+ 'bad_severity', 'code_switching_flag', 'grammar_error_score',
288
+ 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool']
289
+
290
+ for i in range(len(self.df)):
291
+ user_idx = user_indices[i]
292
+ business_idx = business_indices[i]
293
+ user_feats[user_idx] += features[i, :14]
294
+ business_feats[business_idx] += features[i, 14:22]
295
+ review_feats = features[:, 22:38]
296
+
297
+ graph['user'].x = user_feats
298
+ graph['business'].x = business_feats
299
+ graph['review'].x = review_feats
300
+ graph['review'].y = y
301
+
302
+ graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
303
+ graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
304
+
305
+ edge_features_dict = {}
306
+ user_writes_edge = graph['user', 'writes', 'review'].edge_index
307
+ review_about_edge = graph['review', 'about', 'business'].edge_index
308
+
309
+ src_users = user_indices[user_writes_edge[0]]
310
+ tgt_reviews = review_indices[user_writes_edge[1]]
311
+ edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
312
+ time_since_user[src_users], time_since_user[tgt_reviews], src_users, src_users
313
+ )
314
+
315
+ src_reviews = review_indices[review_about_edge[0]]
316
+ tgt_businesses = business_indices[review_about_edge[1]]
317
+ edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
318
+ time_since_business[src_reviews], time_since_business[tgt_businesses],
319
+ torch.zeros_like(src_reviews), torch.zeros_like(src_reviews)
320
+ )
321
+
322
+ user_time_since = self.df.groupby('user_id')['time_since_last_review_user'].min().reindex(
323
+ self.df['user_id'].unique(), fill_value=0).values
324
+ time_since_dict = {
325
+ 'user': torch.tensor(user_time_since, dtype=torch.float, device=self.device)
326
+ }
327
+
328
+ spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
329
+
330
+ # Training with metrics history
331
+ self.model.train()
332
+ train_metrics_history = []
333
+ for epoch in range(self.epochs):
334
+ self.optimizer.zero_grad()
335
+ out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
336
+ loss = self.criterion(out.squeeze(), y)
337
+ loss.backward()
338
+ self.optimizer.step()
339
+
340
+ pred_labels = (out.squeeze() > 0.5).float()
341
+ logger.info(f"PREDICTED LABELS : {pred_labels}")
342
+ # print(pred_labels)
343
+ probs = out.squeeze().detach().cpu().numpy()
344
+ train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels.cpu().numpy(), probs, prefix="train_")
345
+ train_metrics['loss'] = loss.item()
346
+ train_metrics_history.append(train_metrics)
347
+
348
+ if epoch % 10 == 0:
349
+ logger.info(f"Epoch {epoch}, Loss: {loss.item():.4f}, Accuracy: {train_metrics['train_accuracy']:.4f}, F1: {train_metrics['train_f1']:.4f}")
350
+
351
+ # Save model
352
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
353
+ model_save_path = Path(self.output_path) / "model_GraphformerModel_latest.pth"
354
+ torch.save(self.model.state_dict(), model_save_path)
355
+
356
+ # Testing
357
+ if self.test_df is not None:
358
+ test_features = torch.tensor(self.test_df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
359
+ test_y = torch.tensor(self.test_df['fake'].values, dtype=torch.float, device=self.device)
360
+ test_time_since_user = torch.tensor(self.test_df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
361
+ test_time_since_business = torch.tensor(self.test_df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
362
+ num_test_rows = len(self.test_df)
363
+
364
+ new_user_unique = self.test_df['user_id'].unique()
365
+ new_business_unique = self.test_df['business_id'].unique()
366
+
367
+ existing_user_ids = list(self.df['user_id'].unique())
368
+ user_mapping = {uid: i for i, uid in enumerate(existing_user_ids)}
369
+ total_users = self.num_users
370
+ for uid in new_user_unique:
371
+ if uid not in user_mapping:
372
+ user_mapping[uid] = total_users
373
+ total_users += 1
374
+
375
+ existing_business_ids = list(self.df['business_id'].unique())
376
+ business_mapping = {bid: i for i, bid in enumerate(existing_business_ids)}
377
+ total_businesses = self.num_businesses
378
+ for bid in new_business_unique:
379
+ if bid not in business_mapping:
380
+ business_mapping[bid] = total_businesses
381
+ total_businesses += 1
382
+
383
+ new_user_indices = torch.tensor([user_mapping[uid] for uid in self.test_df['user_id']], dtype=torch.long, device=self.device)
384
+ new_business_indices = torch.tensor([business_mapping[bid] for bid in self.test_df['business_id']], dtype=torch.long, device=self.device)
385
+ new_review_indices = torch.arange(num_rows, num_rows + num_test_rows, device=self.device)
386
+
387
+ if total_users > self.num_users:
388
+ additional_user_feats = torch.zeros(total_users - self.num_users, 14, device=self.device)
389
+ graph['user'].x = torch.cat([graph['user'].x, additional_user_feats], dim=0)
390
+ if total_businesses > self.num_businesses:
391
+ additional_business_feats = torch.zeros(total_businesses - self.num_businesses, 8, device=self.device)
392
+ graph['business'].x = torch.cat([graph['business'].x, additional_business_feats], dim=0)
393
+
394
+ for i in range(num_test_rows):
395
+ user_idx = new_user_indices[i]
396
+ business_idx = new_business_indices[i]
397
+ if user_idx < graph['user'].x.size(0):
398
+ graph['user'].x[user_idx] += test_features[i, :14]
399
+ if business_idx < graph['business'].x.size(0):
400
+ graph['business'].x[business_idx] += test_features[i, 14:22]
401
+ graph['review'].x = torch.cat([graph['review'].x, test_features[:, 22:38]], dim=0)
402
+ graph['review'].y = torch.cat([graph['review'].y, test_y], dim=0)
403
+
404
+ graph['user', 'writes', 'review'].edge_index = torch.cat([
405
+ graph['user', 'writes', 'review'].edge_index,
406
+ torch.stack([new_user_indices, new_review_indices], dim=0)], dim=1)
407
+ graph['review', 'about', 'business'].edge_index = torch.cat([
408
+ graph['review', 'about', 'business'].edge_index,
409
+ torch.stack([new_review_indices, new_business_indices], dim=0)], dim=1)
410
+
411
+ all_time_since_user = torch.cat([time_since_user, test_time_since_user])
412
+ all_time_since_business = torch.cat([time_since_business, test_time_since_business])
413
+ all_user_indices = torch.cat([user_indices, new_user_indices])
414
+ all_business_indices = torch.cat([business_indices, new_business_indices])
415
+ all_review_indices = torch.cat([review_indices, new_review_indices])
416
+
417
+ user_writes_edge = graph['user', 'writes', 'review'].edge_index
418
+ review_about_edge = graph['review', 'about', 'business'].edge_index
419
+
420
+ edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
421
+ all_time_since_user[user_writes_edge[0]], all_time_since_user[user_writes_edge[1]],
422
+ all_user_indices[user_writes_edge[0]], all_user_indices[user_writes_edge[0]]
423
+ )
424
+ edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
425
+ all_time_since_business[review_about_edge[0]], all_time_since_business[review_about_edge[1]],
426
+ torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
427
+ )
428
+
429
+ self.num_users = total_users
430
+ self.num_businesses = total_businesses
431
+
432
+ test_user_time_since = self.test_df.groupby('user_id')['time_since_last_review_user'].min().reindex(
433
+ pd.Index(list(self.df['user_id'].unique()) + list(self.test_df['user_id'].unique())), fill_value=0).values
434
+ time_since_dict['user'] = torch.tensor(test_user_time_since[:total_users], dtype=torch.float, device=self.device)
435
+
436
+ spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
437
+
438
+ self.model.eval()
439
+ with torch.no_grad():
440
+ out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
441
+ pred_labels = (out.squeeze() > 0.5).float()
442
+ probs = out.squeeze().detach().cpu().numpy()
443
+ test_metrics = self.compute_metrics(graph['review'].y[-num_test_rows:].cpu().numpy(), pred_labels[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:], prefix="test_")
444
+ train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels[:num_rows].cpu().numpy(), probs[:num_rows], prefix="train_")
445
+ logger.info(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1: {test_metrics['test_f1']:.4f}, AUC-ROC: {test_metrics['test_auc_roc']:.4f}")
446
+
447
+ # Save metrics to file
448
+ metrics_file = Path(self.output_path) / f"metrics_{timestamp}.txt"
449
+ with open(metrics_file, 'w') as f:
450
+ f.write("Training Metrics (Final Epoch):\n")
451
+ for k, v in train_metrics.items():
452
+ f.write(f"{k}: {v}\n")
453
+ f.write("\nTest Metrics:\n")
454
+ for k, v in test_metrics.items():
455
+ f.write(f"{k}: {v}\n")
456
+
457
+ # Plotting and saving to output_path
458
+ plt.figure(figsize=(12, 8))
459
+ plt.plot([m['loss'] for m in train_metrics_history], label='Training Loss')
460
+ plt.xlabel('Epoch')
461
+ plt.ylabel('Loss')
462
+ plt.title('Training Loss Curve')
463
+ plt.legend()
464
+ plt.grid(True)
465
+ plt.savefig(Path(self.output_path) / f"loss_curve_{timestamp}.png")
466
+ plt.close()
467
+
468
+ plt.figure(figsize=(12, 8))
469
+ plt.plot([m['train_accuracy'] for m in train_metrics_history], label='Training Accuracy')
470
+ plt.xlabel('Epoch')
471
+ plt.ylabel('Accuracy')
472
+ plt.title('Training Accuracy Curve')
473
+ plt.legend()
474
+ plt.grid(True)
475
+ plt.savefig(Path(self.output_path) / f"accuracy_curve_{timestamp}.png")
476
+ plt.close()
477
+
478
+ plt.figure(figsize=(12, 8))
479
+ plt.plot([m['train_precision'] for m in train_metrics_history], label='Training Precision')
480
+ plt.plot([m['train_recall'] for m in train_metrics_history], label='Training Recall')
481
+ plt.plot([m['train_f1'] for m in train_metrics_history], label='Training F1-Score')
482
+ plt.xlabel('Epoch')
483
+ plt.ylabel('Score')
484
+ plt.title('Training Precision, Recall, and F1-Score Curves')
485
+ plt.legend()
486
+ plt.grid(True)
487
+ plt.savefig(Path(self.output_path) / f"prf1_curves_{timestamp}.png")
488
+ plt.close()
489
+
490
+ plt.figure(figsize=(12, 8))
491
+ plt.plot([m['train_auc_roc'] for m in train_metrics_history], label='Training AUC-ROC')
492
+ plt.xlabel('Epoch')
493
+ plt.ylabel('AUC-ROC')
494
+ plt.title('Training AUC-ROC Curve')
495
+ plt.legend()
496
+ plt.grid(True)
497
+ plt.savefig(Path(self.output_path) / f"auc_roc_curve_train_{timestamp}.png")
498
+ plt.close()
499
+
500
+ plt.figure(figsize=(8, 6))
501
+ sns.heatmap(test_metrics['test_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
502
+ plt.xlabel('Predicted')
503
+ plt.ylabel('True')
504
+ plt.title('Test Confusion Matrix')
505
+ plt.savefig(Path(self.output_path) / f"confusion_matrix_test_{timestamp}.png")
506
+ plt.close()
507
+
508
+ fpr, tpr, _ = roc_curve(graph['review'].y[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:])
509
+ plt.figure(figsize=(10, 6))
510
+ plt.plot(fpr, tpr, label=f'Test ROC Curve (AUC = {test_metrics["test_auc_roc"]:.4f})')
511
+ plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
512
+ plt.xlabel('False Positive Rate')
513
+ plt.ylabel('True Positive Rate')
514
+ plt.title('Test ROC Curve')
515
+ plt.legend()
516
+ plt.grid(True)
517
+ plt.savefig(Path(self.output_path) / f"roc_curve_test_{timestamp}.png")
518
+ plt.close()
519
+
520
+ plt.figure(figsize=(8, 6))
521
+ sns.heatmap(train_metrics['train_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
522
+ plt.xlabel('Predicted')
523
+ plt.ylabel('True')
524
+ plt.title('Training Confusion Matrix (Final Epoch)')
525
+ plt.savefig(Path(self.output_path) / f"confusion_matrix_train_{timestamp}.png")
526
+ plt.close()
527
+
528
+ fpr_train, tpr_train, _ = roc_curve(graph['review'].y[:num_rows].cpu().numpy(), probs[:num_rows])
529
+ plt.figure(figsize=(10, 6))
530
+ plt.plot(fpr_train, tpr_train, label=f'Training ROC Curve (AUC = {train_metrics["train_auc_roc"]:.4f})')
531
+ plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
532
+ plt.xlabel('False Positive Rate')
533
+ plt.ylabel('True Positive Rate')
534
+ plt.title('Training ROC Curve (Final Epoch)')
535
+ plt.legend()
536
+ plt.grid(True)
537
+ plt.savefig(Path(self.output_path) / f"roc_curve_train_{timestamp}.png")
538
+ plt.close()
539
+
540
+ logger.info(f"All metrics, plots, and model saved to {self.output_path}")
541
+
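Note: a hypothetical sketch of reloading the weights that run_model() saves (the output directory is a placeholder):

    import torch

    model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4)
    state = torch.load('output/GraphformerModel/model_GraphformerModel_latest.pth', map_location='cpu')
    model.load_state_dict(state)
    model.eval()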
src/.ipynb_checkpoints/model_trainer-checkpoint.py ADDED
@@ -0,0 +1,35 @@
1
+ from src.model import GraphformerModel
2
+ from pathlib import Path
3
+ from loguru import logger
4
+
5
+
6
+ class ModelTrainer:
7
+ def __init__(self, df, output_path, epochs=100,test_size=0.3):
8
+ self.df = df
9
+ self.output_path = output_path
10
+ self.epochs = epochs
11
+ self.test_size=test_size
12
+
13
+ # Create output directory
14
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
15
+
16
+ # Initialize the HeteroGraphormerModel
17
+
18
+ self.model = GraphformerModel(df=self.df, output_path=self.output_path, epochs=self.epochs,test_size=self.test_size)
19
+
20
+
21
+
22
+ logger.info(f"Initialized ModelTrainer with output_path: {self.output_path} and epochs: {self.epochs}")
23
+
24
+
25
+ def train_and_evaluate(self):
26
+
27
+ try:
28
+ logger.info("Starting model training and evaluation")
29
+ self.model.run_model()
30
+ logger.info("GraphformerModel training and evaluation completed successfully")
31
+ except Exception as e:
32
+ logger.error(f"Error during GraphformerModel training and evaluation: {e}")
33
+ raise
34
+
35
+
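Note: a hypothetical usage sketch of ModelTrainer (the CSV path and epoch count are placeholders):

    import pandas as pd

    df = pd.read_csv('data/preprocessed_dataset.csv')   # output of the preprocessing pipeline
    trainer = ModelTrainer(df, output_path='output', epochs=100, test_size=0.3)
    trainer.train_and_evaluate()                         # trains GraphformerModel and saves metrics and plots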
src/.ipynb_checkpoints/preprocessing-checkpoint.py ADDED
@@ -0,0 +1,831 @@
1
+ from loguru import logger
2
+ import pandas as pd
3
+ import json
4
+ from datetime import datetime
5
+ import ast
6
+ import numpy as np
7
+ from pymongo import MongoClient
8
+ from collections import defaultdict
9
+
10
+ from tqdm import tqdm
11
+ import time
12
+
13
+ import requests
14
+ import json
15
+ import os
16
+ import pandas as pd
17
+ import nltk
18
+ from nltk.tokenize import sent_tokenize, word_tokenize
19
+ from nltk.corpus import stopwords
20
+ from textblob import TextBlob
21
+ import re
22
+ from transformers import BertTokenizer, BertModel
23
+ from transformers import RobertaTokenizer, RobertaModel
24
+ import torch
25
+ from sklearn.metrics.pairwise import cosine_similarity
26
+ import numpy as np
27
+
28
+ # Download NLTK resources
29
+ nltk.download('punkt')
30
+ nltk.download('averaged_perceptron_tagger')
31
+ nltk.download('stopwords')
32
+ nltk.download('punkt_tab')
33
+ nltk.download('averaged_perceptron_tagger_eng')
34
+ class Preprocessor:
35
+ def __init__(self,df):
36
+ self.df=df
37
+ self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
38
+ self.model = RobertaModel.from_pretrained('roberta-base')
39
+ self.stop_words = set(stopwords.words('english'))
40
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Add this line
41
+
42
+
43
+
44
+ def get_bert_embedding(self, text):
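+ # Note: despite the name, this uses the RoBERTa tokenizer/model from __init__ and returns the mean-pooled last hidden state.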
45
+ inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
46
+ with torch.no_grad():
47
+ outputs = self.model(**inputs)
48
+ return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
49
+
50
+ def preprocess_text(self,text):
51
+ return text if pd.notna(text) else ""
52
+
53
+
54
+ def calculate_duration(self, time_range):
55
+ if not isinstance(time_range, str) or "-" not in time_range:
56
+ return None
57
+ start_str, end_str = time_range.split('-')
58
+ start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
59
+ end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
60
+ try:
61
+ start = datetime.strptime(start_str, '%H:%M')
62
+ end = datetime.strptime(end_str, '%H:%M')
63
+ duration = (end - start).total_seconds() / 3600
64
+ return duration if duration >= 0 else duration + 24
65
+ except ValueError:
66
+ return None
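Note (worked example): a range such as '9:0-17:0' yields 8.0 hours, while an overnight range such as '22:0-2:0' first gives -20 and is wrapped around to 4.0.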
67
+ def calculate_sentiment_severity(self, text):
68
+ if pd.isna(text) or not text.strip():
69
+ return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})
70
+
71
+ # Get sentiment polarity (-1 to 1)
72
+ blob = TextBlob(text)
73
+ polarity = blob.sentiment.polarity
74
+
75
+ # Define severity weights
76
+ good_weight = 0.7
77
+ bad_weight = 0.3
78
+
79
+ if polarity > 0:
80
+ good_severity = good_weight * polarity
81
+ bad_severity = 0.0
82
+ elif polarity < 0:
83
+ good_severity = 0.0
84
+ bad_severity = bad_weight * abs(polarity)
85
+ else: # Neutral (polarity = 0)
86
+ good_severity = 0.0
87
+ bad_severity = 0.0
88
+
89
+ return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})
90
+
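Note (worked example): a review with polarity +0.5 gets good_severity = 0.7 * 0.5 = 0.35 and bad_severity = 0; a review with polarity -0.5 gets good_severity = 0 and bad_severity = 0.3 * 0.5 = 0.15.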
91
+
92
+ def get_avg_duration(self, hours_str):
93
+ if pd.isna(hours_str) or not isinstance(hours_str, str):
94
+ return pd.NA
95
+ try:
96
+ hours_dict = ast.literal_eval(hours_str)
97
+ if not hours_dict:
98
+ return pd.NA
99
+ durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
100
+ valid_durations = [d for d in durations if d is not None]
101
+ return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
102
+ except (ValueError, SyntaxError, ZeroDivisionError):
103
+ return pd.NA
104
+
105
+
106
+ def calculate_time_since_last_review(self):
107
+ present_date = datetime.now()
108
+ user_latest_timestamp = {}
109
+
110
+ # Convert review_date to datetime
111
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
112
+
113
+ # Calculate hours difference for each user's latest review
114
+ for user_id in self.df["user_id"].unique():
115
+ latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()
116
+
117
+ if not isinstance(latest_date, datetime):
118
+ latest_date = latest_date.to_pydatetime()
119
+
120
+ hours_difference = (present_date - latest_date).total_seconds() / 3600
121
+ user_latest_timestamp[user_id] = hours_difference
122
+
123
+ # Map the hours difference to a new column
124
+ self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)
125
+
126
+ def calculate_time_since_last_review_business(self):
127
+ present_date = datetime.now()
128
+
129
+ # Ensure review_date is in datetime format
130
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
131
+
132
+ # Initialize dictionary to store hours since last review for each business
133
+ business_latest_timestamp = {}
134
+
135
+ # Iterate over unique business_ids
136
+ for business_id in self.df["business_id"].unique():
137
+ # Get the latest review date for this business
138
+ latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()
139
+
140
+ # Convert to datetime object if needed
141
+ if not isinstance(latest_date, datetime):
142
+ latest_date = latest_date.to_pydatetime()
143
+
144
+ # Calculate hours difference (already in hours)
145
+ hours_difference = (present_date - latest_date).total_seconds() / 3600
146
+ business_latest_timestamp[business_id] = hours_difference
147
+
148
+ # Map the hours difference to the new column
149
+ self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)
150
+
151
+
152
+
153
+ def calculate_user_account_age(self):
154
+ present_date = datetime.now()
155
+
156
+ # Convert yelping_since to datetime
157
+ self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
158
+
159
+ # Calculate user account age in days
160
+ self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
161
+
162
+
163
+ def calculate_avg_time_between_reviews(self):
164
+ # Ensure review_date is in datetime format
165
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
166
+
167
+ # Sort the DataFrame by user_id and review_date to ensure chronological order
168
+ self.df = self.df.sort_values(["user_id", "review_date"])
169
+
170
+ # Define helper function to calculate average time between reviews
171
+ def calculate_avg_time(group):
172
+ if len(group) == 1:
173
+ return 0 # If only one review, assign 0
174
+ # Calculate differences in hours between consecutive reviews
175
+ diffs = group["review_date"].diff().dt.total_seconds() / 3600
176
+ # Drop the first NaN (from diff) and compute the mean
177
+ return diffs.dropna().mean()
178
+
179
+ # Apply the function to each user_id group and create a mapping
180
+ avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)
181
+
182
+ # Map the average time back to the original DataFrame
183
+ self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)
184
+
185
+
186
+ def calculate_user_degree(self):
187
+ # Calculate the number of unique businesses per user
188
+ user_business_counts = self.df.groupby("user_id")["business_id"].nunique()
189
+
190
+ # Map the counts back to the original DataFrame
191
+ self.df["user_degree"] = self.df["user_id"].map(user_business_counts)
192
+
193
+
194
+ def calculate_business_degree(self):
195
+ # Calculate the number of unique users per business
196
+ business_user_counts = self.df.groupby("business_id")["user_id"].nunique()
197
+
198
+ # Map the counts back to the original DataFrame
199
+ self.df["business_degree"] = self.df["business_id"].map(business_user_counts)
200
+
201
+
202
+ def calculate_rating_variance_user(self):
203
+ # Calculate the mode (most frequent rating) per user
204
+ user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])
205
+
206
+ # Map the most frequent rating back to the original DataFrame
207
+ self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)
208
+
209
+
210
+ def calculate_user_review_burst_count(self):
211
+ # Ensure review_date is in datetime format
212
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
213
+
214
+ # Sort by user_id and review_date for chronological order
215
+ self.df = self.df.sort_values(["user_id", "review_date"])
216
+
217
+ # Function to calculate the max number of reviews in any 20-day window
218
+ def calculate_burst_count(group):
219
+ if len(group) <= 1:
220
+ return 0 # No burst if 1 or fewer reviews
221
+
222
+ # Convert review_date to a Series for rolling window
223
+ dates = group["review_date"]
224
+
225
+ # Calculate the number of reviews within 20 days of each review
226
+ burst_counts = []
227
+ for i, date in enumerate(dates):
228
+ # Count reviews within 20 days after this date
229
+ window_end = date + pd.Timedelta(days=20)
230
+ count = ((dates >= date) & (dates <= window_end)).sum()
231
+ burst_counts.append(count)
232
+
233
+ # Return the maximum burst count for this user
234
+ return max(burst_counts)
235
+
236
+ # Calculate the burst count per user
237
+ user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)
238
+
239
+ # Map the burst count back to the original DataFrame
240
+ self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)
241
+
242
+
243
+ def calculate_business_review_burst_count(self):
244
+ # Ensure review_date is in datetime format
245
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
246
+
247
+ # Sort by business_id and review_date for chronological order
248
+ self.df = self.df.sort_values(["business_id", "review_date"])
249
+
250
+ # Function to calculate the max number of reviews in any 10-day window
251
+ def calculate_burst_count(group):
252
+ if len(group) <= 1:
253
+ return 0 # No burst if 1 or fewer reviews
254
+
255
+ # Convert review_date to a Series for rolling window
256
+ dates = group["review_date"]
257
+
258
+ # Calculate the number of reviews within 10 days of each review
259
+ burst_counts = []
260
+ for i, date in enumerate(dates):
261
+ # Count reviews within 10 days after this date
262
+ window_end = date + pd.Timedelta(days=10)
263
+ count = ((dates >= date) & (dates <= window_end)).sum()
264
+ burst_counts.append(count)
265
+
266
+ # Return the maximum burst count for this business
267
+ return max(burst_counts)
268
+
269
+ # Calculate the burst count per business
270
+ business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)
271
+
272
+ # Map the burst count back to the original DataFrame
273
+ self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)
274
+
275
+
276
+ def calculate_temporal_similarity(self):
277
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
278
+
279
+ # Extract the day of the week (0 = Monday, 6 = Sunday)
280
+ self.df["day_of_week"] = self.df["review_date"].dt.dayofweek
281
+
282
+ # Function to calculate avg hours between reviews on frequent days
283
+ def calculate_avg_hours_on_frequent_days(group):
284
+ frequent_days = group["day_of_week"].mode().tolist()
285
+
286
+ if len(group) <= 1:
287
+ return 0
288
+
289
+ frequent_reviews = group[group["day_of_week"].isin(frequent_days)]
290
+
291
+ if len(frequent_reviews) <= 1:
292
+ return 0
293
+
294
+ frequent_reviews = frequent_reviews.sort_values("review_date")
295
+ diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600
296
+
297
+ return diffs.dropna().mean()
298
+
299
+ # Calculate average hours for each user
300
+ avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)
301
+
302
+ # Map the average hours to the new column
303
+ self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)
304
+
305
+ # Drop temporary column
306
+ self.df = self.df.drop(columns=["day_of_week"])
307
+
308
+
309
+ def calculate_rating_deviation_from_business_average(self):
310
+ # Calculate the average rating per business
311
+ business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()
312
+
313
+ # Map the average rating to each row
314
+ self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)
315
+
316
+ # Calculate the deviation from the business average
317
+ self.df["rating_deviation_from_business_average"] = (
318
+ self.df["review_stars"] - self.df["business_avg_rating"]
319
+ )
320
+
321
+ # Drop the temporary column
322
+ self.df = self.df.drop(columns=["business_avg_rating"])
323
+
324
+ def calculate_review_like_ratio(self):
325
+ # Create a binary column for liked reviews (stars >= 4)
326
+ self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)
327
+
328
+ # Calculate the like ratio per user
329
+ user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()
330
+
331
+ # Map the like ratio back to the DataFrame
332
+ self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)
333
+
334
+ # Drop the temporary column
335
+ self.df = self.df.drop(columns=["is_liked"])
336
+
337
+ def calculate_latest_checkin_hours(self):
338
+ self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
339
+
340
+ # Function to get the latest check-in date from a list of strings
341
+ def get_latest_checkin(checkin_list):
342
+ if not checkin_list or pd.isna(checkin_list): # Handle empty or NaN
343
+ return None
344
+ if isinstance(checkin_list, str):
345
+ checkin_dates = checkin_list.split(", ")
346
+ else:
347
+ checkin_dates = checkin_list
348
+ return pd.to_datetime(checkin_dates).max()
349
+
350
+ # Apply the function to get the latest check-in date per row
351
+ self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)
352
+
353
+ # Calculate the hours difference between latest check-in and yelping_since
354
+ self.df["latest_checkin_hours"] = (
355
+ (self.df["latest_checkin_date"] - self.df["yelping_since"])
356
+ .dt.total_seconds() / 3600
357
+ )
358
+
359
+ # Drop the temporary column
360
+ self.df = self.df.drop(columns=["latest_checkin_date"])
361
+ self.df["latest_checkin_hours"].fillna(0,inplace=True)
362
+
363
+
364
+ def compute_pronoun_density(self, text):
365
+ text = self.preprocess_text(text)
366
+ if not text:
367
+ return 0
368
+ words = word_tokenize(text.lower())
369
+ pos_tags = nltk.pos_tag(words)
370
+ pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
371
+ return pronouns / len(words) if words else 0
372
+
373
+ def compute_avg_sentence_length(self, text):
374
+ text = self.preprocess_text(text)
375
+ if not text:
376
+ return 0
377
+ sentences = sent_tokenize(text)
378
+ return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0
379
+
380
+ def compute_excessive_punctuation(self, text):
381
+ text = self.preprocess_text(text)
382
+ return len(re.findall(r'[!?.]{2,}', text))
383
+
384
+ def compute_sentiment_polarity(self, text):
385
+ text = self.preprocess_text(text)
386
+ return TextBlob(text).sentiment.polarity if text else 0
387
+
388
+ def compute_code_switching_flag(self, text):
389
+ text = self.preprocess_text(text)
390
+ if not text:
391
+ return 0
392
+
393
+ tokens = self.tokenizer.tokenize(text.lower())
394
+ if not tokens:
395
+ return 0
396
+
397
+ english_words = self.stop_words # Use self.stop_words from __init__
398
+ token_set = set(tokens)
399
+ english_count = sum(1 for token in tokens if token in english_words)
400
+
401
+ non_english_pattern = re.compile(r'[^\x00-\x7F]')
402
+ has_non_ascii = 1 if non_english_pattern.search(text) else 0
403
+
404
+ english_ratio = english_count / len(tokens) if tokens else 0
405
+
406
+ non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)
407
+
408
+ # Flag as code-switching if:
409
+ # 1. Mixed English presence (ratio between 0.1 and 0.9)
410
+ # 2. Non-ASCII characters present OR some non-English subword tokens
411
+ if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
412
+ return 1
413
+ return 0
414
+
415
+
416
+ def batch_tokenize(self, texts, batch_size=32, max_length=512):
417
+ tokenized_outputs = []
418
+ for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
419
+ batch_texts = texts[i:i + batch_size]
420
+ valid_texts = [self.preprocess_text(t) for t in batch_texts]
421
+ # Tokenize with fixed max_length to ensure consistent tensor sizes
422
+ inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
423
+ tokenized_outputs.append(inputs['input_ids'].to(self.device)) # Move to GPU
424
+ # Concatenate on GPU with consistent sizes
425
+ return torch.cat(tokenized_outputs, dim=0)
426
+
427
+ def compute_grammar_error_score(self, texts, tokenized_ids):
428
+ print("Computing grammar error scores...")
429
+ error_scores = np.zeros(len(texts), dtype=float)
430
+
431
+ vocab_set = set(self.tokenizer.get_vocab().keys())
432
+ for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
433
+ if input_ids.sum() == 0: # Empty input
434
+ continue
435
+ tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
436
+ unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
437
+ total_count = len([t for t in tokens if t not in self.stop_words])
438
+ error_scores[i] = unknown_count / total_count if total_count > 0 else 0
439
+
440
+ return error_scores
441
+
442
+ def compute_repetitive_words_count(self, texts, tokenized_ids):
443
+ print("Computing repetitive words counts...")
444
+ rep_counts = np.zeros(len(texts), dtype=int)
445
+
446
+ for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
447
+ if input_ids.sum() == 0: # Empty input
448
+ continue
449
+ tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
450
+ valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
451
+ if valid_tokens:
452
+ token_counts = {}
453
+ for token in valid_tokens:
454
+ token_counts[token] = token_counts.get(token, 0) + 1
455
+ rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)
456
+
457
+ return rep_counts
458
+
459
+ def preprocess_text_for_similarity(self, text):
460
+ if pd.isna(text) or not text.strip():
461
+ return []
462
+ return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]
463
+
464
+ def batch_encode_words(self, texts, batch_size=32, max_length=512):
465
+ word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
466
+ vocab = {word: idx + 1 for idx, word in enumerate(set().union(*(set(w) for w in word_lists if w)))}  # set().union avoids a TypeError when every text is empty
467
+
468
+ encoded_batches = []
469
+ for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
470
+ batch_words = word_lists[i:i + batch_size]
471
+ encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
472
+ for j, words in enumerate(batch_words):
473
+ if words:
474
+ word_ids = [vocab.get(w, 0) for w in words][:max_length]
475
+ encoded[j, :len(word_ids)] = word_ids
476
+ encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
477
+ encoded_batches.append(encoded_tensor)
478
+
479
+ return torch.cat(encoded_batches, dim=0), vocab
480
+
481
+ def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
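+ # Jaccard similarity between a review's word-id set and the pooled word-ids of reviews written by all other users.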
482
+ all_texts = self.df["review_text"].tolist()
483
+ all_users = self.df["user_id"].tolist()
484
+ all_review_ids = self.df["review_id"].tolist()
485
+
486
+ encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)
487
+
488
+ similarity_scores = {rid: 0.0 for rid in all_review_ids} # Default scores
489
+ for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
490
+ if pd.isna(review_id) or pd.isna(user_id):
491
+ continue
492
+
493
+ current_words = encoded_words[i]
494
+ if current_words.sum() == 0:
495
+ continue
496
+
497
+ other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
498
+ dtype=torch.long).to(self.device)
499
+ if not other_indices.numel():
500
+ continue
501
+
502
+ other_words = encoded_words[other_indices]
503
+ current_set = torch.unique(current_words[current_words > 0])
504
+ other_flat = other_words[other_words > 0]
505
+
506
+ if other_flat.numel() == 0:
507
+ continue
508
+
509
+ other_set = torch.unique(other_flat)
510
+ intersection = torch.sum(torch.isin(current_set, other_set)).float()
511
+ union = torch.unique(torch.cat([current_set, other_set])).numel()
512
+ similarity = intersection / union if union > 0 else 0.0
513
+
514
+ similarity_scores[review_id] = similarity.item()
515
+ return pd.Series(similarity_scores, index=all_review_ids)
516
+
517
+ def calculate_friend_count(self):
518
+ friends = []
519
+ for v in self.df["friends"]:
520
+ if isinstance(v, str):
521
+ friends.append(len(v.split(",")))
522
+ elif isinstance(v, (int, float)):  # covers NaN as well, since NaN is a float
523
+ friends.append(0)
524
+ self.df["friends"] = friends
525
+
526
+ def count_elite_years(self, elite):
527
+ if pd.isna(elite):
528
+ return 0
529
+ return len(str(elite).split(","))
530
+
531
+ def transform_elite_status(self):
532
+ self.df["elite"] = self.df["elite"].apply(lambda x: True if self.count_elite_years(x) > 1 else False)
533
+
534
+ def calculate_review_useful_funny_cool(self):
535
+ self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
536
+ self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
537
+ self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
538
+ self.df["review_useful_funny_cool"] = (
539
+ self.df["review_useful"] +
540
+ self.df["review_funny"] +
541
+ self.df["review_cool"]
542
+ )
543
+ self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)
544
+
545
+
546
+ def calculate_user_useful_funny_cool(self):
547
+ self.df["user_useful_funny_cool"] = (
548
+ self.df["user_useful"] +
549
+ self.df["user_funny"] +
550
+ self.df["user_cool"]
551
+ )
552
+ self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)
553
+
554
+ def compute_fake_score(self, row):
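+ # Rule-based weak label: one suspicion point per triggered linguistic, review-pattern or user-behaviour signal; 3 or more points marks the review as fake.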
555
+ suspicion_points = 0
556
+
557
+ # Linguistic Features
558
+ if row["pronoun_density"] < 0.01: # Low personal engagement
559
+ suspicion_points += 1
560
+ if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30: # Extreme lengths
561
+ suspicion_points += 1
562
+ if row["grammar_error_score"] > 5: # Many errors
563
+ suspicion_points += 1
564
+ if row["repetitive_words_count"] > 5: # High repetition
565
+ suspicion_points += 1
566
+ if row["code_switching_flag"] == 1: # Language mixing
567
+ suspicion_points += 1
568
+ if row["excessive_punctuation_count"] > 3: # Overuse of punctuation
569
+ suspicion_points += 1
570
+ if abs(row["sentiment_polarity"]) > 0.8: # Extreme sentiment
571
+ suspicion_points += 1
572
+
573
+ # Review Patterns
574
+ if row["similarity_to_other_reviews"] > 0.8: # High duplication
575
+ suspicion_points += 1
576
+ if row["user_review_burst_count"] > 5: # Spammy bursts
577
+ suspicion_points += 1
578
+ if row["business_review_burst_count"] > 5: # Targeted bursts
579
+ suspicion_points += 1
580
+ if abs(row["rating_deviation_from_business_average"]) > 2: # Large rating deviation
581
+ suspicion_points += 1
582
+ if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1: # Extreme like ratio
583
+ suspicion_points += 1
584
+
585
+ # User Behavior
586
+ if row["user_account_age"] < 30: # Very new account (days)
587
+ suspicion_points += 1
588
+ if row["average_time_between_reviews"] < 24: # Rapid reviews (hours)
589
+ suspicion_points += 1
590
+ if row["user_degree"] < 2: # Low business interaction
591
+ suspicion_points += 1
592
+ if row["time_since_last_review_user"] < 24: # Recent burst (hours)
593
+ suspicion_points += 1
594
+
595
+ # Threshold: 3 or more points = fake
596
+ return 1 if suspicion_points >= 3 else 0
597
+
598
+
599
+ def run_pipeline(self):
600
+
601
+
602
+
603
+ logger.info("FINALYZING HOURS COLUMN ...")
604
+ self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
605
+ self.df["hours"] = self.df["hours"].fillna(0)
606
+ print(self.df["hours"][:10])
607
+ print(self.df["hours"].isnull().sum())
608
+
609
+
610
+
611
+
612
+ logger.info("FINALYZING ATTRIBUTES COLUMN ...")
613
+ self.df.drop("attributes",axis=1,inplace=True)
614
+
615
+
616
+
617
+ logger.info("CREATING time_since_last_review_user COLUMN ...")
618
+ self.calculate_time_since_last_review()
619
+ print(np.unique(self.df["time_since_last_review_user"] ))
620
+
621
+
622
+ logger.info("CREATING time_since_last_review_business COLUMN ...")
623
+ self.calculate_time_since_last_review_business()
624
+ print(np.unique(self.df["time_since_last_review_business"] ))
625
+
626
+
627
+
628
+ logger.info("CREATING user_account_age COLUMN ...")
629
+ self.calculate_user_account_age()
630
+ print(np.unique(self.df["user_account_age"] ))
631
+
632
+
633
+
634
+ logger.info("CREATING average_time_between_reviews COLUMN ...")
635
+ self.calculate_avg_time_between_reviews()
636
+ print(np.unique(self.df["average_time_between_reviews"] ))
637
+
638
+
639
+
640
+ logger.info("CREATING user_degree COLUMN ...")
641
+ self.calculate_user_degree()
642
+ print(np.unique(self.df["user_degree"] ))
643
+
644
+
645
+ logger.info("CREATING business_degree COLUMN ...")
646
+ self.calculate_business_degree()
647
+ print(np.unique(self.df["business_degree"] ))
648
+
649
+
650
+ logger.info("CREATING rating_variance_user COLUMN ...")
651
+ self.calculate_rating_variance_user()
652
+ print(np.unique(self.df["rating_variance_user"] ))
653
+
654
+
655
+
656
+ logger.info("CREATING user_review_burst_count COLUMN ...")
657
+ self.calculate_user_review_burst_count()
658
+ print(np.unique(self.df["user_review_burst_count"] ))
659
+
660
+
661
+ logger.info("CREATING business_review_burst_count COLUMN ...")
662
+ self.calculate_business_review_burst_count()
663
+ print(np.unique(self.df["business_review_burst_count"] ))
664
+
665
+
666
+
667
+ logger.info("CREATING temporal_similarity COLUMN ...")
668
+ self.calculate_temporal_similarity()
669
+ print(np.unique(self.df["temporal_similarity"] ))
670
+
671
+
672
+
673
+ logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
674
+ self.calculate_rating_deviation_from_business_average()
675
+ print(np.unique(self.df["rating_deviation_from_business_average"] ))
676
+
677
+
678
+
679
+ logger.info("CREATING review_like_ratio COLUMN ...")
680
+ self.calculate_review_like_ratio()
681
+ print(np.unique(self.df["review_like_ratio"] ))
682
+
683
+
684
+
685
+ logger.info("CREATING latest_checkin_hours COLUMN ...")
686
+ self.calculate_latest_checkin_hours()
687
+ print(np.unique(self.df["latest_checkin_hours"] ))
688
+
689
+
690
+
691
+
692
+ logger.info("CREATING pronoun_density COLUMN ...")
693
+ self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
694
+ print(np.unique(self.df["pronoun_density"] ))
695
+
696
+ logger.info("CREATING avg_sentence_length COLUMN ...")
697
+ self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
698
+ print(np.unique(self.df["avg_sentence_length"] ))
699
+
700
+ logger.info("CREATING excessive_punctuation_count COLUMN ...")
701
+ self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
702
+ print(np.unique(self.df["excessive_punctuation_count"] ))
703
+
704
+ logger.info("CREATING sentiment_polarity COLUMN ...")
705
+ self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
706
+ print(np.unique(self.df["sentiment_polarity"] ))
707
+
708
+ logger.info("CREATING good_severity and bad_severity COLUMNS ...")
709
+ severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
710
+ self.df[["good_severity", "bad_severity"]] = severity_scores
711
+ print(np.unique(self.df["good_severity"] ))
712
+ print(np.unique(self.df["bad_severity"] ))
713
+
714
+
715
+ logger.info("CREATING code_switching_flag COLUMN ...")
716
+ self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
717
+ print(np.unique(self.df["code_switching_flag"] ))
718
+
719
+
720
+ all_texts = self.df["review_text"].tolist()
721
+ tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
722
+
723
+ logger.info("CREATING grammar_error_score COLUMN ...")
724
+ self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
725
+ print(np.unique(self.df["grammar_error_score"] ))
726
+
727
+
728
+ logger.info("CREATING repetitive_words_count COLUMN ...")
729
+ self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
730
+ print(np.unique(self.df["repetitive_words_count"] ))
731
+
732
+
733
+
734
+ logger.info("CREATING similarity_to_other_reviews COLUMN ...")
735
+ similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
736
+ self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
737
+
738
+ print(np.unique(self.df["similarity_to_other_reviews"] ))
739
+
740
+
741
+
742
+ logger.info("CREATING friends COLUMN ...")
743
+ self.calculate_friend_count()
744
+ print(self.df["friends"].value_counts())
745
+
746
+ logger.info("CREATING elite COLUMN ...")
747
+ self.transform_elite_status()
748
+ print(self.df["elite"].value_counts())
749
+
750
+
751
+ logger.info("CREATING review_useful_funny_cool COLUMN ...")
752
+ self.calculate_review_useful_funny_cool()
753
+ print(self.df["review_useful_funny_cool"].value_counts())
754
+
755
+
756
+ logger.info("CREATING user_useful_funny_cool COLUMN ...")
757
+ self.calculate_user_useful_funny_cool()
758
+ print(self.df["user_useful_funny_cool"].value_counts())
759
+
760
+
761
+ logger.info("CREATING LABEL COLUMN ...")
762
+ self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
763
+ print(self.df["fake"].value_counts())
764
+
765
+
766
+ logger.info("SEEING NULL VALUES IN FINAL COLUMNS.....")
767
+ print(set(self.df.isnull().sum().values))
768
+ for col in self.df.columns:
769
+ if self.df[col].isnull().sum()>0:
770
+ print(f" {col} has {self.df[col].isnull().sum()} null values")
771
+
772
+
773
+
774
+ return self.df
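+ # Example usage (sketch only; the names below are illustrative, not part of this file):
+ # engineer = <PreprocessingClassDefinedAbove>(merged_df)  # construct with the merged review/user/business DataFrame
+ # df_with_features = engineer.run_pipeline()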
src/__pycache__/analyze_yelp_data.cpython-311.pyc ADDED
Binary file (20.6 kB). View file
 
src/__pycache__/clean_data.cpython-311.pyc ADDED
Binary file (6.21 kB). View file
 
src/__pycache__/clean_data.cpython-39.pyc ADDED
Binary file (3.03 kB). View file
 
src/__pycache__/create_dataset.cpython-311.pyc ADDED
Binary file (7.58 kB). View file
 
src/__pycache__/create_dataset.cpython-39.pyc ADDED
Binary file (3.98 kB). View file
 
src/__pycache__/data_balancing.cpython-311.pyc ADDED
Binary file (12.6 kB). View file
 
src/__pycache__/feature_analyzer.cpython-311.pyc ADDED
Binary file (17.3 kB). View file
 
src/__pycache__/feature_analyzer.cpython-39.pyc ADDED
Binary file (8.73 kB). View file
 
src/__pycache__/feature_importance.cpython-311.pyc ADDED
Binary file (10.6 kB). View file
 
src/__pycache__/model.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
src/__pycache__/model.cpython-39.pyc ADDED
Binary file (17.6 kB). View file
 
src/__pycache__/model1.cpython-311.pyc ADDED
Binary file (42.9 kB). View file
 
src/__pycache__/model1.cpython-39.pyc ADDED
Binary file (17.2 kB). View file
 
src/__pycache__/model3.cpython-311.pyc ADDED
Binary file (44 kB). View file
 
src/__pycache__/model3.cpython-39.pyc ADDED
Binary file (17.6 kB). View file
 
src/__pycache__/model_trainer.cpython-311.pyc ADDED
Binary file (2.31 kB). View file
 
src/__pycache__/model_trainer.cpython-39.pyc ADDED
Binary file (1.32 kB). View file
 
src/__pycache__/models.cpython-311.pyc ADDED
Binary file (45.6 kB). View file
 
src/__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (50.7 kB). View file
 
src/__pycache__/preprocessing.cpython-39.pyc ADDED
Binary file (24.4 kB). View file
 
src/analyze_yelp_data.py ADDED
@@ -0,0 +1,320 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from transformers import AutoTokenizer, AutoModel
4
+ import torch
5
+ from sklearn.ensemble import IsolationForest
6
+ from sklearn.preprocessing import StandardScaler
7
+ from textblob import TextBlob
8
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
9
+ from sklearn.feature_extraction.text import CountVectorizer
10
+ from sklearn.decomposition import PCA
11
+ import warnings
12
+ from typing import Dict, List, Tuple
13
+ import logging
14
+ from collections import Counter
15
+ from detoxify import Detoxify
16
+ import re
17
+ from datetime import datetime
18
+ import seaborn as sns
19
+ import matplotlib.pyplot as plt
20
+ from pathlib import Path
21
+ import json
22
+
23
+ class AdvancedYelpAnalyzer:
24
+ def __init__(self, df: pd.DataFrame):
25
+ """Initialize the analyzer with necessary models and configurations"""
26
+ self.df = df.copy()
27
+ self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
28
+ self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
29
+ self.vader = SentimentIntensityAnalyzer()
30
+ self.toxic_model = Detoxify('original')
31
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
32
+ self.bert_model.to(self.device)
33
+
34
+ # Configure logging
35
+ logging.basicConfig(level=logging.INFO)
36
+ self.logger = logging.getLogger(__name__)
37
+
38
+ def get_bert_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
39
+ """Generate BERT embeddings for text"""
40
+ embeddings = []
41
+
42
+ for i in range(0, len(texts), batch_size):
43
+ batch_texts = texts[i:i + batch_size]
44
+ encoded = self.bert_tokenizer(batch_texts,
45
+ padding=True,
46
+ truncation=True,
47
+ max_length=512,
48
+ return_tensors='pt')
49
+
50
+ with torch.no_grad():
51
+ encoded = {k: v.to(self.device) for k, v in encoded.items()}
52
+ outputs = self.bert_model(**encoded)
53
+ batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
54
+ embeddings.append(batch_embeddings)
55
+
56
+ return np.vstack(embeddings)
57
+
58
+ def analyze_sentiment(self) -> pd.DataFrame:
59
+ """Perform comprehensive sentiment analysis using multiple tools"""
60
+ self.logger.info("Starting sentiment analysis...")
61
+
62
+ # Calculate BERT embeddings for reviews
63
+ self.logger.info("Calculating BERT embeddings...")
64
+ review_texts = self.df['review_text'].fillna('').tolist()
65
+ bert_embeddings = self.get_bert_embeddings(review_texts)
66
+
67
+ # Calculate review length using BERT tokenizer
68
+ self.logger.info("Calculating tokenized lengths...")
69
+ self.df['review_length'] = self.df['review_text'].apply(
70
+ lambda x: len(self.bert_tokenizer.encode(str(x)))
71
+ )
72
+
73
+ # Store BERT embeddings mean and std as features
74
+ self.df['bert_embedding_mean'] = np.mean(bert_embeddings, axis=1)
75
+ self.df['bert_embedding_std'] = np.std(bert_embeddings, axis=1)
76
+
77
+ # TextBlob sentiment and subjectivity
78
+ self.df['textblob_polarity'] = self.df['review_text'].apply(
79
+ lambda x: TextBlob(str(x)).sentiment.polarity
80
+ )
81
+ self.df['textblob_subjectivity'] = self.df['review_text'].apply(
82
+ lambda x: TextBlob(str(x)).sentiment.subjectivity
83
+ )
84
+
85
+ # VADER sentiment with custom negative phrase handling
86
+ def get_enhanced_vader_scores(text):
87
+ # Custom negative phrases
88
+ negative_phrases = [
89
+ 'too long', 'way too long', 'waiting', 'changed our minds',
90
+ 'too many', 'took forever', 'took too long', 'waste of time',
91
+ 'not worth', 'disappointing', 'mediocre', 'needs improvement'
92
+ ]
93
+
94
+ # Get base VADER scores
95
+ base_scores = self.vader.polarity_scores(str(text))
96
+
97
+ # Check for negative phrases
98
+ text_lower = str(text).lower()
99
+ neg_count = sum(1 for phrase in negative_phrases if phrase in text_lower)
100
+
101
+ # Adjust scores if negative phrases are found
102
+ if neg_count > 0:
103
+ base_scores['neg'] = max(base_scores['neg'], min(0.7, neg_count * 0.2))
104
+ base_scores['compound'] *= (1 - (neg_count * 0.15))
105
+ # Readjust neutral score
106
+ base_scores['neu'] = max(0, 1 - base_scores['neg'] - base_scores['pos'])
107
+
108
+ return base_scores
109
+
110
+ # Apply enhanced VADER scoring
111
+ vader_scores = self.df['review_text'].apply(get_enhanced_vader_scores)
112
+ self.df['vader_compound'] = vader_scores.apply(lambda x: x['compound'])
113
+ self.df['vader_negative'] = vader_scores.apply(lambda x: x['neg'])
114
+ self.df['vader_positive'] = vader_scores.apply(lambda x: x['pos'])
115
+ self.df['vader_neutral'] = vader_scores.apply(lambda x: x['neu'])
116
+
117
+ # Calculate sentiment extremity
118
+ self.df['sentiment_extremity'] = self.df['vader_compound'].abs()
119
+
120
+ return self.df
121
+
122
+ def detect_anomalies(self) -> pd.DataFrame:
123
+ """Detect anomalous reviews using Isolation Forest with BERT features"""
124
+ self.logger.info("Detecting anomalies...")
125
+
126
+ # Prepare features for anomaly detection
127
+ features = [
128
+ 'review_stars',
129
+ 'textblob_polarity',
130
+ 'vader_compound',
131
+ 'sentiment_extremity',
132
+ 'review_length',
133
+ 'bert_embedding_mean',
134
+ 'bert_embedding_std'
135
+ ]
136
+
137
+ # Ensure all features exist
138
+ missing_features = [f for f in features if f not in self.df.columns]
139
+ if missing_features:
140
+ self.analyze_sentiment()
141
+
142
+ # Standardize features
143
+ scaler = StandardScaler()
144
+ X = scaler.fit_transform(self.df[features])
145
+
146
+ # Apply Isolation Forest
147
+ iso_forest = IsolationForest(
148
+ contamination=0.1,
149
+ random_state=42,
150
+ n_jobs=-1
151
+ )
152
+
153
+ # Fit and predict
154
+ self.df['is_anomaly'] = iso_forest.fit_predict(X)
155
+ self.df['anomaly_score'] = iso_forest.score_samples(X)
156
+
157
+ return self.df
158
+
159
+ def detect_ai_generated_text(self) -> pd.DataFrame:
160
+ """Estimate likelihood of AI-generated content"""
161
+ self.logger.info("Detecting AI-generated content...")
162
+
163
+ # Ensure sentiment analysis has been run
164
+ if 'textblob_subjectivity' not in self.df.columns:
165
+ self.analyze_sentiment()
166
+
167
+ # Use detoxify model to get toxicity scores
168
+ texts = self.df['review_text'].fillna('').tolist()
169
+ toxic_scores = self.toxic_model.predict(texts)
170
+
171
+ # Add scores to DataFrame
172
+ toxic_score_types = ['toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
173
+ 'insult', 'threat', 'sexual_explicit']
174
+ for score_type in toxic_score_types:
175
+ if score_type in toxic_scores:
176
+ self.df[f'toxic_{score_type}'] = toxic_scores[score_type]
177
+
178
+ # Calculate AI generation likelihood based on various factors
179
+ self.df['ai_generated_likelihood'] = (
180
+ (self.df['textblob_subjectivity'] < 0.3) & # Low subjectivity
181
+ (self.df['sentiment_extremity'] > 0.8) & # Extreme sentiment
182
+ (self.df['review_length'] > self.df['review_length'].quantile(0.95)) & # Unusually long
183
+ (self.df['bert_embedding_std'] < self.df['bert_embedding_std'].quantile(0.25)) # Unusual language patterns
184
+ ).astype(int)
185
+
186
+ # Add additional AI detection features
187
+ self.df['ai_detection_score'] = (
188
+ (self.df['textblob_subjectivity'] * -1) + # Lower subjectivity increases score
189
+ (self.df['sentiment_extremity'] * 0.5) + # Extreme sentiment contributes somewhat
190
+ (self.df['bert_embedding_std'] * -0.5) # Lower variation in embeddings increases score
191
+ ).clip(0, 1) # Normalize between 0 and 1
192
+
193
+ return self.df
194
+
195
+ def analyze_business_categories(self) -> Dict:
196
+ """Analyze trends and patterns specific to business categories"""
197
+ self.logger.info("Analyzing business categories...")
198
+
199
+ # Extract categories
200
+ categories = self.df['categories'].fillna('').str.split(', ')
201
+ all_categories = [cat for cats in categories if isinstance(cats, list) for cat in cats]
202
+ category_counts = Counter(all_categories)
203
+
204
+ # Analyze reviews by category
205
+ category_analysis = {}
206
+ for category in set(all_categories):
207
+ category_reviews = self.df[self.df['categories'].str.contains(category, na=False)]
208
+
209
+ category_analysis[category] = {
210
+ 'review_count': len(category_reviews),
211
+ 'avg_rating': category_reviews['review_stars'].mean() if not category_reviews.empty else None,
212
+ 'avg_sentiment': category_reviews['vader_compound'].mean() if 'vader_compound' in self.df.columns and not category_reviews.empty else None,
213
+ 'avg_subjectivity': category_reviews['textblob_subjectivity'].mean() if 'textblob_subjectivity' in self.df.columns and not category_reviews.empty else None
214
+ }
215
+
216
+ return category_analysis
217
+
218
+ def visualize_results(self, output_dir: str):
219
+ """Create visualizations for analysis results"""
220
+ plt.figure(figsize=(15, 10))
221
+
222
+ # Sentiment Distribution
223
+ plt.subplot(2, 2, 1)
224
+ sns.histplot(data=self.df, x='vader_compound', bins=50)
225
+ plt.title('Sentiment Distribution')
226
+
227
+ # Review Volume Over Time
228
+ plt.subplot(2, 2, 2)
229
+ daily_reviews = self.df.groupby('review_date').size()
230
+ daily_reviews.plot()
231
+ plt.title('Review Volume Over Time')
232
+
233
+ # Anomaly Score Distribution
234
+ plt.subplot(2, 2, 3)
235
+ if 'anomaly_score' not in self.df.columns:
236
+ self.detect_anomalies()
237
+ sns.histplot(data=self.df, x='anomaly_score', bins=50)
238
+ plt.title('Anomaly Score Distribution')
239
+
240
+ # AI Generation Likelihood
241
+ plt.subplot(2, 2, 4)
242
+ if 'ai_generated_likelihood' not in self.df.columns:
243
+ self.detect_ai_generated_text()
244
+ sns.histplot(data=self.df, x='ai_generated_likelihood', bins=2)
245
+ plt.title('AI Generation Likelihood')
246
+
247
+ plt.tight_layout()
248
+ plt.savefig(f'{output_dir}/analysis_results.png')
249
+ plt.close()
250
+
251
+ def run_full_analysis(self, output_dir: str) -> Tuple[pd.DataFrame, Dict]:
252
+ """Run complete analysis pipeline with detailed outputs"""
253
+ self.logger.info("Starting full analysis pipeline...")
254
+
255
+ # Create output directory if it doesn't exist
256
+ output_dir = Path(output_dir)
257
+ output_dir.mkdir(parents=True, exist_ok=True)
258
+
259
+ try:
260
+ # Run all analyses
261
+ self.analyze_sentiment()
262
+ self.detect_anomalies()
263
+ self.detect_ai_generated_text()
264
+ category_analysis = self.analyze_business_categories()
265
+
266
+ # Create visualizations
267
+ self.visualize_results(str(output_dir))
268
+
269
+ # Compile results
270
+ analysis_results = {
271
+ 'category_analysis': category_analysis,
272
+ 'sentiment_summary': {
273
+ 'avg_sentiment': self.df['vader_compound'].mean(),
274
+ 'positive_reviews': len(self.df[self.df['vader_compound'] > 0.5]),
275
+ 'negative_reviews': len(self.df[self.df['vader_compound'] < -0.5]),
276
+ 'neutral_reviews': len(self.df[abs(self.df['vader_compound']) <= 0.5])
277
+ },
278
+ 'ai_detection_summary': {
279
+ 'likely_ai_generated': len(self.df[self.df['ai_generated_likelihood'] == 1]),
280
+ 'avg_ai_score': self.df['ai_detection_score'].mean()
281
+ },
282
+ 'anomaly_summary': {
283
+ 'anomalous_reviews': len(self.df[self.df['is_anomaly'] == -1]),
284
+ 'avg_anomaly_score': self.df['anomaly_score'].mean()
285
+ }
286
+ }
287
+
288
+ # Save results
289
+ self.df.to_csv(output_dir / "analyzed_data.csv", index=False)
290
+ with open(output_dir / "analysis_results.json", 'w') as f:
291
+ json.dump(analysis_results, f, indent=4)
292
+
293
+ return self.df, analysis_results
294
+
295
+ except Exception as e:
296
+ self.logger.error(f"Error during analysis: {str(e)}")
297
+ raise
298
+
299
+ # For testing
300
+ if __name__ == "__main__":
301
+ # Set up logging
302
+ logging.basicConfig(level=logging.INFO)
303
+ logger = logging.getLogger(__name__)
304
+
305
+ try:
306
+ # Read test data
307
+ df = pd.read_csv("test_data.csv")
308
+
309
+ # Initialize analyzer
310
+ analyzer = AdvancedYelpAnalyzer(df)
311
+
312
+ # Run analysis
313
+ output_dir = "output"
314
+ analyzed_df, results = analyzer.run_full_analysis(output_dir)
315
+
316
+ logger.info("Analysis completed successfully!")
317
+
318
+ except Exception as e:
319
+ logger.error(f"Error during testing: {str(e)}")
320
+ raise
src/clean_data.py ADDED
@@ -0,0 +1,83 @@
1
+ # clean_yelp_data.py
2
+ from loguru import logger
3
+ import pandas as pd
4
+ import numpy as np
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Optional, Tuple
7
+ import json
8
+ from pathlib import Path
9
+ import logging
10
+ from scipy.stats import entropy
11
+ import warnings
12
+ from datetime import datetime
13
+ import matplotlib.pyplot as plt
14
+ import seaborn as sns
15
+ import re
16
+ from textblob import TextBlob
17
+ import os
18
+ from pathlib import Path
19
+
20
+ class DataCleaner:
21
+ def __init__(self,df,output_path,filename="preprocessed_cleaned.csv"):
22
+ self.df=df
23
+ self.output_path=output_path
24
+ self.filename=filename
25
+ def saving_cleaned_preprocess(self):
26
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
27
+
28
+ output_file = Path(self.output_path) / self.filename
29
+ logger.info(f"Files saved in directory {output_file} as : { self.filename}")
30
+ self.df.to_csv(output_file, index=False)
31
+
32
+ def dropping_unncessary_columns(self):
33
+ self.df.drop("review_text", axis=1, inplace=True)
34
+ self.df.drop("review_date", axis=1, inplace=True)
35
+ self.df.drop("business_name", axis=1, inplace=True)
36
+ self.df.drop("address", axis=1, inplace=True)
37
+ self.df.drop("city", axis=1, inplace=True)
38
+ self.df.drop("state", axis=1, inplace=True)
39
+ self.df.drop("postal_code", axis=1, inplace=True)
40
+ self.df.drop("categories", axis=1, inplace=True)
41
+ self.df.drop("user_name", axis=1, inplace=True)
42
+ self.df.drop("yelping_since", axis=1, inplace=True)
43
+ self.df.drop("checkin_date", axis=1, inplace=True)
44
+ self.df.drop("review_useful", axis=1, inplace=True)
45
+ self.df.drop("review_funny", axis=1, inplace=True)
46
+ self.df.drop("review_cool", axis=1, inplace=True)
47
+ self.df.drop("user_useful", axis=1, inplace=True)
48
+ self.df.drop("user_funny", axis=1, inplace=True)
49
+ self.df.drop("user_cool", axis=1, inplace=True)
50
+ self.df.drop("is_open", axis=1, inplace=True)
51
+ self.df.drop("compliment_hot", axis=1, inplace=True)
52
+ self.df.drop("compliment_more", axis=1, inplace=True)
53
+ self.df.drop("compliment_profile", axis=1, inplace=True)
54
+ self.df.drop("compliment_cute", axis=1, inplace=True)
55
+ self.df.drop("compliment_list", axis=1, inplace=True)
56
+ self.df.drop("compliment_note", axis=1, inplace=True)
57
+ self.df.drop("compliment_plain", axis=1, inplace=True)
58
+ self.df.drop("compliment_cool", axis=1, inplace=True)
59
+ self.df.drop("compliment_funny", axis=1, inplace=True)
60
+ self.df.drop("compliment_writer", axis=1, inplace=True)
61
+ self.df.drop("compliment_photos", axis=1, inplace=True)
62
+
63
+ def run_pipeline(self):
64
+ logger.info("Dropping Unnecessary Columns")
65
+ self.dropping_unncessary_columns()
66
+
67
+ logger.info("Checking Again for NULL values in Columns")
68
+ for col in self.df.columns:
69
+ if self.df[col].isnull().sum()>0:
70
+ print(f" {col} has {self.df[col].isnull().sum()} null values")
71
+
72
+
73
+ logger.info("Saving Cleaned and Preprocessed Data")
74
+ self.saving_cleaned_preprocess()
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
src/create_dataset.py ADDED
@@ -0,0 +1,217 @@
1
+ import pandas as pd
2
+ import ujson as json
3
+ import gc
4
+ import numpy as np
5
+ from concurrent.futures import ProcessPoolExecutor
6
+ import multiprocessing as mp
7
+ from pymongo import MongoClient
8
+ from collections import defaultdict
9
+ from pathlib import Path
10
+
11
+ # def read_json_parallel(file_path, num_workers=None):
12
+ # """Read JSON file using parallel processing"""
13
+ # if num_workers is None:
14
+ # num_workers = max(1, mp.cpu_count() - 1)
15
+
16
+ # print(f"Reading {file_path}...")
17
+ # # Read chunks and concatenate them into a single DataFrame
18
+ # df = pd.read_json(file_path, lines=True, dtype_backend="pyarrow", chunksize=100000)
19
+ # return next(df)
20
+
21
+
22
+ def read_data_mongo(file_path, num_workers=None):
23
+ """Read JSON file using parallel processing"""
24
+ if num_workers is None:
25
+ num_workers = max(1, mp.cpu_count() - 1)
26
+
27
+ print(f"Reading {file_path}...")
28
+ conn_str = "mongodb://Mtalha:[email protected]/"
29
+
30
+ client = MongoClient(conn_str)
31
+ databases = client.list_database_names()
32
+ db_client=client["Yelp"]
33
+
34
+ # Read the entire file at once since chunksize isn't needed for parallel reading here
35
+ # Use 'records' orient if your JSON was saved with this format
36
+ try:
37
+
38
+ collection = db_client[file_path]
39
+ documents = collection.find({}, {"_id": 0})
40
+ data = list(documents)
41
+ final_dict=defaultdict(list)
42
+
43
+ for dictt in data:
44
+ for k,v in dictt.items():
45
+ final_dict[k].append(v)
46
+ df=pd.DataFrame(final_dict)
47
+
48
+ # df = pd.read_json(file_path, orient='records', dtype_backend="pyarrow")
49
+ except Exception as e:
50
+ # If 'records' doesn't work, try without specifying orient or with 'split'
51
+ # This is a fallback for different JSON structures
52
+ # df = pd.read_json(file_path, dtype_backend="pyarrow")
53
+ print("ERROR WHILE READING FILES FORM MONGODB AS : ",e)
54
+ print(f"Finished reading. DataFrame shape: {df.shape}")
55
+ return df
56
+
57
+ def process_datasets(output_path,filename):
58
+ # File paths
59
+ file_paths = {
60
+ 'business': "yelp_academic_dataset_business",
61
+ 'checkin': "yelp_academic_dataset_checkin",
62
+ 'review': "yelp_academic_dataset_review",
63
+ 'tip': "yelp_academic_dataset_tip",
64
+ 'user': "yelp_academic_dataset_user",
65
+ 'google': "google_review_dataset"
66
+ }
67
+
68
+ # Read datasets with progress tracking
69
+ print("Reading datasets...")
70
+ dfs = {}
71
+ for name, path in file_paths.items():
72
+ print(f"Processing {name} dataset...")
73
+ dfs[name] = read_data_mongo(path)
74
+ print(f"Finished reading {name} dataset. Shape: {dfs[name].shape}")
75
+
76
+ print("All files read. Starting column renaming...")
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+ # Rename columns to avoid conflicts
87
+ # Reviews
88
+ dfs['review'] = dfs['review'].rename(columns={
89
+ 'date': 'review_date',
90
+ 'stars': 'review_stars',
91
+ 'text': 'review_text',
92
+ 'useful': 'review_useful',
93
+ 'funny': 'review_funny',
94
+ 'cool': 'review_cool'
95
+ })
96
+ # print("COLUMNS IN REVIEW DAFRA)
97
+
98
+ # Tips
99
+ dfs['tip'] = dfs['tip'].rename(columns={
100
+ 'date': 'tip_date',
101
+ 'text': 'tip_text',
102
+ 'compliment_count': 'tip_compliment_count'
103
+ })
104
+
105
+ # Checkins
106
+ dfs['checkin'] = dfs['checkin'].rename(columns={
107
+ 'date': 'checkin_date'
108
+ })
109
+
110
+ # Users
111
+ dfs['user'] = dfs['user'].rename(columns={
112
+ 'name': 'user_name',
113
+ 'review_count': 'user_review_count',
114
+ 'useful': 'user_useful',
115
+ 'funny': 'user_funny',
116
+ 'cool': 'user_cool'
117
+ })
118
+
119
+ # Business
120
+ dfs['business'] = dfs['business'].rename(columns={
121
+ 'name': 'business_name',
122
+ 'stars': 'business_stars',
123
+ 'review_count': 'business_review_count'
124
+ })
125
+ dfs['google'] = dfs['google'].rename(columns={
126
+ 'name': 'business_name',
127
+ 'stars': 'business_stars',
128
+ 'review_count': 'business_review_count'
129
+ })
130
+ df_business_final= dfs['business']
131
+ df_google_final=dfs['google']
132
+ df_review_final=dfs['review']
133
+ df_tip_final=dfs['tip']
134
+ df_checkin_final=dfs['checkin']
135
+ df_user_final=dfs['user']
136
+
137
+
138
+ df_business_final=pd.concat([df_business_final,df_google_final],axis=0)
139
+ df_business_final.reset_index(drop=True,inplace=True)
140
+
141
+
142
+
143
+
144
+ print("Starting merge process...")
145
+
146
+ # Merge process with memory management
147
+ print("Step 1: Starting with reviews...")
148
+ merged_df = df_review_final
149
+
150
+
151
+ print("Step 2: Merging with business data...")
152
+ merged_df = merged_df.merge(
153
+ df_business_final,
154
+ on='business_id',
155
+ how='left'
156
+ )
157
+
158
+
159
+ print("Step 3: Merging with user data...")
160
+ merged_df = merged_df.merge(
161
+ df_user_final,
162
+ on='user_id',
163
+ how='left'
164
+ )
165
+
166
+
167
+ print("Step 4: Merging with checkin data...")
168
+ merged_df = merged_df.merge(
169
+ df_checkin_final,
170
+ on='business_id',
171
+ how='left'
172
+ )
173
+
174
+
175
+ print("Step 5: Aggregating and merging tip data...")
176
+ tip_agg = df_tip_final.groupby('business_id').agg({
177
+ 'tip_compliment_count': 'sum',
178
+ 'tip_text': 'count'
179
+ }).rename(columns={'tip_text': 'tip_count'})
180
+
181
+ merged_df = merged_df.merge(
182
+ tip_agg,
183
+ on='business_id',
184
+ how='left'
185
+ )
186
+
187
+
188
+
189
+ print("Filling NaN values...")
190
+ merged_df['tip_count'] = merged_df['tip_count'].fillna(0)
191
+ merged_df['tip_compliment_count'] = merged_df['tip_compliment_count'].fillna(0)
192
+ merged_df['checkin_date'] = merged_df['checkin_date'].fillna('')
193
+ merged_df["friends"].fillna(0,inplace=True)
194
+
195
+ for col in merged_df.columns:
196
+ if merged_df[col].isnull().sum()>0:
197
+ print(f" {col} has {merged_df[col].isnull().sum()} null values")
198
+
199
+
200
+ print("Shape of Merged Dataset is : ",merged_df.shape)
201
+ output_file = Path(output_path) / filename
202
+ print("COLUMNS BEFORE PREPROCESING")
203
+ print()
204
+ print(merged_df.info())
205
+ for col in merged_df.columns:
206
+ for v in merged_df[col]:
207
+ print(f"Type of values in {col} is {type(v)} and values are like : {v}")
208
+ break
209
+ merged_df.to_csv(output_file,index=False)
210
+
211
+
212
+
213
+
214
+ return merged_df
215
+
216
+ # if __name__ == "__main__":
217
+ # process_datasets()
src/feature_analyzer.py ADDED
@@ -0,0 +1,212 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ from pathlib import Path
6
+ from loguru import logger
7
+
8
+ class FeatureAnalyzer:
9
+ def __init__(self,df,output_path):
10
+ self.df=df
11
+ self.output_path=output_path
12
+
13
+
14
+ def plot_correlation_heatmap(self):
15
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
16
+ numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake')
17
+ correlation_matrix = self.df[numeric_cols].corr()
18
+ plt.figure(figsize=(14, 12))
19
+ sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
20
+ plt.title('Correlation Heatmap of Numeric Features', fontsize=16)
21
+ plt.tight_layout()
22
+ output_file = Path(self.output_path) / 'correlation_heatmap.png'
23
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
24
+ plt.close()
25
+ logger.info(f"Saved correlation heatmap to {output_file}")
26
+
27
+ def plot_mean_by_fake_bar(self):
28
+ key_features = [
29
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
30
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
31
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
32
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
33
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
34
+ ]
35
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
36
+ mean_by_fake = self.df.groupby('fake')[key_features].mean().T
37
+ mean_by_fake.columns = ['Genuine (0)', 'Fake (1)']
38
+ plt.figure(figsize=(12, 8))
39
+ mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
40
+ plt.title('Mean Feature Values by Fake Label', fontsize=16)
41
+ plt.xlabel('Features', fontsize=12)
42
+ plt.ylabel('Mean Value', fontsize=12)
43
+ plt.xticks(rotation=45, ha='right')
44
+ plt.legend(title='Fake Label')
45
+ plt.tight_layout()
46
+ output_file = Path(self.output_path) / 'mean_by_fake_bar.png'
47
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
48
+ plt.close()
49
+ logger.info(f"Saved mean by fake bar plot to {output_file}")
50
+
51
+ def plot_violin_plots(self):
52
+ key_features = [
53
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
54
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
55
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
56
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
57
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
58
+ ]
59
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
60
+ plt.figure(figsize=(14, 10))
61
+ for i, feature in enumerate(key_features[:6], 1):
62
+ plt.subplot(2, 3, i)
63
+ sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
64
+ plt.title(f'{feature} Distribution', fontsize=12)
65
+ plt.xlabel('Fake (0/1)', fontsize=10)
66
+ plt.tight_layout()
67
+ output_file = Path(self.output_path) / 'violin_plots.png'
68
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
69
+ plt.close()
70
+ logger.info(f"Saved violin plots to {output_file}")
71
+
72
+ def plot_box_plots(self):
73
+ key_features = [
74
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
75
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
76
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
77
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
78
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
79
+ ]
80
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
81
+ plt.figure(figsize=(14, 10))
82
+ for i, feature in enumerate(key_features[6:11], 1):
83
+ plt.subplot(2, 3, i)
84
+ sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
85
+ plt.title(f'{feature} Distribution', fontsize=12)
86
+ plt.xlabel('Fake (0/1)', fontsize=10)
87
+ plt.tight_layout()
88
+ output_file = Path(self.output_path) / 'box_plots.png'
89
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
90
+ plt.close()
91
+ logger.info(f"Saved box plots to {output_file}")
92
+
93
+ def plot_scatter_review_grammar(self):
94
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
95
+ plt.figure(figsize=(10, 6))
96
+ sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5)
97
+ plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
98
+ plt.xlabel('Review Stars', fontsize=12)
99
+ plt.ylabel('Grammar Error Score', fontsize=12)
100
+ plt.legend(title='Fake')
101
+ plt.tight_layout()
102
+ output_file = Path(self.output_path) / 'scatter_review_grammar.png'
103
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
104
+ plt.close()
105
+ logger.info(f"Saved scatter plot to {output_file}")
106
+
107
+ def plot_density_plots(self):
108
+ key_features = [
109
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
110
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
111
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
112
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
113
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
114
+ ]
115
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
116
+ plt.figure(figsize=(14, 10))
117
+ for i, feature in enumerate(key_features[:4], 1):
118
+ plt.subplot(2, 2, i)
119
+ for label in [0, 1]:
120
+ subset = self.df[self.df['fake'] == label]
121
+ sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5)
122
+ plt.title(f'{feature} Density', fontsize=12)
123
+ plt.xlabel(feature, fontsize=10)
124
+ plt.legend()
125
+ plt.tight_layout()
126
+ output_file = Path(self.output_path) / 'density_plots.png'
127
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
128
+ plt.close()
129
+ logger.info(f"Saved density plots to {output_file}")
130
+
131
+ def plot_stacked_bar_similarity(self):
132
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
133
+ bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
134
+ stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0)
135
+ stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0)
136
+ plt.figure(figsize=(12, 8))
137
+ stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8)
138
+ plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
139
+ plt.xlabel('Similarity Bins', fontsize=12)
140
+ plt.ylabel('Proportion', fontsize=12)
141
+ plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label')
142
+ plt.xticks(rotation=45, ha='right')
143
+ plt.tight_layout()
144
+ output_file = Path(self.output_path) / 'stacked_bar_similarity.png'
145
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
146
+ plt.close()
147
+ logger.info(f"Saved stacked bar plot to {output_file}")
148
+
149
+ def plot_pie_fake_distribution(self):
150
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
151
+ fake_counts = self.df['fake'].value_counts()
152
+ plt.figure(figsize=(8, 8))
153
+ plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90)
154
+ plt.title('Distribution of Fake Labels', fontsize=16)
155
+ plt.axis('equal')
156
+ output_file = Path(self.output_path) / 'pie_fake_distribution.png'
157
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
158
+ plt.close()
159
+ logger.info(f"Saved pie chart to {output_file}")
160
+
161
+ def plot_count_code_switching(self):
162
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
163
+ plt.figure(figsize=(8, 6))
164
+ sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon'])
165
+ plt.title('Count of Fake by Code Switching Flag', fontsize=16)
166
+ plt.xlabel('Code Switching Flag (0/1)', fontsize=12)
167
+ plt.ylabel('Count', fontsize=12)
168
+ plt.legend(title='Fake Label')
169
+ plt.tight_layout()
170
+ output_file = Path(self.output_path) / 'count_code_switching.png'
171
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
172
+ plt.close()
173
+ logger.info(f"Saved count plot to {output_file}")
174
+
175
+ def plot_variance_by_fake_bar(self):
176
+ key_features = [
177
+ 'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
178
+ 'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
179
+ 'time_since_last_review_user', 'user_account_age', 'pronoun_density',
180
+ 'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
181
+ 'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
182
+ ]
183
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
184
+ variance_by_fake = self.df.groupby('fake')[key_features].var().T
185
+ variance_by_fake.columns = ['Genuine (0)', 'Fake (1)']
186
+ plt.figure(figsize=(12, 8))
187
+ variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
188
+ plt.title('Feature Variance by Fake Label', fontsize=16)
189
+ plt.xlabel('Features', fontsize=12)
190
+ plt.ylabel('Variance', fontsize=12)
191
+ plt.xticks(rotation=45, ha='right')
192
+ plt.legend(title='Fake Label')
193
+ plt.tight_layout()
194
+ output_file = Path(self.output_path) / 'variance_by_fake_bar.png'
195
+ plt.savefig(output_file, dpi=300, bbox_inches='tight')
196
+ plt.close()
197
+ logger.info(f"Saved variance bar plot to {output_file}")
198
+
199
+ def run_pipeline(self):
200
+
201
+ sns.set(style="whitegrid")
202
+ plt.rcParams['figure.figsize'] = (12, 8)
203
+ self.plot_correlation_heatmap()
204
+ self.plot_mean_by_fake_bar()
205
+ self.plot_violin_plots()
206
+ self.plot_box_plots()
207
+ self.plot_scatter_review_grammar()
208
+ self.plot_density_plots()
209
+ self.plot_stacked_bar_similarity()
210
+ self.plot_pie_fake_distribution()
211
+ self.plot_count_code_switching()
212
+ self.plot_variance_by_fake_bar()
src/model.py ADDED
@@ -0,0 +1,540 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch_geometric.data import HeteroData
5
+ import numpy as np
6
+ import pandas as pd
7
+ import networkx as nx
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
11
+ from sklearn.model_selection import train_test_split
12
+ from pathlib import Path
13
+ from datetime import datetime
14
+ from loguru import logger
15
+
16
+ # Temporal Edge Features Function
17
+ def create_temporal_edge_features(time_since_src, time_since_tgt, user_i, user_j):
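+ # Sinusoidal encodings of the time gap between the two endpoints at hour/day/week scales, plus a same-user "burst" term that decays over a day.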
18
+ delta_t = torch.abs(time_since_src - time_since_tgt).float()
19
+ hour_scale = torch.sin(delta_t / 3600)
20
+ day_scale = torch.sin(delta_t / (24 * 3600))
21
+ week_scale = torch.sin(delta_t / (7 * 24 * 3600))
22
+ same_user = (user_i == user_j).float()
23
+ burst_feature = same_user * torch.exp(-delta_t / (24 * 3600))
24
+ return torch.stack([hour_scale, day_scale, week_scale, burst_feature], dim=-1)
25
+
26
+ # Custom Multihead Attention (unchanged)
27
+ class CustomMultiheadAttention(nn.Module):
28
+ def __init__(self, embed_dim, num_heads):
29
+ super().__init__()
30
+ self.embed_dim = embed_dim
31
+ self.num_heads = num_heads
32
+ self.head_dim = embed_dim // num_heads
33
+
34
+ assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
35
+
36
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
37
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
38
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
39
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
40
+
41
+ self.scale = self.head_dim ** -0.5
42
+
43
+ def forward(self, query, key, value, attn_bias=None):
44
+ batch_size, seq_len, embed_dim = query.size()
45
+ q = self.q_proj(query)
46
+ k = self.k_proj(key)
47
+ v = self.v_proj(value)
48
+ q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
49
+ k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
50
+ v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
51
+ scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
52
+ if attn_bias is not None:
53
+ scores = scores + attn_bias.unsqueeze(1)
54
+ attn = F.softmax(scores, dim=-1)
55
+ out = torch.matmul(attn, v)
56
+ out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
57
+ out = self.out_proj(out)
58
+ return out, attn
59
+
60
+ # HeteroGraphormer (unchanged)
61
+ class HeteroGraphormer(nn.Module):
62
+ def __init__(self, hidden_dim, output_dim, num_heads=4, edge_dim=4):
63
+ super().__init__()
64
+ self.hidden_dim = hidden_dim
65
+
66
+ self.embed_dict = nn.ModuleDict({
67
+ 'user': nn.Linear(14, hidden_dim),
68
+ 'business': nn.Linear(8, hidden_dim),
69
+ 'review': nn.Linear(16, hidden_dim)
70
+ })
71
+
72
+ self.edge_proj = nn.Linear(edge_dim, hidden_dim)
73
+
74
+ self.gru_user = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
75
+ self.gru_business = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
76
+ self.gru_review = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
77
+
78
+ self.attention1 = CustomMultiheadAttention(hidden_dim, num_heads)
79
+ self.attention2 = CustomMultiheadAttention(hidden_dim, num_heads)
80
+
81
+ self.ffn1 = nn.Sequential(
82
+ nn.Linear(hidden_dim, hidden_dim * 4),
83
+ nn.ReLU(),
84
+ nn.Dropout(0.1),
85
+ nn.Linear(hidden_dim * 4, hidden_dim)
86
+ )
87
+ self.ffn2 = nn.Sequential(
88
+ nn.Linear(hidden_dim, hidden_dim * 4),
89
+ nn.ReLU(),
90
+ nn.Dropout(0.1),
91
+ nn.Linear(hidden_dim * 4, hidden_dim)
92
+ )
93
+
94
+ self.norm1 = nn.LayerNorm(hidden_dim)
95
+ self.norm2 = nn.LayerNorm(hidden_dim)
96
+ self.norm3 = nn.LayerNorm(hidden_dim)
97
+ self.norm4 = nn.LayerNorm(hidden_dim)
98
+
99
+ self.centrality_proj = nn.Linear(1, hidden_dim)
100
+
101
+ self.classifier = nn.Sequential(
102
+ nn.Linear(hidden_dim * 3, hidden_dim),
103
+ nn.ReLU(),
104
+ nn.Dropout(0.1),
105
+ nn.Linear(hidden_dim, 1)
106
+ )
107
+
108
+ self.dropout = nn.Dropout(0.1)
109
+
110
+ def time_aware_aggregation(self, x, time_since, decay_rate=0.1):
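+ # Exponentially down-weight node embeddings by the time elapsed since the node's last activity; decay_rate controls how quickly old activity fades.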
111
+ weights = torch.exp(-decay_rate * time_since.unsqueeze(-1))
112
+ return x * weights
113
+
114
+ def forward(self, data, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict):
115
+ x_dict = {}
116
+ for node_type in data.x_dict:
117
+ x = self.embed_dict[node_type](data[node_type].x)
118
+ if node_type in time_since_dict:
119
+ x = self.time_aware_aggregation(x, time_since_dict[node_type])
120
+ x_dict[node_type] = x
121
+
122
+ x = torch.cat([x_dict['user'], x_dict['business'], x_dict['review']], dim=0)
123
+
124
+ centrality = self.centrality_proj(centrality_encoding)
125
+ x = x + centrality
126
+
127
+ x = x.unsqueeze(0)
128
+
129
+ x_user = x[:, :data['user'].x.size(0), :]
130
+ x_business = x[:, data['user'].x.size(0):data['user'].x.size(0) + data['business'].x.size(0), :]
131
+ x_review = x[:, data['user'].x.size(0) + data['business'].x.size(0):, :]
132
+
133
+ x_user, _ = self.gru_user(x_user)
134
+ x_business, _ = self.gru_business(x_business)
135
+ x_review, _ = self.gru_review(x_review)
136
+
137
+ x = torch.cat([x_user, x_business, x_review], dim=1)
138
+
139
+ total_nodes = x.size(1)
140
+ attn_bias = torch.zeros(1, total_nodes, total_nodes, device=x.device)
141
+ attn_bias[0] = -spatial_encoding
142
+
143
+ for edge_type in edge_features_dict:
144
+ edge_index = data[edge_type].edge_index
145
+ edge_feats = self.edge_proj(edge_features_dict[edge_type])
146
+ for i, (src, tgt) in enumerate(edge_index.t()):
147
+ attn_bias[0, src, tgt] += edge_feats[i].sum()
148
+
149
+ residual = x
150
+ x, _ = self.attention1(x, x, x, attn_bias=attn_bias)
151
+ x = self.norm1(x + residual)
152
+ x = self.dropout(x)
153
+
154
+ residual = x
155
+ x = self.ffn1(x)
156
+ x = self.norm2(x + residual)
157
+ x = self.dropout(x)
158
+
159
+ residual = x
160
+ x, _ = self.attention2(x, x, x, attn_bias=attn_bias)
161
+ x = self.norm3(x + residual)
162
+ x = self.dropout(x)
163
+
164
+ residual = x
165
+ x = self.ffn2(x)
166
+ x = self.norm4(x + residual)
167
+ x = self.dropout(x)
168
+
169
+ x = x.squeeze(0)
170
+
171
+ user_start = 0
172
+ business_start = data['user'].x.size(0)
173
+ review_start = business_start + data['business'].x.size(0)
174
+
175
+ h_user = x[user_start:business_start]
176
+ h_business = x[business_start:review_start]
177
+ h_review = x[review_start:]
178
+
179
+ user_indices = data['user', 'writes', 'review'].edge_index[0]
180
+ business_indices = data['review', 'about', 'business'].edge_index[1]
181
+ review_indices = data['user', 'writes', 'review'].edge_index[1]
182
+
183
+ h_user_mapped = h_user[user_indices]
184
+ h_business_mapped = h_business[business_indices]
185
+ h_review_mapped = h_review[review_indices]
186
+
187
+ combined = torch.cat([h_review_mapped, h_user_mapped, h_business_mapped], dim=-1)
188
+
189
+ logits = self.classifier(combined)
190
+ return torch.sigmoid(logits)
191
+
192
+ # GraphformerModel: end-to-end training/evaluation wrapper around HeteroGraphormer, with metric plotting
193
+ class GraphformerModel:
194
+ def __init__(self, df, output_path, epochs, test_size=0.3):
195
+ self.df_whole = df
196
+ self.output_path = output_path
197
+ self.output_path = Path(self.output_path) / "GraphformerModel"
198
+ self.epochs = epochs
199
+ self.df, self.test_df = train_test_split(self.df_whole, test_size=test_size, random_state=42)
200
+
201
+ torch.manual_seed(42)
202
+ np.random.seed(42)
203
+
204
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
205
+
206
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
207
+ self.model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(self.device)
208
+ self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005)
209
+ self.criterion = nn.BCELoss()
210
+
211
+ def compute_graph_encodings(self, data):
212
+ G = nx.DiGraph()
213
+ node_offset = 0
214
+ node_type_map = {}
215
+
216
+ for node_type in ['user', 'business', 'review']:
217
+ num_nodes = data[node_type].x.size(0)
218
+ for i in range(num_nodes):
219
+ G.add_node(node_offset + i)
220
+ node_type_map[node_offset + i] = node_type
221
+ node_offset += num_nodes
222
+
223
+ edge_types = [('user', 'writes', 'review'), ('review', 'about', 'business')]
224
+ for src_type, rel, tgt_type in edge_types:
225
+ edge_index = data[src_type, rel, tgt_type].edge_index
226
+ src_nodes = edge_index[0].tolist()
227
+ tgt_nodes = edge_index[1].tolist()
228
+ src_offset = 0 if src_type == 'user' else (self.num_users if src_type == 'business' else self.num_users + self.num_businesses)
229
+ tgt_offset = 0 if tgt_type == 'user' else (self.num_users if tgt_type == 'business' else self.num_users + self.num_businesses)
230
+ for src, tgt in zip(src_nodes, tgt_nodes):
231
+ G.add_edge(src + src_offset, tgt + tgt_offset)
232
+
233
+ num_nodes = G.number_of_nodes()
234
+ spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=self.device)
235
+ for i in range(num_nodes):
236
+ for j in range(num_nodes):
237
+ if i == j:
238
+ spatial_encoding[i, j] = 0
239
+ elif nx.has_path(G, i, j):
240
+ spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)
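+ # Note: this issues one has_path/shortest_path_length query per ordered node pair (O(N^2) queries),
+ # which is fine for small graphs but becomes the main bottleneck as the node count grows.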
241
+
242
+ centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=self.device).view(-1, 1)
243
+
244
+ return spatial_encoding, centrality_encoding, node_type_map
245
+
246
+ def compute_metrics(self, y_true, y_pred, y_prob, prefix=""):
247
+ metrics = {}
248
+ metrics[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
249
+ metrics[f"{prefix}precision"] = precision_score(y_true, y_pred, zero_division=0)
250
+ metrics[f"{prefix}recall"] = recall_score(y_true, y_pred, zero_division=0)
251
+ metrics[f"{prefix}f1"] = f1_score(y_true, y_pred, zero_division=0)
252
+ metrics[f"{prefix}auc_roc"] = roc_auc_score(y_true, y_prob)
253
+ metrics[f"{prefix}conf_matrix"] = confusion_matrix(y_true, y_pred)
254
+ metrics[f"{prefix}class_report"] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
255
+ return metrics
256
+
257
+ def run_model(self):
258
+ features = torch.tensor(self.df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
259
+ y = torch.tensor(self.df['fake'].values, dtype=torch.float, device=self.device)
260
+ time_since_user = torch.tensor(self.df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
261
+ time_since_business = torch.tensor(self.df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
262
+ num_rows = len(self.df)
263
+
264
+ graph = HeteroData()
265
+
266
+ self.num_users = len(self.df['user_id'].unique())
267
+ self.num_businesses = len(self.df['business_id'].unique())
268
+
269
+ user_indices = torch.tensor(self.df['user_id'].map({uid: i for i, uid in enumerate(self.df['user_id'].unique())}).values, dtype=torch.long, device=self.device)
270
+ business_indices = torch.tensor(self.df['business_id'].map({bid: i for i, bid in enumerate(self.df['business_id'].unique())}).values, dtype=torch.long, device=self.device)
271
+ review_indices = torch.arange(num_rows, dtype=torch.long, device=self.device)
272
+
273
+ user_feats = torch.zeros(self.num_users, 14, device=self.device)
274
+ business_feats = torch.zeros(self.num_businesses, 8, device=self.device)
275
+ review_feats = torch.zeros(num_rows, 16, device=self.device)
276
+
277
+ user_cols = ['hours', 'user_review_count', 'elite', 'friends', 'fans', 'average_stars',
278
+ 'time_since_last_review_user', 'user_account_age', 'user_degree',
279
+ 'user_review_burst_count', 'review_like_ratio', 'latest_checkin_hours',
280
+ 'user_useful_funny_cool', 'rating_variance_user']
281
+ business_cols = ['latitude', 'longitude', 'business_stars', 'business_review_count',
282
+ 'time_since_last_review_business', 'business_degree',
283
+ 'business_review_burst_count', 'rating_deviation_from_business_average']
284
+ review_cols = ['review_stars', 'tip_compliment_count', 'tip_count', 'average_time_between_reviews',
285
+ 'temporal_similarity', 'pronoun_density', 'avg_sentence_length',
286
+ 'excessive_punctuation_count', 'sentiment_polarity', 'good_severity',
287
+ 'bad_severity', 'code_switching_flag', 'grammar_error_score',
288
+ 'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool']
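+ # The slices used below (features[:, :14] user, [:, 14:22] business, [:, 22:38] review) assume the
+ # dataframe columns, after dropping the id and label columns, follow exactly this
+ # user_cols + business_cols + review_cols ordering.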
289
+
290
+ for i in range(len(self.df)):
291
+ user_idx = user_indices[i]
292
+ business_idx = business_indices[i]
293
+ user_feats[user_idx] += features[i, :14]
294
+ business_feats[business_idx] += features[i, 14:22]
295
+ review_feats = features[:, 22:38]
296
+
297
+ graph['user'].x = user_feats
298
+ graph['business'].x = business_feats
299
+ graph['review'].x = review_feats
300
+ graph['review'].y = y
301
+
302
+ graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
303
+ graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
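+ # Each dataframe row (one review) contributes two edges: author user -> review i and review i -> reviewed business.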
304
+
305
+ edge_features_dict = {}
306
+ user_writes_edge = graph['user', 'writes', 'review'].edge_index
307
+ review_about_edge = graph['review', 'about', 'business'].edge_index
308
+
309
+ src_users = user_indices[user_writes_edge[0]]
310
+ tgt_reviews = review_indices[user_writes_edge[1]]
311
+ edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
312
+ time_since_user[src_users], time_since_user[tgt_reviews], src_users, src_users
313
+ )
314
+
315
+ src_reviews = review_indices[review_about_edge[0]]
316
+ tgt_businesses = business_indices[review_about_edge[1]]
317
+ edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
318
+ time_since_business[src_reviews], time_since_business[tgt_businesses],
319
+ torch.zeros_like(src_reviews), torch.zeros_like(src_reviews)
320
+ )
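+ # Edge attributes are derived from the time-since-last-review signals at both endpoints; the zero
+ # tensors stand in for the user-specific inputs that do not apply to review -> business edges.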
321
+
322
+ user_time_since = self.df.groupby('user_id')['time_since_last_review_user'].min().reindex(
323
+ self.df['user_id'].unique(), fill_value=0).values
324
+ time_since_dict = {
325
+ 'user': torch.tensor(user_time_since, dtype=torch.float, device=self.device)
326
+ }
327
+
328
+ spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
329
+
330
+ # Training with metrics history
331
+ self.model.train()
332
+ train_metrics_history = []
333
+ for epoch in range(self.epochs):
334
+ self.optimizer.zero_grad()
335
+ out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
336
+ loss = self.criterion(out.squeeze(), y)
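+ # BCELoss expects probabilities; the model already applies a sigmoid in forward(), so the raw
+ # output is used directly (with raw logits, BCEWithLogitsLoss would be the numerically safer pairing).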
337
+ loss.backward()
338
+ self.optimizer.step()
339
+
340
+ pred_labels = (out.squeeze() > 0.5).float()
341
+
342
+ probs = out.squeeze().detach().cpu().numpy()
343
+ train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels.cpu().numpy(), probs, prefix="train_")
344
+ train_metrics['loss'] = loss.item()
345
+ train_metrics_history.append(train_metrics)
346
+
347
+ if epoch % 10 == 0:
348
+ logger.info(f"Epoch {epoch}, Loss: {loss.item():.4f}, Accuracy: {train_metrics['train_accuracy']:.4f}, F1: {train_metrics['train_f1']:.4f}")
349
+
350
+ # Save model
351
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
352
+ model_save_path = Path(self.output_path) / "model_GraphformerModel_latest.pth"
353
+ torch.save(self.model.state_dict(), model_save_path)
354
+
355
+ # Testing
356
+ if self.test_df is not None:
357
+ test_features = torch.tensor(self.test_df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
358
+ test_y = torch.tensor(self.test_df['fake'].values, dtype=torch.float, device=self.device)
359
+ test_time_since_user = torch.tensor(self.test_df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
360
+ test_time_since_business = torch.tensor(self.test_df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
361
+ num_test_rows = len(self.test_df)
362
+
363
+ new_user_unique = self.test_df['user_id'].unique()
364
+ new_business_unique = self.test_df['business_id'].unique()
365
+
366
+ existing_user_ids = list(self.df['user_id'].unique())
367
+ user_mapping = {uid: i for i, uid in enumerate(existing_user_ids)}
368
+ total_users = self.num_users
369
+ for uid in new_user_unique:
370
+ if uid not in user_mapping:
371
+ user_mapping[uid] = total_users
372
+ total_users += 1
373
+
374
+ existing_business_ids = list(self.df['business_id'].unique())
375
+ business_mapping = {bid: i for i, bid in enumerate(existing_business_ids)}
376
+ total_businesses = self.num_businesses
377
+ for bid in new_business_unique:
378
+ if bid not in business_mapping:
379
+ business_mapping[bid] = total_businesses
380
+ total_businesses += 1
381
+
382
+ new_user_indices = torch.tensor([user_mapping[uid] for uid in self.test_df['user_id']], dtype=torch.long, device=self.device)
383
+ new_business_indices = torch.tensor([business_mapping[bid] for bid in self.test_df['business_id']], dtype=torch.long, device=self.device)
384
+ new_review_indices = torch.arange(num_rows, num_rows + num_test_rows, device=self.device)
385
+
386
+ if total_users > self.num_users:
387
+ additional_user_feats = torch.zeros(total_users - self.num_users, 14, device=self.device)
388
+ graph['user'].x = torch.cat([graph['user'].x, additional_user_feats], dim=0)
389
+ if total_businesses > self.num_businesses:
390
+ additional_business_feats = torch.zeros(total_businesses - self.num_businesses, 8, device=self.device)
391
+ graph['business'].x = torch.cat([graph['business'].x, additional_business_feats], dim=0)
392
+
393
+ for i in range(num_test_rows):
394
+ user_idx = new_user_indices[i]
395
+ business_idx = new_business_indices[i]
396
+ if user_idx < graph['user'].x.size(0):
397
+ graph['user'].x[user_idx] += test_features[i, :14]
398
+ if business_idx < graph['business'].x.size(0):
399
+ graph['business'].x[business_idx] += test_features[i, 14:22]
400
+ graph['review'].x = torch.cat([graph['review'].x, test_features[:, 22:38]], dim=0)
401
+ graph['review'].y = torch.cat([graph['review'].y, test_y], dim=0)
402
+
403
+ graph['user', 'writes', 'review'].edge_index = torch.cat([
404
+ graph['user', 'writes', 'review'].edge_index,
405
+ torch.stack([new_user_indices, new_review_indices], dim=0)], dim=1)
406
+ graph['review', 'about', 'business'].edge_index = torch.cat([
407
+ graph['review', 'about', 'business'].edge_index,
408
+ torch.stack([new_review_indices, new_business_indices], dim=0)], dim=1)
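+ # Evaluation is transductive: test users, businesses, reviews and their edges are appended to the
+ # training graph, the encodings are recomputed, and the model is re-run on the combined graph; test
+ # predictions are then read off the last num_test_rows review nodes.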
409
+
410
+ all_time_since_user = torch.cat([time_since_user, test_time_since_user])
411
+ all_time_since_business = torch.cat([time_since_business, test_time_since_business])
412
+ all_user_indices = torch.cat([user_indices, new_user_indices])
413
+ all_business_indices = torch.cat([business_indices, new_business_indices])
414
+ all_review_indices = torch.cat([review_indices, new_review_indices])
415
+
416
+ user_writes_edge = graph['user', 'writes', 'review'].edge_index
417
+ review_about_edge = graph['review', 'about', 'business'].edge_index
418
+
419
+ edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
420
+ all_time_since_user[user_writes_edge[0]], all_time_since_user[user_writes_edge[1]],
421
+ all_user_indices[user_writes_edge[0]], all_user_indices[user_writes_edge[0]]
422
+ )
423
+ edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
424
+ all_time_since_business[review_about_edge[0]], all_time_since_business[review_about_edge[1]],
425
+ torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
426
+ )
427
+
428
+ self.num_users = total_users
429
+ self.num_businesses = total_businesses
430
+
431
+ test_user_time_since = self.test_df.groupby('user_id')['time_since_last_review_user'].min().reindex(
432
+ pd.Index(list(self.df['user_id'].unique()) + list(self.test_df['user_id'].unique())), fill_value=0).values
433
+ time_since_dict['user'] = torch.tensor(test_user_time_since[:total_users], dtype=torch.float, device=self.device)
434
+
435
+ spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
436
+
437
+ self.model.eval()
438
+ with torch.no_grad():
439
+ out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
440
+ pred_labels = (out.squeeze() > 0.5).float()
441
+ probs = out.squeeze().detach().cpu().numpy()
442
+ test_metrics = self.compute_metrics(graph['review'].y[-num_test_rows:].cpu().numpy(), pred_labels[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:], prefix="test_")
443
+ train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels[:num_rows].cpu().numpy(), probs[:num_rows], prefix="train_")
444
+ logger.info(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1: {test_metrics['test_f1']:.4f}, AUC-ROC: {test_metrics['test_auc_roc']:.4f}")
445
+
446
+ # Save metrics to file
447
+ metrics_file = Path(self.output_path) / f"metrics_{timestamp}.txt"
448
+ with open(metrics_file, 'w') as f:
449
+ f.write("Training Metrics (Final Epoch):\n")
450
+ for k, v in train_metrics.items():
451
+ f.write(f"{k}: {v}\n")
452
+ f.write("\nTest Metrics:\n")
453
+ for k, v in test_metrics.items():
454
+ f.write(f"{k}: {v}\n")
455
+
456
+ # Plotting and saving to output_path
457
+ plt.figure(figsize=(12, 8))
458
+ plt.plot([m['loss'] for m in train_metrics_history], label='Training Loss')
459
+ plt.xlabel('Epoch')
460
+ plt.ylabel('Loss')
461
+ plt.title('Training Loss Curve')
462
+ plt.legend()
463
+ plt.grid(True)
464
+ plt.savefig(Path(self.output_path) / f"loss_curve_{timestamp}.png")
465
+ plt.close()
466
+
467
+ plt.figure(figsize=(12, 8))
468
+ plt.plot([m['train_accuracy'] for m in train_metrics_history], label='Training Accuracy')
469
+ plt.xlabel('Epoch')
470
+ plt.ylabel('Accuracy')
471
+ plt.title('Training Accuracy Curve')
472
+ plt.legend()
473
+ plt.grid(True)
474
+ plt.savefig(Path(self.output_path) / f"accuracy_curve_{timestamp}.png")
475
+ plt.close()
476
+
477
+ plt.figure(figsize=(12, 8))
478
+ plt.plot([m['train_precision'] for m in train_metrics_history], label='Training Precision')
479
+ plt.plot([m['train_recall'] for m in train_metrics_history], label='Training Recall')
480
+ plt.plot([m['train_f1'] for m in train_metrics_history], label='Training F1-Score')
481
+ plt.xlabel('Epoch')
482
+ plt.ylabel('Score')
483
+ plt.title('Training Precision, Recall, and F1-Score Curves')
484
+ plt.legend()
485
+ plt.grid(True)
486
+ plt.savefig(Path(self.output_path) / f"prf1_curves_{timestamp}.png")
487
+ plt.close()
488
+
489
+ plt.figure(figsize=(12, 8))
490
+ plt.plot([m['train_auc_roc'] for m in train_metrics_history], label='Training AUC-ROC')
491
+ plt.xlabel('Epoch')
492
+ plt.ylabel('AUC-ROC')
493
+ plt.title('Training AUC-ROC Curve')
494
+ plt.legend()
495
+ plt.grid(True)
496
+ plt.savefig(Path(self.output_path) / f"auc_roc_curve_train_{timestamp}.png")
497
+ plt.close()
498
+
499
+ plt.figure(figsize=(8, 6))
500
+ sns.heatmap(test_metrics['test_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
501
+ plt.xlabel('Predicted')
502
+ plt.ylabel('True')
503
+ plt.title('Test Confusion Matrix')
504
+ plt.savefig(Path(self.output_path) / f"confusion_matrix_test_{timestamp}.png")
505
+ plt.close()
506
+
507
+ fpr, tpr, _ = roc_curve(graph['review'].y[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:])
508
+ plt.figure(figsize=(10, 6))
509
+ plt.plot(fpr, tpr, label=f'Test ROC Curve (AUC = {test_metrics["test_auc_roc"]:.4f})')
510
+ plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
511
+ plt.xlabel('False Positive Rate')
512
+ plt.ylabel('True Positive Rate')
513
+ plt.title('Test ROC Curve')
514
+ plt.legend()
515
+ plt.grid(True)
516
+ plt.savefig(Path(self.output_path) / f"roc_curve_test_{timestamp}.png")
517
+ plt.close()
518
+
519
+ plt.figure(figsize=(8, 6))
520
+ sns.heatmap(train_metrics['train_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
521
+ plt.xlabel('Predicted')
522
+ plt.ylabel('True')
523
+ plt.title('Training Confusion Matrix (Final Epoch)')
524
+ plt.savefig(Path(self.output_path) / f"confusion_matrix_train_{timestamp}.png")
525
+ plt.close()
526
+
527
+ fpr_train, tpr_train, _ = roc_curve(graph['review'].y[:num_rows].cpu().numpy(), probs[:num_rows])
528
+ plt.figure(figsize=(10, 6))
529
+ plt.plot(fpr_train, tpr_train, label=f'Training ROC Curve (AUC = {train_metrics["train_auc_roc"]:.4f})')
530
+ plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
531
+ plt.xlabel('False Positive Rate')
532
+ plt.ylabel('True Positive Rate')
533
+ plt.title('Training ROC Curve (Final Epoch)')
534
+ plt.legend()
535
+ plt.grid(True)
536
+ plt.savefig(Path(self.output_path) / f"roc_curve_train_{timestamp}.png")
537
+ plt.close()
538
+
539
+ logger.info(f"All metrics, plots, and model saved to {self.output_path}")
540
+
src/model_trainer.py ADDED
@@ -0,0 +1,35 @@
1
+ from src.model import GraphformerModel
2
+ from pathlib import Path
3
+ from loguru import logger
4
+
5
+
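+ # Minimal usage sketch (assuming `df` is a dataframe already produced by the preprocessing pipeline):
+ #     trainer = ModelTrainer(df, output_path="output", epochs=100, test_size=0.3)
+ #     trainer.train_and_evaluate()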
6
+ class ModelTrainer:
7
+ def __init__(self, df, output_path, epochs=100, test_size=0.3):
8
+ self.df = df
9
+ self.output_path = output_path
10
+ self.epochs = epochs
11
+ self.test_size = test_size
12
+
13
+ # Create output directory
14
+ Path(self.output_path).mkdir(parents=True, exist_ok=True)
15
+
16
+ # Initialize the GraphformerModel (which wraps the HeteroGraphormer network)
17
+
18
+ self.model = GraphformerModel(df=self.df, output_path=self.output_path, epochs=self.epochs, test_size=self.test_size)
19
+
20
+
21
+
22
+ logger.info(f"Initialized ModelTrainer with output_path: {self.output_path} and epochs: {self.epochs}")
23
+
24
+
25
+ def train_and_evaluate(self):
26
+
27
+ try:
28
+ logger.info("Starting model training and evaluation")
29
+ self.model.run_model()
30
+ logger.info("GraphformerModel training and evaluation completed successfully")
31
+ except Exception as e:
32
+ logger.error(f"Error during GraphformerModel training and evaluation: {e}")
33
+ raise
34
+
35
+
src/preprocessing.py ADDED
@@ -0,0 +1,832 @@
1
+ from loguru import logger
2
+ import pandas as pd
3
+ import json
4
+ from datetime import datetime
5
+ import ast
6
+ import numpy as np
7
+ from pymongo import MongoClient
8
+ from collections import defaultdict
9
+
10
+ from tqdm import tqdm
11
+ import time
12
+
13
+ import requests
14
+ import json
15
+ import os
16
+ import pandas as pd
17
+ import nltk
18
+ from nltk.tokenize import sent_tokenize, word_tokenize
19
+ from nltk.corpus import stopwords
20
+ from textblob import TextBlob
21
+ import re
22
+ from transformers import BertTokenizer, BertModel
23
+ from transformers import RobertaTokenizer, RobertaModel
24
+ import torch
25
+ from sklearn.metrics.pairwise import cosine_similarity
26
+ import numpy as np
27
+
28
+ # Download NLTK resources
29
+ nltk.download('punkt')
30
+ nltk.download('averaged_perceptron_tagger')
31
+ nltk.download('stopwords')
32
+ nltk.download('punkt_tab')
33
+ nltk.download('averaged_perceptron_tagger_eng')
34
+
35
+ class Preprocessor:
36
+ def __init__(self,df):
37
+ self.df=df
38
+ self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
39
+ self.model = RobertaModel.from_pretrained('roberta-base')
40
+ self.stop_words = set(stopwords.words('english'))
41
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use GPU when available
42
+
43
+
44
+
45
+ def get_bert_embedding(self, text):
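+ # Despite the method name, this uses RoBERTa ('roberta-base') and returns the mean-pooled
+ # last hidden state as a fixed-length embedding.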
46
+ inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
47
+ with torch.no_grad():
48
+ outputs = self.model(**inputs)
49
+ return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
50
+
51
+ def preprocess_text(self,text):
52
+ return text if pd.notna(text) else ""
53
+
54
+
55
+ def calculate_duration(self, time_range):
56
+ if not isinstance(time_range, str) or "-" not in time_range:
57
+ return None
58
+ start_str, end_str = time_range.split('-')
59
+ start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
60
+ end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
61
+ try:
62
+ start = datetime.strptime(start_str, '%H:%M')
63
+ end = datetime.strptime(end_str, '%H:%M')
64
+ duration = (end - start).total_seconds() / 3600
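+ # A negative duration means the range wraps past midnight (e.g. "22:0-2:0"), so 24 hours are added back.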
65
+ return duration if duration >= 0 else duration + 24
66
+ except ValueError:
67
+ return None
68
+ def calculate_sentiment_severity(self, text):
69
+ if pd.isna(text) or not text.strip():
70
+ return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})
71
+
72
+ # Get sentiment polarity (-1 to 1)
73
+ blob = TextBlob(text)
74
+ polarity = blob.sentiment.polarity
75
+
76
+ # Define severity weights
77
+ good_weight = 0.7
78
+ bad_weight = 0.3
79
+
80
+ if polarity > 0:
81
+ good_severity = good_weight * polarity
82
+ bad_severity = 0.0
83
+ elif polarity < 0:
84
+ good_severity = 0.0
85
+ bad_severity = bad_weight * abs(polarity)
86
+ else: # Neutral (polarity = 0)
87
+ good_severity = 0.0
88
+ bad_severity = 0.0
89
+
90
+ return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})
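+ # Example: polarity +0.5 -> good_severity = 0.7 * 0.5 = 0.35; polarity -0.5 -> bad_severity = 0.3 * 0.5 = 0.15.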
91
+
92
+
93
+ def get_avg_duration(self, hours_str):
94
+ if pd.isna(hours_str) or not isinstance(hours_str, str):
95
+ return pd.NA
96
+ try:
97
+ hours_dict = ast.literal_eval(hours_str)
98
+ if not hours_dict:
99
+ return pd.NA
100
+ durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
101
+ valid_durations = [d for d in durations if d is not None]
102
+ return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
103
+ except (ValueError, SyntaxError, ZeroDivisionError):
104
+ return pd.NA
105
+
106
+
107
+ def calculate_time_since_last_review(self):
108
+ present_date = datetime.now()
109
+ user_latest_timestamp = {}
110
+
111
+ # Convert review_date to datetime
112
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
113
+
114
+ # Calculate hours difference for each user's latest review
115
+ for user_id in self.df["user_id"].unique():
116
+ latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()
117
+
118
+ if not isinstance(latest_date, datetime):
119
+ latest_date = latest_date.to_pydatetime()
120
+
121
+ hours_difference = (present_date - latest_date).total_seconds() / 3600
122
+ user_latest_timestamp[user_id] = hours_difference
123
+
124
+ # Map the hours difference to a new column
125
+ self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)
126
+
127
+ def calculate_time_since_last_review_business(self):
128
+ present_date = datetime.now()
129
+
130
+ # Ensure review_date is in datetime format
131
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
132
+
133
+ # Initialize dictionary to store hours since last review for each business
134
+ business_latest_timestamp = {}
135
+
136
+ # Iterate over unique business_ids
137
+ for business_id in self.df["business_id"].unique():
138
+ # Get the latest review date for this business
139
+ latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()
140
+
141
+ # Convert to datetime object if needed
142
+ if not isinstance(latest_date, datetime):
143
+ latest_date = latest_date.to_pydatetime()
144
+
145
+ # Calculate hours difference (already in hours)
146
+ hours_difference = (present_date - latest_date).total_seconds() / 3600
147
+ business_latest_timestamp[business_id] = hours_difference
148
+
149
+ # Map the hours difference to the new column
150
+ self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)
151
+
152
+
153
+
154
+ def calculate_user_account_age(self):
155
+ present_date = datetime.now()
156
+
157
+ # Convert yelping_since to datetime
158
+ self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
159
+
160
+ # Calculate user account age in days
161
+ self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
162
+
163
+
164
+ def calculate_avg_time_between_reviews(self):
165
+ # Ensure review_date is in datetime format
166
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
167
+
168
+ # Sort the DataFrame by user_id and review_date to ensure chronological order
169
+ self.df = self.df.sort_values(["user_id", "review_date"])
170
+
171
+ # Define helper function to calculate average time between reviews
172
+ def calculate_avg_time(group):
173
+ if len(group) == 1:
174
+ return 0 # If only one review, assign 0
175
+ # Calculate differences in hours between consecutive reviews
176
+ diffs = group["review_date"].diff().dt.total_seconds() / 3600
177
+ # Drop the first NaN (from diff) and compute the mean
178
+ return diffs.dropna().mean()
179
+
180
+ # Apply the function to each user_id group and create a mapping
181
+ avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)
182
+
183
+ # Map the average time back to the original DataFrame
184
+ self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)
185
+
186
+
187
+ def calculate_user_degree(self):
188
+ # Calculate the number of unique businesses per user
189
+ user_business_counts = self.df.groupby("user_id")["business_id"].nunique()
190
+
191
+ # Map the counts back to the original DataFrame
192
+ self.df["user_degree"] = self.df["user_id"].map(user_business_counts)
193
+
194
+
195
+ def calculate_business_degree(self):
196
+ # Calculate the number of unique users per business
197
+ business_user_counts = self.df.groupby("business_id")["user_id"].nunique()
198
+
199
+ # Map the counts back to the original DataFrame
200
+ self.df["business_degree"] = self.df["business_id"].map(business_user_counts)
201
+
202
+
203
+ def calculate_rating_variance_user(self):
204
+ # Calculate the mode (most frequent rating) per user
205
+ user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])
206
+
207
+ # Map the most frequent rating back to the original DataFrame
208
+ self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)
209
+
210
+
211
+ def calculate_user_review_burst_count(self):
212
+ # Ensure review_date is in datetime format
213
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
214
+
215
+ # Sort by user_id and review_date for chronological order
216
+ self.df = self.df.sort_values(["user_id", "review_date"])
217
+
218
+ # Function to calculate the max number of reviews in any 20-day window
219
+ def calculate_burst_count(group):
220
+ if len(group) <= 1:
221
+ return 0 # No burst if 1 or fewer reviews
222
+
223
+ # Convert review_date to a Series for rolling window
224
+ dates = group["review_date"]
225
+
226
+ # Calculate the number of reviews within 20 days of each review
227
+ burst_counts = []
228
+ for i, date in enumerate(dates):
229
+ # Count reviews within 20 days after this date
230
+ window_end = date + pd.Timedelta(days=20)
231
+ count = ((dates >= date) & (dates <= window_end)).sum()
232
+ burst_counts.append(count)
233
+
234
+ # Return the maximum burst count for this user
235
+ return max(burst_counts)
236
+
237
+ # Calculate the burst count per user
238
+ user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)
239
+
240
+ # Map the burst count back to the original DataFrame
241
+ self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)
242
+
243
+
244
+ def calculate_business_review_burst_count(self):
245
+ # Ensure review_date is in datetime format
246
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
247
+
248
+ # Sort by business_id and review_date for chronological order
249
+ self.df = self.df.sort_values(["business_id", "review_date"])
250
+
251
+ # Function to calculate the max number of reviews in any 10-day window
252
+ def calculate_burst_count(group):
253
+ if len(group) <= 1:
254
+ return 0 # No burst if 1 or fewer reviews
255
+
256
+ # Convert review_date to a Series for rolling window
257
+ dates = group["review_date"]
258
+
259
+ # Calculate the number of reviews within 10 days of each review
260
+ burst_counts = []
261
+ for i, date in enumerate(dates):
262
+ # Count reviews within 10 days after this date
263
+ window_end = date + pd.Timedelta(days=10)
264
+ count = ((dates >= date) & (dates <= window_end)).sum()
265
+ burst_counts.append(count)
266
+
267
+ # Return the maximum burst count for this business
268
+ return max(burst_counts)
269
+
270
+ # Calculate the burst count per business
271
+ business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)
272
+
273
+ # Map the burst count back to the original DataFrame
274
+ self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)
275
+
276
+
277
+ def calculate_temporal_similarity(self):
278
+ self.df["review_date"] = pd.to_datetime(self.df["review_date"])
279
+
280
+ # Extract the day of the week (0 = Monday, 6 = Sunday)
281
+ self.df["day_of_week"] = self.df["review_date"].dt.dayofweek
282
+
283
+ # Function to calculate avg hours between reviews on frequent days
284
+ def calculate_avg_hours_on_frequent_days(group):
285
+ frequent_days = group["day_of_week"].mode().tolist()
286
+
287
+ if len(group) <= 1:
288
+ return 0
289
+
290
+ frequent_reviews = group[group["day_of_week"].isin(frequent_days)]
291
+
292
+ if len(frequent_reviews) <= 1:
293
+ return 0
294
+
295
+ frequent_reviews = frequent_reviews.sort_values("review_date")
296
+ diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600
297
+
298
+ return diffs.dropna().mean()
299
+
300
+ # Calculate average hours for each user
301
+ avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)
302
+
303
+ # Map the average hours to the new column
304
+ self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)
305
+
306
+ # Drop temporary column
307
+ self.df = self.df.drop(columns=["day_of_week"])
308
+
309
+
310
+ def calculate_rating_deviation_from_business_average(self):
311
+ # Calculate the average rating per business
312
+ business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()
313
+
314
+ # Map the average rating to each row
315
+ self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)
316
+
317
+ # Calculate the deviation from the business average
318
+ self.df["rating_deviation_from_business_average"] = (
319
+ self.df["review_stars"] - self.df["business_avg_rating"]
320
+ )
321
+
322
+ # Drop the temporary column
323
+ self.df = self.df.drop(columns=["business_avg_rating"])
324
+
325
+ def calculate_review_like_ratio(self):
326
+ # Create a binary column for liked reviews (stars >= 4)
327
+ self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)
328
+
329
+ # Calculate the like ratio per user
330
+ user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()
331
+
332
+ # Map the like ratio back to the DataFrame
333
+ self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)
334
+
335
+ # Drop the temporary column
336
+ self.df = self.df.drop(columns=["is_liked"])
337
+
338
+ def calculate_latest_checkin_hours(self):
339
+ self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
340
+
341
+ # Function to get the latest check-in date from a list of strings
342
+ def get_latest_checkin(checkin_list):
343
+ if not checkin_list or pd.isna(checkin_list): # Handle empty or NaN
344
+ return None
345
+ if isinstance(checkin_list, str):
346
+ checkin_dates = checkin_list.split(", ")
347
+ else:
348
+ checkin_dates = checkin_list
349
+ return pd.to_datetime(checkin_dates).max()
350
+
351
+ # Apply the function to get the latest check-in date per row
352
+ self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)
353
+
354
+ # Calculate the hours difference between latest check-in and yelping_since
355
+ self.df["latest_checkin_hours"] = (
356
+ (self.df["latest_checkin_date"] - self.df["yelping_since"])
357
+ .dt.total_seconds() / 3600
358
+ )
359
+
360
+ # Drop the temporary column
361
+ self.df = self.df.drop(columns=["latest_checkin_date"])
362
+ self.df["latest_checkin_hours"] = self.df["latest_checkin_hours"].fillna(0)
363
+
364
+
365
+ def compute_pronoun_density(self, text):
366
+ text = self.preprocess_text(text)
367
+ if not text:
368
+ return 0
369
+ words = word_tokenize(text.lower())
370
+ pos_tags = nltk.pos_tag(words)
371
+ pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
372
+ return pronouns / len(words) if words else 0
373
+
374
+ def compute_avg_sentence_length(self, text):
375
+ text = self.preprocess_text(text)
376
+ if not text:
377
+ return 0
378
+ sentences = sent_tokenize(text)
379
+ return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0
380
+
381
+ def compute_excessive_punctuation(self, text):
382
+ text = self.preprocess_text(text)
383
+ return len(re.findall(r'[!?.]{2,}', text))
384
+
385
+ def compute_sentiment_polarity(self, text):
386
+ text = self.preprocess_text(text)
387
+ return TextBlob(text).sentiment.polarity if text else 0
388
+
389
+ def compute_code_switching_flag(self, text):
390
+ text = self.preprocess_text(text)
391
+ if not text:
392
+ return 0
393
+
394
+ tokens = self.tokenizer.tokenize(text.lower())
395
+ if not tokens:
396
+ return 0
397
+
398
+ english_words = self.stop_words # Use self.stop_words from __init__
399
+ token_set = set(tokens)
400
+ english_count = sum(1 for token in tokens if token in english_words)
401
+
402
+ non_english_pattern = re.compile(r'[^\x00-\x7F]')
403
+ has_non_ascii = 1 if non_english_pattern.search(text) else 0
404
+
405
+ english_ratio = english_count / len(tokens) if tokens else 0
406
+
407
+ non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)
408
+
409
+ # Flag as code-switching if:
410
+ # 1. Mixed English presence (ratio between 0.1 and 0.9)
411
+ # 2. Non-ASCII characters present OR some non-English subword tokens
412
+ if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
413
+ return 1
414
+ return 0
415
+
416
+
417
+ def batch_tokenize(self, texts, batch_size=32, max_length=512):
418
+ tokenized_outputs = []
419
+ for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
420
+ batch_texts = texts[i:i + batch_size]
421
+ valid_texts = [self.preprocess_text(t) for t in batch_texts]
422
+ # Tokenize with fixed max_length to ensure consistent tensor sizes
423
+ inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
424
+ tokenized_outputs.append(inputs['input_ids'].to(self.device)) # Move to GPU
425
+ # Concatenate on GPU with consistent sizes
426
+ return torch.cat(tokenized_outputs, dim=0)
427
+
428
+ def compute_grammar_error_score(self, texts, tokenized_ids):
429
+ print("Computing grammar error scores...")
430
+ error_scores = np.zeros(len(texts), dtype=float)
431
+
432
+ vocab_set = set(self.tokenizer.get_vocab().keys())
433
+ for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
434
+ if input_ids.sum() == 0: # Empty input
435
+ continue
436
+ tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
437
+ unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
438
+ total_count = len([t for t in tokens if t not in self.stop_words])
439
+ error_scores[i] = unknown_count / total_count if total_count > 0 else 0
440
+
441
+ return error_scores
442
+
443
+ def compute_repetitive_words_count(self, texts, tokenized_ids):
444
+ print("Computing repetitive words counts...")
445
+ rep_counts = np.zeros(len(texts), dtype=int)
446
+
447
+ for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
448
+ if input_ids.sum() == 0: # Empty input
449
+ continue
450
+ tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
451
+ valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
452
+ if valid_tokens:
453
+ token_counts = {}
454
+ for token in valid_tokens:
455
+ token_counts[token] = token_counts.get(token, 0) + 1
456
+ rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)
457
+
458
+ return rep_counts
459
+
460
+ def preprocess_text_for_similarity(self, text):
461
+ if pd.isna(text) or not text.strip():
462
+ return []
463
+ return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]
464
+
465
+ def batch_encode_words(self, texts, batch_size=32, max_length=512):
466
+ word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
467
+ vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}
468
+
469
+ encoded_batches = []
470
+ for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
471
+ batch_words = word_lists[i:i + batch_size]
472
+ encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
473
+ for j, words in enumerate(batch_words):
474
+ if words:
475
+ word_ids = [vocab.get(w, 0) for w in words][:max_length]
476
+ encoded[j, :len(word_ids)] = word_ids
477
+ encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
478
+ encoded_batches.append(encoded_tensor)
479
+
480
+ return torch.cat(encoded_batches, dim=0), vocab
481
+
482
+ def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
483
+ all_texts = self.df["review_text"].tolist()
484
+ all_users = self.df["user_id"].tolist()
485
+ all_review_ids = self.df["review_id"].tolist()
486
+
487
+ encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)
488
+
489
+ similarity_scores = {rid: 0.0 for rid in all_review_ids} # Default scores
490
+ for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
491
+ if pd.isna(review_id) or pd.isna(user_id):
492
+ continue
493
+
494
+ current_words = encoded_words[i]
495
+ if current_words.sum() == 0:
496
+ continue
497
+
498
+ other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
499
+ dtype=torch.long).to(self.device)
500
+ if not other_indices.numel():
501
+ continue
502
+
503
+ other_words = encoded_words[other_indices]
504
+ current_set = torch.unique(current_words[current_words > 0])
505
+ other_flat = other_words[other_words > 0]
506
+
507
+ if other_flat.numel() == 0:
508
+ continue
509
+
510
+ other_set = torch.unique(other_flat)
511
+ intersection = torch.sum(torch.isin(current_set, other_set)).float()
512
+ union = torch.unique(torch.cat([current_set, other_set])).numel()
513
+ similarity = intersection / union if union > 0 else 0.0
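+ # Jaccard index over unique encoded word ids: |current ∩ others| / |current ∪ others|, where
+ # "others" pools every review written by a different user.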
514
+
515
+ similarity_scores[review_id] = similarity.item()
516
+ return pd.Series(similarity_scores, index=all_review_ids)
517
+
518
+ def calculate_friend_count(self):
519
+ friends = []
520
+ for v in self.df["friends"]:
521
+ if isinstance(v, str):
522
+ friends.append(len(v.split(",")))
523
+ elif isinstance(v, (int, float)):
524
+ friends.append(0)
525
+ self.df["friends"] = friends
526
+
527
+ def count_elite_years(self, elite):
528
+ if pd.isna(elite):
529
+ return 0
530
+ return len(str(elite).split(","))
531
+
532
+ def transform_elite_status(self):
533
+ self.df["elite"] = self.df["elite"].apply(lambda x: True if self.count_elite_years(x) > 1 else False)
534
+ self.df["elite"] = self.df["elite"].astype(int)
535
+
536
+
537
+ def calculate_review_useful_funny_cool(self):
538
+ self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
539
+ self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
540
+ self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
541
+ self.df["review_useful_funny_cool"] = (
542
+ self.df["review_useful"] +
543
+ self.df["review_funny"] +
544
+ self.df["review_cool"]
545
+ )
546
+ self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)
547
+
548
+
549
+ def calculate_user_useful_funny_cool(self):
550
+ self.df["user_useful_funny_cool"] = (
551
+ self.df["user_useful"] +
552
+ self.df["user_funny"] +
553
+ self.df["user_cool"]
554
+ )
555
+ self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)
556
+
557
+ def compute_fake_score(self, row):
558
+ suspicion_points = 0
559
+
560
+ # Linguistic Features
561
+ if row["pronoun_density"] < 0.01: # Low personal engagement
562
+ suspicion_points += 1
563
+ if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30: # Extreme lengths
564
+ suspicion_points += 1
565
+ if row["grammar_error_score"] > 5: # Many errors
566
+ suspicion_points += 1
567
+ if row["repetitive_words_count"] > 5: # High repetition
568
+ suspicion_points += 1
569
+ if row["code_switching_flag"] == 1: # Language mixing
570
+ suspicion_points += 1
571
+ if row["excessive_punctuation_count"] > 3: # Overuse of punctuation
572
+ suspicion_points += 1
573
+ if abs(row["sentiment_polarity"]) > 0.8: # Extreme sentiment
574
+ suspicion_points += 1
575
+
576
+ # Review Patterns
577
+ if row["similarity_to_other_reviews"] > 0.8: # High duplication
578
+ suspicion_points += 1
579
+ if row["user_review_burst_count"] > 5: # Spammy bursts
580
+ suspicion_points += 1
581
+ if row["business_review_burst_count"] > 5: # Targeted bursts
582
+ suspicion_points += 1
583
+ if abs(row["rating_deviation_from_business_average"]) > 2: # Large rating deviation
584
+ suspicion_points += 1
585
+ if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1: # Extreme like ratio
586
+ suspicion_points += 1
587
+
588
+ # User Behavior
589
+ if row["user_account_age"] < 30: # Very new account (days)
590
+ suspicion_points += 1
591
+ if row["average_time_between_reviews"] < 24: # Rapid reviews (hours)
592
+ suspicion_points += 1
593
+ if row["user_degree"] < 2: # Low business interaction
594
+ suspicion_points += 1
595
+ if row["time_since_last_review_user"] < 24: # Recent burst (hours)
596
+ suspicion_points += 1
597
+
598
+ # Threshold: 3 or more points = fake
599
+ return 1 if suspicion_points >= 3 else 0
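+ # Example: an account under 30 days old (+1), reviews averaging under 24 hours apart (+1) and
+ # similarity_to_other_reviews above 0.8 (+1) already reach the 3-point threshold and get fake = 1.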
600
+
601
+
602
+ def run_pipeline(self):
603
+
604
+
605
+
606
+ logger.info("FINALIZING HOURS COLUMN ...")
607
+ self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
608
+ self.df["hours"] = self.df["hours"].fillna(0)
609
+ print(self.df["hours"][:10])
610
+ print(self.df["hours"].isnull().sum())
611
+
612
+
613
+
614
+
615
+ logger.info("DROPPING ATTRIBUTES COLUMN ...")
616
+ self.df.drop("attributes",axis=1,inplace=True)
617
+
618
+
619
+
620
+ logger.info("CREATING time_since_last_review_user COLUMN ...")
621
+ self.calculate_time_since_last_review()
622
+ print(np.unique(self.df["time_since_last_review_user"] ))
623
+
624
+
625
+ logger.info("CREATING time_since_last_review_business COLUMN ...")
626
+ self.calculate_time_since_last_review_business()
627
+ print(np.unique(self.df["time_since_last_review_business"] ))
628
+
629
+
630
+
631
+ logger.info("CREATING user_account_age COLUMN ...")
632
+ self.calculate_user_account_age()
633
+ print(np.unique(self.df["user_account_age"] ))
634
+
635
+
636
+
637
+ logger.info("CREATING average_time_between_reviews COLUMN ...")
638
+ self.calculate_avg_time_between_reviews()
639
+ print(np.unique(self.df["average_time_between_reviews"] ))
640
+
641
+
642
+
643
+ logger.info("CREATING user_degree COLUMN ...")
644
+ self.calculate_user_degree()
645
+ print(np.unique(self.df["user_degree"] ))
646
+
647
+
648
+ logger.info("CREATING business_degree COLUMN ...")
649
+ self.calculate_business_degree()
650
+ print(np.unique(self.df["business_degree"] ))
651
+
652
+
653
+ logger.info("CREATING rating_variance_user COLUMN ...")
654
+ self.calculate_rating_variance_user()
655
+ print(np.unique(self.df["rating_variance_user"] ))
656
+
657
+
658
+
659
+ logger.info("CREATING user_review_burst_count COLUMN ...")
660
+ self.calculate_user_review_burst_count()
661
+ print(np.unique(self.df["user_review_burst_count"] ))
662
+
663
+
664
+ logger.info("CREATING business_review_burst_count COLUMN ...")
665
+ self.calculate_business_review_burst_count()
666
+ print(np.unique(self.df["business_review_burst_count"] ))
667
+
668
+
669
+
670
+ logger.info("CREATING temporal_similarity COLUMN ...")
671
+ self.calculate_temporal_similarity()
672
+ print(np.unique(self.df["temporal_similarity"] ))
673
+
674
+
675
+
676
+ logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
677
+ self.calculate_rating_deviation_from_business_average()
678
+ print(np.unique(self.df["rating_deviation_from_business_average"] ))
679
+
680
+
681
+
682
+ logger.info("CREATING review_like_ratio COLUMN ...")
683
+ self.calculate_review_like_ratio()
684
+ print(np.unique(self.df["review_like_ratio"] ))
685
+
686
+
687
+
688
+ logger.info("CREATING latest_checkin_hours COLUMN ...")
689
+ self.calculate_latest_checkin_hours()
690
+ print(np.unique(self.df["latest_checkin_hours"] ))
691
+
692
+
693
+
694
+
695
+ logger.info("CREATING pronoun_density COLUMN ...")
696
+ self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
697
+ print(np.unique(self.df["pronoun_density"] ))
698
+
699
+ logger.info("CREATING avg_sentence_length COLUMN ...")
700
+ self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
701
+ print(np.unique(self.df["avg_sentence_length"] ))
702
+
703
+ logger.info("CREATING excessive_punctuation_count COLUMN ...")
704
+ self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
705
+ print(np.unique(self.df["excessive_punctuation_count"] ))
706
+
707
+ logger.info("CREATING sentiment_polarity COLUMN ...")
708
+ self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
709
+ print(np.unique(self.df["sentiment_polarity"] ))
710
+
711
+ logger.info("CREATING good_severity and bad_severity COLUMNS ...")
712
+ severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
713
+ self.df[["good_severity", "bad_severity"]] = severity_scores
714
+ print(np.unique(self.df["good_severity"] ))
715
+ print(np.unique(self.df["bad_severity"] ))
716
+
717
+
718
+ logger.info("CREATING code_switching_flag COLUMN ...")
719
+ self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
720
+ print(np.unique(self.df["code_switching_flag"] ))
721
+
722
+
723
+ all_texts = self.df["review_text"].tolist()
724
+ tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
725
+
726
+ logger.info("CREATING grammar_error_score COLUMN ...")
727
+ self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
728
+ print(np.unique(self.df["grammar_error_score"] ))
729
+
730
+
731
+ logger.info("CREATING repetitive_words_count COLUMN ...")
732
+ self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
733
+ print(np.unique(self.df["repetitive_words_count"] ))
734
+
735
+
736
+
737
+ logger.info("CREATING similarity_to_other_reviews COLUMN ...")
738
+ similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
739
+ self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
740
+
741
+ print(np.unique(self.df["similarity_to_other_reviews"] ))
742
+
743
+
744
+
745
+ logger.info("CREATING friends COLUMN ...")
746
+ self.calculate_friend_count()
747
+ print(self.df["friends"].value_counts())
748
+
749
+ logger.info("CREATING elite COLUMN ...")
750
+ self.transform_elite_status()
751
+ print(self.df["elite"].value_counts())
752
+
753
+
754
+ logger.info("CREATING review_useful_funny_cool COLUMN ...")
755
+ self.calculate_review_useful_funny_cool()
756
+ print(self.df["review_useful_funny_cool"].value_counts())
757
+
758
+
759
+ logger.info("CREATING user_useful_funny_cool COLUMN ...")
760
+ self.calculate_user_useful_funny_cool()
761
+ print(self.df["user_useful_funny_cool"].value_counts())
762
+
763
+
764
+ logger.info("CREATING LABEL COLUMN ...")
765
+ self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
766
+ print(self.df["fake"].value_counts())
767
+
768
+
769
+ logger.info("CHECKING NULL VALUES IN FINAL COLUMNS ...")
770
+ print(set(self.df.isnull().sum().values))
771
+
772
+
773
+
774
+
775
+ return self.df
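+ # Typical usage sketch (assuming a raw merged Yelp dataframe `df`): processed_df = Preprocessor(df).run_pipeline()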
776
+
777
+
778
+
779
+
780
+
781
+
782
+
783
+
784
+
785
+
786
+
787
+
788
+
789
+
790
+
791
+
792
+
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
+
818
+
819
+
820
+
821
+
822
+
823
+
824
+
825
+
826
+
827
+
828
+
829
+
830
+
831
+
832
+