OVH committed
Commit c1e568b · 1 Parent(s): c0cb02a

Added some files

Files changed (1)
  1. preprocessing_test.py +866 -0
preprocessing_test.py ADDED
@@ -0,0 +1,866 @@
+ from loguru import logger
+ import pandas as pd
+ import json
+ from datetime import datetime
+ import ast
+ import numpy as np
+ from pymongo import MongoClient
+ from collections import defaultdict
+
+ from tqdm import tqdm
+ import time
+
+ import requests
+ import os
+ import nltk
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ from nltk.corpus import stopwords
+ from textblob import TextBlob
+ import re
+ from transformers import BertTokenizer, BertModel
+ from transformers import RobertaTokenizer, RobertaModel
+ import torch
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Download NLTK resources
+ nltk.download('punkt')
+ nltk.download('averaged_perceptron_tagger')
+ nltk.download('stopwords')
+ nltk.download('punkt_tab')
+ nltk.download('averaged_perceptron_tagger_eng')
+
+ class Preprocessor:
+     def __init__(self, df):
+         self.df = df
+         self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+         self.model = RobertaModel.from_pretrained('roberta-base')
+         self.stop_words = set(stopwords.words('english'))
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+     def get_bert_embedding(self, text):
+         inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+         return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
+
+     def preprocess_text(self, text):
+         return text if pd.notna(text) else ""
+
+     def calculate_duration(self, time_range):
+         if not isinstance(time_range, str) or "-" not in time_range:
+             return None
+         start_str, end_str = time_range.split('-')
+         start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
+         end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
+         try:
+             start = datetime.strptime(start_str, '%H:%M')
+             end = datetime.strptime(end_str, '%H:%M')
+             duration = (end - start).total_seconds() / 3600
+             return duration if duration >= 0 else duration + 24
+         except ValueError:
+             return None
+
+     def calculate_sentiment_severity(self, text):
+         if pd.isna(text) or not text.strip():
+             return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})
+
+         # Get sentiment polarity (-1 to 1)
+         blob = TextBlob(text)
+         polarity = blob.sentiment.polarity
+
+         # Define severity weights
+         good_weight = 0.7
+         bad_weight = 0.3
+
+         if polarity > 0:
+             good_severity = good_weight * polarity
+             bad_severity = 0.0
+         elif polarity < 0:
+             good_severity = 0.0
+             bad_severity = bad_weight * abs(polarity)
+         else:  # Neutral (polarity = 0)
+             good_severity = 0.0
+             bad_severity = 0.0
+
+         return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})
+
+     def get_avg_duration(self, hours_str):
+         if pd.isna(hours_str) or not isinstance(hours_str, str):
+             return pd.NA
+         try:
+             hours_dict = ast.literal_eval(hours_str)
+             if not hours_dict:
+                 return pd.NA
+             durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
+             valid_durations = [d for d in durations if d is not None]
+             return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
+         except (ValueError, SyntaxError, ZeroDivisionError):
+             return pd.NA
+
+     def calculate_time_since_last_review(self):
+         present_date = datetime.now()
+         user_latest_timestamp = {}
+
+         # Convert review_date to datetime
+         self.df["review_date"] = pd.to_datetime(self.df["review_date"])
+
+         # Calculate hours difference for each user's latest review
+         for user_id in self.df["user_id"].unique():
+             latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()
+
+             if not isinstance(latest_date, datetime):
+                 latest_date = latest_date.to_pydatetime()
+
+             hours_difference = (present_date - latest_date).total_seconds() / 3600
+             user_latest_timestamp[user_id] = hours_difference
+
+         # Map the hours difference to a new column
+         self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)
+
+     def calculate_time_since_last_review_business(self):
+         present_date = datetime.now()
+
+         # Ensure review_date is in datetime format
+         self.df["review_date"] = pd.to_datetime(self.df["review_date"])
+
+         # Initialize dictionary to store hours since last review for each business
+         business_latest_timestamp = {}
+
+         # Iterate over unique business_ids
+         for business_id in self.df["business_id"].unique():
+             # Get the latest review date for this business
+             latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()
+
+             # Convert to datetime object if needed
+             if not isinstance(latest_date, datetime):
+                 latest_date = latest_date.to_pydatetime()
+
+             # Calculate hours difference (already in hours)
+             hours_difference = (present_date - latest_date).total_seconds() / 3600
+             business_latest_timestamp[business_id] = hours_difference
+
+         # Map the hours difference to the new column
+         self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)
+
+     def calculate_user_account_age(self):
+         present_date = datetime.now()
+
+         # Convert yelping_since to datetime
+         self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
+
+         # Calculate user account age in days
+         self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
+
+     def calculate_avg_time_between_reviews(self):
+         # Ensure review_date is in datetime format
+         self.df["review_date"] = pd.to_datetime(self.df["review_date"])
+
+         # Sort the DataFrame by user_id and review_date to ensure chronological order
+         self.df = self.df.sort_values(["user_id", "review_date"])
+
+         # Define helper function to calculate average time between reviews
+         def calculate_avg_time(group):
+             if len(group) == 1:
+                 return 0  # If only one review, assign 0
+             # Calculate differences in hours between consecutive reviews
+             diffs = group["review_date"].diff().dt.total_seconds() / 3600
+             # Drop the first NaN (from diff) and compute the mean
+             return diffs.dropna().mean()
+
+         # Apply the function to each user_id group and create a mapping
+         avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)
+
+         # Map the average time back to the original DataFrame
+         self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)
+
+     def calculate_user_degree(self):
+         # Calculate the number of unique businesses per user
+         user_business_counts = self.df.groupby("user_id")["business_id"].nunique()
+
+         # Map the counts back to the original DataFrame
+         self.df["user_degree"] = self.df["user_id"].map(user_business_counts)
+
+     def calculate_business_degree(self):
+         # Calculate the number of unique users per business
+         business_user_counts = self.df.groupby("business_id")["user_id"].nunique()
+
+         # Map the counts back to the original DataFrame
+         self.df["business_degree"] = self.df["business_id"].map(business_user_counts)
+
+     def calculate_rating_variance_user(self):
+         # Calculate the mode (most frequent rating) per user
+         user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])
+
+         # Map the most frequent rating back to the original DataFrame
+         self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)
+
+     def calculate_user_review_burst_count(self):
+         # Ensure review_date is in datetime format
+         self.df["review_date"] = pd.to_datetime(self.df["review_date"])
+
+         # Sort by user_id and review_date for chronological order
+         self.df = self.df.sort_values(["user_id", "review_date"])
+
+         # Function to calculate the max number of reviews in any 20-day window
+         def calculate_burst_count(group):
+             if len(group) <= 1:
+                 return 0  # No burst if 1 or fewer reviews
+
+             # Convert review_date to a Series for rolling window
+             dates = group["review_date"]
+
+             # Calculate the number of reviews within 20 days of each review
+             burst_counts = []
+             for i, date in enumerate(dates):
+                 # Count reviews within 20 days after this date
+                 window_end = date + pd.Timedelta(days=20)
+                 count = ((dates >= date) & (dates <= window_end)).sum()
+                 burst_counts.append(count)
+
+             # Return the maximum burst count for this user
+             return max(burst_counts)
+
+         # Calculate the burst count per user
+         user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)
+
+         # Map the burst count back to the original DataFrame
+         self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)
+
+     def calculate_business_review_burst_count(self):
+         # Ensure review_date is in datetime format
+         self.df["review_date"] = pd.to_datetime(self.df["review_date"])
+
+         # Sort by business_id and review_date for chronological order
+         self.df = self.df.sort_values(["business_id", "review_date"])
+
+         # Function to calculate the max number of reviews in any 10-day window
+         def calculate_burst_count(group):
+             if len(group) <= 1:
+                 return 0  # No burst if 1 or fewer reviews
+
+             # Convert review_date to a Series for rolling window
+             dates = group["review_date"]
+
+             # Calculate the number of reviews within 10 days of each review
+             burst_counts = []
+             for i, date in enumerate(dates):
+                 # Count reviews within 10 days after this date
+                 window_end = date + pd.Timedelta(days=10)
+                 count = ((dates >= date) & (dates <= window_end)).sum()
+                 burst_counts.append(count)
+
+             # Return the maximum burst count for this business
+             return max(burst_counts)
+
+         # Calculate the burst count per business
+         business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)
+
+         # Map the burst count back to the original DataFrame
+         self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)
+
+     def calculate_temporal_similarity(self):
+         self.df["review_date"] = pd.to_datetime(self.df["review_date"])
+
+         # Extract the day of the week (0 = Monday, 6 = Sunday)
+         self.df["day_of_week"] = self.df["review_date"].dt.dayofweek
+
+         # Function to calculate avg hours between reviews on frequent days
+         def calculate_avg_hours_on_frequent_days(group):
+             frequent_days = group["day_of_week"].mode().tolist()
+
+             if len(group) <= 1:
+                 return 0
+
+             frequent_reviews = group[group["day_of_week"].isin(frequent_days)]
+
+             if len(frequent_reviews) <= 1:
+                 return 0
+
+             frequent_reviews = frequent_reviews.sort_values("review_date")
+             diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600
+
+             return diffs.dropna().mean()
+
+         # Calculate average hours for each user
+         avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)
+
+         # Map the average hours to the new column
+         self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)
+
+         # Drop temporary column
+         self.df = self.df.drop(columns=["day_of_week"])
+
+     def calculate_rating_deviation_from_business_average(self):
+         # Calculate the average rating per business
+         business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()
+
+         # Map the average rating to each row
+         self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)
+
+         # Calculate the deviation from the business average
+         self.df["rating_deviation_from_business_average"] = (
+             self.df["review_stars"] - self.df["business_avg_rating"]
+         )
+
+         # Drop the temporary column
+         self.df = self.df.drop(columns=["business_avg_rating"])
+
+     def calculate_review_like_ratio(self):
+         # Create a binary column for liked reviews (stars >= 4)
+         self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)
+
+         # Calculate the like ratio per user
+         user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()
+
+         # Map the like ratio back to the DataFrame
+         self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)
+
+         # Drop the temporary column
+         self.df = self.df.drop(columns=["is_liked"])
+
+     def calculate_latest_checkin_hours(self):
+         self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
+
+         # Function to get the latest check-in date from a list of strings
+         def get_latest_checkin(checkin_list):
+             if not checkin_list or pd.isna(checkin_list):  # Handle empty or NaN
+                 return None
+             if isinstance(checkin_list, str):
+                 checkin_dates = checkin_list.split(", ")
+             else:
+                 checkin_dates = checkin_list
+             return pd.to_datetime(checkin_dates).max()
+
+         # Apply the function to get the latest check-in date per row
+         self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)
+
+         # Calculate the hours difference between latest check-in and yelping_since
+         self.df["latest_checkin_hours"] = (
+             (self.df["latest_checkin_date"] - self.df["yelping_since"])
+             .dt.total_seconds() / 3600
+         )
+
+         # Drop the temporary column and fill missing hours with 0
+         self.df = self.df.drop(columns=["latest_checkin_date"])
+         self.df["latest_checkin_hours"] = self.df["latest_checkin_hours"].fillna(0)
+
+     def compute_pronoun_density(self, text):
+         text = self.preprocess_text(text)
+         if not text:
+             return 0
+         words = word_tokenize(text.lower())
+         pos_tags = nltk.pos_tag(words)
+         pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
+         return pronouns / len(words) if words else 0
+
+     def compute_avg_sentence_length(self, text):
+         text = self.preprocess_text(text)
+         if not text:
+             return 0
+         sentences = sent_tokenize(text)
+         return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0
+
+     def compute_excessive_punctuation(self, text):
+         text = self.preprocess_text(text)
+         return len(re.findall(r'[!?.]{2,}', text))
+
+     def compute_sentiment_polarity(self, text):
+         text = self.preprocess_text(text)
+         return TextBlob(text).sentiment.polarity if text else 0
+
+     def compute_code_switching_flag(self, text):
+         text = self.preprocess_text(text)
+         if not text:
+             return 0
+
+         tokens = self.tokenizer.tokenize(text.lower())
+         if not tokens:
+             return 0
+
+         english_words = self.stop_words  # Use self.stop_words from __init__
+         token_set = set(tokens)
+         english_count = sum(1 for token in tokens if token in english_words)
+
+         non_english_pattern = re.compile(r'[^\x00-\x7F]')
+         has_non_ascii = 1 if non_english_pattern.search(text) else 0
+
+         english_ratio = english_count / len(tokens) if tokens else 0
+
+         non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)
+
+         # Flag as code-switching if:
+         # 1. Mixed English presence (ratio between 0.1 and 0.9)
+         # 2. Non-ASCII characters present OR some non-English subword tokens
+         if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
+             return 1
+         return 0
+
+     def batch_tokenize(self, texts, batch_size=32, max_length=512):
+         tokenized_outputs = []
+         for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
+             batch_texts = texts[i:i + batch_size]
+             valid_texts = [self.preprocess_text(t) for t in batch_texts]
+             # Tokenize with fixed max_length to ensure consistent tensor sizes
+             inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
+             tokenized_outputs.append(inputs['input_ids'].to(self.device))  # Move to GPU
+         # Concatenate on GPU with consistent sizes
+         return torch.cat(tokenized_outputs, dim=0)
+
+     def compute_grammar_error_score(self, texts, tokenized_ids):
+         print("Computing grammar error scores...")
+         error_scores = np.zeros(len(texts), dtype=float)
+
+         vocab_set = set(self.tokenizer.get_vocab().keys())
+         for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
+             if input_ids.sum() == 0:  # Empty input
+                 continue
+             tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
+             unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
+             total_count = len([t for t in tokens if t not in self.stop_words])
+             error_scores[i] = unknown_count / total_count if total_count > 0 else 0
+
+         return error_scores
+
+     def compute_repetitive_words_count(self, texts, tokenized_ids):
+         print("Computing repetitive words counts...")
+         rep_counts = np.zeros(len(texts), dtype=int)
+
+         for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
+             if input_ids.sum() == 0:  # Empty input
+                 continue
+             tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
+             valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
+             if valid_tokens:
+                 token_counts = {}
+                 for token in valid_tokens:
+                     token_counts[token] = token_counts.get(token, 0) + 1
+                 rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)
+
+         return rep_counts
+
+     def preprocess_text_for_similarity(self, text):
+         if pd.isna(text) or not text.strip():
+             return []
+         return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]
+
+     def batch_encode_words(self, texts, batch_size=32, max_length=512):
+         word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
+         vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}
+
+         encoded_batches = []
+         for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
+             batch_words = word_lists[i:i + batch_size]
+             encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
+             for j, words in enumerate(batch_words):
+                 if words:
+                     word_ids = [vocab.get(w, 0) for w in words][:max_length]
+                     encoded[j, :len(word_ids)] = word_ids
+             encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
+             encoded_batches.append(encoded_tensor)
+
+         return torch.cat(encoded_batches, dim=0), vocab
+
+     def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
+         all_texts = self.df["review_text"].tolist()
+         all_users = self.df["user_id"].tolist()
+         all_review_ids = self.df["review_id"].tolist()
+
+         encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)
+
+         similarity_scores = {rid: 0.0 for rid in all_review_ids}  # Default scores
+         for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
+             if pd.isna(review_id) or pd.isna(user_id):
+                 continue
+
+             current_words = encoded_words[i]
+             if current_words.sum() == 0:
+                 continue
+
+             other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
+                                          dtype=torch.long).to(self.device)
+             if not other_indices.numel():
+                 continue
+
+             other_words = encoded_words[other_indices]
+             current_set = torch.unique(current_words[current_words > 0])
+             other_flat = other_words[other_words > 0]
+
+             if other_flat.numel() == 0:
+                 continue
+
+             other_set = torch.unique(other_flat)
+             intersection = torch.sum(torch.isin(current_set, other_set)).float()
+             union = torch.unique(torch.cat([current_set, other_set])).numel()
+             similarity = intersection / union if union > 0 else 0.0
+
+             similarity_scores[review_id] = similarity.item()
+         return pd.Series(similarity_scores, index=all_review_ids)
+
+     def calculate_friend_count(self):
+         friends = []
+         for v in self.df["friends"]:
+             if isinstance(v, str):
+                 friends.append(len(v.split(",")))
+             else:
+                 # Missing or numeric values mean no parsable friend list
+                 friends.append(0)
+         self.df["friends"] = friends
+
+     def count_elite_years(self, elite):
+         if pd.isna(elite):
+             return 0
+         return len(str(elite).split(","))
+
+     def transform_elite_status(self):
+         self.df["elite"] = self.df["elite"].apply(lambda x: self.count_elite_years(x) > 1)
+         self.df["elite"] = self.df["elite"].astype(int)
+
+     def calculate_review_useful_funny_cool(self):
+         self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
+         self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
+         self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
+         self.df["review_useful_funny_cool"] = (
+             self.df["review_useful"] +
+             self.df["review_funny"] +
+             self.df["review_cool"]
+         )
+         self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)
+
+     def calculate_user_useful_funny_cool(self):
+         self.df["user_useful_funny_cool"] = (
+             self.df["user_useful"] +
+             self.df["user_funny"] +
+             self.df["user_cool"]
+         )
+         self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)
+
+     def compute_fake_score(self, row):
+         suspicion_points = 0
+
+         # Linguistic Features
+         if row["pronoun_density"] < 0.01:  # Low personal engagement
+             suspicion_points += 1
+         if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30:  # Extreme lengths
+             suspicion_points += 1
+         if row["grammar_error_score"] > 5:  # Many errors
+             suspicion_points += 1
+         if row["repetitive_words_count"] > 5:  # High repetition
+             suspicion_points += 1
+         if row["code_switching_flag"] == 1:  # Language mixing
+             suspicion_points += 1
+         if row["excessive_punctuation_count"] > 3:  # Overuse of punctuation
+             suspicion_points += 1
+         if abs(row["sentiment_polarity"]) > 0.8:  # Extreme sentiment
+             suspicion_points += 1
+
+         # Review Patterns
+         if row["similarity_to_other_reviews"] > 0.8:  # High duplication
+             suspicion_points += 1
+         if row["user_review_burst_count"] > 5:  # Spammy bursts
+             suspicion_points += 1
+         if row["business_review_burst_count"] > 5:  # Targeted bursts
+             suspicion_points += 1
+         if abs(row["rating_deviation_from_business_average"]) > 2:  # Large rating deviation
+             suspicion_points += 1
+         if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1:  # Extreme like ratio
+             suspicion_points += 1
+
+         # User Behavior
+         if row["user_account_age"] < 30:  # Very new account (days)
+             suspicion_points += 1
+         if row["average_time_between_reviews"] < 24:  # Rapid reviews (hours)
+             suspicion_points += 1
+         if row["user_degree"] < 2:  # Low business interaction
+             suspicion_points += 1
+         if row["time_since_last_review_user"] < 24:  # Recent burst (hours)
+             suspicion_points += 1
+
+         # Threshold: 3 or more points = fake
+         return 1 if suspicion_points >= 3 else 0
+
+     def dropping_unnecessary_columns(self):
+         columns_to_drop = [
+             "review_text", "review_date", "business_name", "address", "city", "state",
+             "postal_code", "categories", "user_name", "yelping_since", "checkin_date",
+             "review_useful", "review_funny", "review_cool",
+             "user_useful", "user_funny", "user_cool", "is_open",
+             "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute",
+             "compliment_list", "compliment_note", "compliment_plain", "compliment_cool",
+             "compliment_funny", "compliment_writer", "compliment_photos",
+         ]
+         self.df.drop(columns=columns_to_drop, inplace=True)
+
+     def run_pipeline(self):
+
+         logger.info("FINALIZING HOURS COLUMN ...")
+         self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
+         self.df["hours"] = self.df["hours"].fillna(0)
+         print(self.df["hours"][:10])
+         print(self.df["hours"].isnull().sum())
+
+         logger.info("FINALIZING ATTRIBUTES COLUMN ...")
+         self.df.drop("attributes", axis=1, inplace=True)
+
+         logger.info("CREATING time_since_last_review_user COLUMN ...")
+         self.calculate_time_since_last_review()
+         print(np.unique(self.df["time_since_last_review_user"]))
+
+         logger.info("CREATING time_since_last_review_business COLUMN ...")
+         self.calculate_time_since_last_review_business()
+         print(np.unique(self.df["time_since_last_review_business"]))
+
+         logger.info("CREATING user_account_age COLUMN ...")
+         self.calculate_user_account_age()
+         print(np.unique(self.df["user_account_age"]))
+
+         logger.info("CREATING average_time_between_reviews COLUMN ...")
+         self.calculate_avg_time_between_reviews()
+         print(np.unique(self.df["average_time_between_reviews"]))
+
+         logger.info("CREATING user_degree COLUMN ...")
+         self.calculate_user_degree()
+         print(np.unique(self.df["user_degree"]))
+
+         logger.info("CREATING business_degree COLUMN ...")
+         self.calculate_business_degree()
+         print(np.unique(self.df["business_degree"]))
+
+         logger.info("CREATING rating_variance_user COLUMN ...")
+         self.calculate_rating_variance_user()
+         print(np.unique(self.df["rating_variance_user"]))
+
+         logger.info("CREATING user_review_burst_count COLUMN ...")
+         self.calculate_user_review_burst_count()
+         print(np.unique(self.df["user_review_burst_count"]))
+
+         logger.info("CREATING business_review_burst_count COLUMN ...")
+         self.calculate_business_review_burst_count()
+         print(np.unique(self.df["business_review_burst_count"]))
+
+         logger.info("CREATING temporal_similarity COLUMN ...")
+         self.calculate_temporal_similarity()
+         print(np.unique(self.df["temporal_similarity"]))
+
+         logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
+         self.calculate_rating_deviation_from_business_average()
+         print(np.unique(self.df["rating_deviation_from_business_average"]))
+
+         logger.info("CREATING review_like_ratio COLUMN ...")
+         self.calculate_review_like_ratio()
+         print(np.unique(self.df["review_like_ratio"]))
+
+         logger.info("CREATING latest_checkin_hours COLUMN ...")
+         self.calculate_latest_checkin_hours()
+         print(np.unique(self.df["latest_checkin_hours"]))
+
+         logger.info("CREATING pronoun_density COLUMN ...")
+         self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
+         print(np.unique(self.df["pronoun_density"]))
+
+         logger.info("CREATING avg_sentence_length COLUMN ...")
+         self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
+         print(np.unique(self.df["avg_sentence_length"]))
+
+         logger.info("CREATING excessive_punctuation_count COLUMN ...")
+         self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
+         print(np.unique(self.df["excessive_punctuation_count"]))
+
+         logger.info("CREATING sentiment_polarity COLUMN ...")
+         self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
+         print(np.unique(self.df["sentiment_polarity"]))
+
+         logger.info("CREATING good_severity and bad_severity COLUMNS ...")
+         severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
+         self.df[["good_severity", "bad_severity"]] = severity_scores
+         print(np.unique(self.df["good_severity"]))
+         print(np.unique(self.df["bad_severity"]))
+
+         logger.info("CREATING code_switching_flag COLUMN ...")
+         self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
+         print(np.unique(self.df["code_switching_flag"]))
+
+         all_texts = self.df["review_text"].tolist()
+         tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
+
+         logger.info("CREATING grammar_error_score COLUMN ...")
+         self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
+         print(np.unique(self.df["grammar_error_score"]))
+
+         logger.info("CREATING repetitive_words_count COLUMN ...")
+         self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
+         print(np.unique(self.df["repetitive_words_count"]))
+
+         logger.info("CREATING similarity_to_other_reviews COLUMN ...")
+         similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
+         self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
+         print(np.unique(self.df["similarity_to_other_reviews"]))
+
+         logger.info("CREATING friends COLUMN ...")
+         self.calculate_friend_count()
+         print(self.df["friends"].value_counts())
+
+         logger.info("CREATING elite COLUMN ...")
+         self.transform_elite_status()
+         print(self.df["elite"].value_counts())
+
+         logger.info("CREATING review_useful_funny_cool COLUMN ...")
+         self.calculate_review_useful_funny_cool()
+         print(self.df["review_useful_funny_cool"].value_counts())
+
+         logger.info("CREATING user_useful_funny_cool COLUMN ...")
+         self.calculate_user_useful_funny_cool()
+         print(self.df["user_useful_funny_cool"].value_counts())
+
+         # logger.info("CREATING LABEL COLUMN ...")
+         # self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
+         # print(self.df["fake"].value_counts())
+
+         logger.info("DELETING THE UNWANTED COLUMNS ...")
+         self.dropping_unnecessary_columns()
+         print()
+
+         logger.info("CHECKING NULL VALUES IN FINAL COLUMNS ...")
+         print(set(self.df.isnull().sum().values))
+
+         return self.df
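A minimal usage sketch (not part of the committed file): assuming the input DataFrame already contains the merged review, user, business, and check-in columns referenced above, the pipeline could be driven roughly as follows. The module name, CSV paths, and the __main__ guard are placeholders for illustration.

import pandas as pd
from preprocessing_test import Preprocessor  # assumes this file is importable as a module

if __name__ == "__main__":
    # Hypothetical input: one row per review, with user/business/check-in fields merged in
    reviews = pd.read_csv("merged_yelp_reviews.csv")
    features = Preprocessor(reviews).run_pipeline()
    features.to_csv("preprocessed_features.csv", index=False)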