OVH committed
Commit: c1e568b
Parent(s): c0cb02a

Added some files

1 file changed: preprocessing_test.py (+866 -0)
preprocessing_test.py
ADDED
@@ -0,0 +1,866 @@
from loguru import logger
import pandas as pd
import json
from datetime import datetime
import ast
import numpy as np
from pymongo import MongoClient
from collections import defaultdict

from tqdm import tqdm
import time

import requests
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import re
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

class Preprocessor:
    def __init__(self, df):
        self.df = df
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base')
        self.stop_words = set(stopwords.words('english'))
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU when available

    def get_bert_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def preprocess_text(self, text):
        return text if pd.notna(text) else ""

    def calculate_duration(self, time_range):
        if not isinstance(time_range, str) or "-" not in time_range:
            return None
        start_str, end_str = time_range.split('-')
        start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
        end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
        try:
            start = datetime.strptime(start_str, '%H:%M')
            end = datetime.strptime(end_str, '%H:%M')
            duration = (end - start).total_seconds() / 3600
            return duration if duration >= 0 else duration + 24
        except ValueError:
            return None

    def calculate_sentiment_severity(self, text):
        if pd.isna(text) or not text.strip():
            return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})

        # Get sentiment polarity (-1 to 1)
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity

        # Define severity weights
        good_weight = 0.7
        bad_weight = 0.3

        if polarity > 0:
            good_severity = good_weight * polarity
            bad_severity = 0.0
        elif polarity < 0:
            good_severity = 0.0
            bad_severity = bad_weight * abs(polarity)
        else:  # Neutral (polarity = 0)
            good_severity = 0.0
            bad_severity = 0.0

        return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})

    def get_avg_duration(self, hours_str):
        if pd.isna(hours_str) or not isinstance(hours_str, str):
            return pd.NA
        try:
            hours_dict = ast.literal_eval(hours_str)
            if not hours_dict:
                return pd.NA
            durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
            valid_durations = [d for d in durations if d is not None]
            return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
        except (ValueError, SyntaxError, ZeroDivisionError):
            return pd.NA

    def calculate_time_since_last_review(self):
        present_date = datetime.now()
        user_latest_timestamp = {}

        # Convert review_date to datetime
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Calculate hours difference for each user's latest review
        for user_id in self.df["user_id"].unique():
            latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()

            if not isinstance(latest_date, datetime):
                latest_date = latest_date.to_pydatetime()

            hours_difference = (present_date - latest_date).total_seconds() / 3600
            user_latest_timestamp[user_id] = hours_difference

        # Map the hours difference to a new column
        self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)

    def calculate_time_since_last_review_business(self):
        present_date = datetime.now()

        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Initialize dictionary to store hours since last review for each business
        business_latest_timestamp = {}

        # Iterate over unique business_ids
        for business_id in self.df["business_id"].unique():
            # Get the latest review date for this business
            latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()

            # Convert to datetime object if needed
            if not isinstance(latest_date, datetime):
                latest_date = latest_date.to_pydatetime()

            # Calculate hours difference (already in hours)
            hours_difference = (present_date - latest_date).total_seconds() / 3600
            business_latest_timestamp[business_id] = hours_difference

        # Map the hours difference to the new column
        self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)

    def calculate_user_account_age(self):
        present_date = datetime.now()

        # Convert yelping_since to datetime
        self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])

        # Calculate user account age in days
        self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days

    def calculate_avg_time_between_reviews(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort the DataFrame by user_id and review_date to ensure chronological order
        self.df = self.df.sort_values(["user_id", "review_date"])

        # Define helper function to calculate average time between reviews
        def calculate_avg_time(group):
            if len(group) == 1:
                return 0  # If only one review, assign 0
            # Calculate differences in hours between consecutive reviews
            diffs = group["review_date"].diff().dt.total_seconds() / 3600
            # Drop the first NaN (from diff) and compute the mean
            return diffs.dropna().mean()

        # Apply the function to each user_id group and create a mapping
        avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)

        # Map the average time back to the original DataFrame
        self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)

    def calculate_user_degree(self):
        # Calculate the number of unique businesses per user
        user_business_counts = self.df.groupby("user_id")["business_id"].nunique()

        # Map the counts back to the original DataFrame
        self.df["user_degree"] = self.df["user_id"].map(user_business_counts)

    def calculate_business_degree(self):
        # Calculate the number of unique users per business
        business_user_counts = self.df.groupby("business_id")["user_id"].nunique()

        # Map the counts back to the original DataFrame
        self.df["business_degree"] = self.df["business_id"].map(business_user_counts)

    def calculate_rating_variance_user(self):
        # Calculate the mode (most frequent rating) per user
        user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])

        # Map the most frequent rating back to the original DataFrame
        self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)

    def calculate_user_review_burst_count(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by user_id and review_date for chronological order
        self.df = self.df.sort_values(["user_id", "review_date"])

        # Function to calculate the max number of reviews in any 20-day window
        def calculate_burst_count(group):
            if len(group) <= 1:
                return 0  # No burst if 1 or fewer reviews

            # Review dates for this user
            dates = group["review_date"]

            # Calculate the number of reviews within 20 days of each review
            burst_counts = []
            for date in dates:
                # Count reviews within 20 days after this date
                window_end = date + pd.Timedelta(days=20)
                count = ((dates >= date) & (dates <= window_end)).sum()
                burst_counts.append(count)

            # Return the maximum burst count for this user
            return max(burst_counts)

        # Calculate the burst count per user
        user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)

        # Map the burst count back to the original DataFrame
        self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)

    def calculate_business_review_burst_count(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by business_id and review_date for chronological order
        self.df = self.df.sort_values(["business_id", "review_date"])

        # Function to calculate the max number of reviews in any 10-day window
        def calculate_burst_count(group):
            if len(group) <= 1:
                return 0  # No burst if 1 or fewer reviews

            # Review dates for this business
            dates = group["review_date"]

            # Calculate the number of reviews within 10 days of each review
            burst_counts = []
            for date in dates:
                # Count reviews within 10 days after this date
                window_end = date + pd.Timedelta(days=10)
                count = ((dates >= date) & (dates <= window_end)).sum()
                burst_counts.append(count)

            # Return the maximum burst count for this business
            return max(burst_counts)

        # Calculate the burst count per business
        business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)

        # Map the burst count back to the original DataFrame
        self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)

    def calculate_temporal_similarity(self):
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Extract the day of the week (0 = Monday, 6 = Sunday)
        self.df["day_of_week"] = self.df["review_date"].dt.dayofweek

        # Function to calculate avg hours between reviews on frequent days
        def calculate_avg_hours_on_frequent_days(group):
            frequent_days = group["day_of_week"].mode().tolist()

            if len(group) <= 1:
                return 0

            frequent_reviews = group[group["day_of_week"].isin(frequent_days)]

            if len(frequent_reviews) <= 1:
                return 0

            frequent_reviews = frequent_reviews.sort_values("review_date")
            diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600

            return diffs.dropna().mean()

        # Calculate average hours for each user
        avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)

        # Map the average hours to the new column
        self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)

        # Drop temporary column
        self.df = self.df.drop(columns=["day_of_week"])

    def calculate_rating_deviation_from_business_average(self):
        # Calculate the average rating per business
        business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()

        # Map the average rating to each row
        self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)

        # Calculate the deviation from the business average
        self.df["rating_deviation_from_business_average"] = (
            self.df["review_stars"] - self.df["business_avg_rating"]
        )

        # Drop the temporary column
        self.df = self.df.drop(columns=["business_avg_rating"])

    def calculate_review_like_ratio(self):
        # Create a binary column for liked reviews (stars >= 4)
        self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)

        # Calculate the like ratio per user
        user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()

        # Map the like ratio back to the DataFrame
        self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)

        # Drop the temporary column
        self.df = self.df.drop(columns=["is_liked"])

    def calculate_latest_checkin_hours(self):
        self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])

        # Function to get the latest check-in date from a list of strings
        def get_latest_checkin(checkin_list):
            if not checkin_list or pd.isna(checkin_list):  # Handle empty or NaN
                return None
            if isinstance(checkin_list, str):
                checkin_dates = checkin_list.split(", ")
            else:
                checkin_dates = checkin_list
            return pd.to_datetime(checkin_dates).max()

        # Apply the function to get the latest check-in date per row
        self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)

        # Calculate the hours difference between latest check-in and yelping_since
        self.df["latest_checkin_hours"] = (
            (self.df["latest_checkin_date"] - self.df["yelping_since"])
            .dt.total_seconds() / 3600
        )

        # Drop the temporary column and fill missing durations with 0
        self.df = self.df.drop(columns=["latest_checkin_date"])
        self.df["latest_checkin_hours"] = self.df["latest_checkin_hours"].fillna(0)

    def compute_pronoun_density(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0
        words = word_tokenize(text.lower())
        pos_tags = nltk.pos_tag(words)
        pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
        return pronouns / len(words) if words else 0

    def compute_avg_sentence_length(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0
        sentences = sent_tokenize(text)
        return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0

    def compute_excessive_punctuation(self, text):
        text = self.preprocess_text(text)
        return len(re.findall(r'[!?.]{2,}', text))

    def compute_sentiment_polarity(self, text):
        text = self.preprocess_text(text)
        return TextBlob(text).sentiment.polarity if text else 0

    def compute_code_switching_flag(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0

        tokens = self.tokenizer.tokenize(text.lower())
        if not tokens:
            return 0

        english_words = self.stop_words  # Use self.stop_words from __init__
        token_set = set(tokens)
        english_count = sum(1 for token in tokens if token in english_words)

        non_english_pattern = re.compile(r'[^\x00-\x7F]')
        has_non_ascii = 1 if non_english_pattern.search(text) else 0

        english_ratio = english_count / len(tokens) if tokens else 0

        non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)

        # Flag as code-switching if:
        # 1. Mixed English presence (ratio between 0.1 and 0.9)
        # 2. Non-ASCII characters present OR some non-English subword tokens
        if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
            return 1
        return 0

    def batch_tokenize(self, texts, batch_size=32, max_length=512):
        tokenized_outputs = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
            batch_texts = texts[i:i + batch_size]
            valid_texts = [self.preprocess_text(t) for t in batch_texts]
            # Tokenize with fixed max_length to ensure consistent tensor sizes
            inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
            tokenized_outputs.append(inputs['input_ids'].to(self.device))  # Move to GPU
        # Concatenate on GPU with consistent sizes
        return torch.cat(tokenized_outputs, dim=0)

    def compute_grammar_error_score(self, texts, tokenized_ids):
        print("Computing grammar error scores...")
        error_scores = np.zeros(len(texts), dtype=float)

        vocab_set = set(self.tokenizer.get_vocab().keys())
        for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
            if input_ids.sum() == 0:  # Empty input
                continue
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
            unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
            total_count = len([t for t in tokens if t not in self.stop_words])
            error_scores[i] = unknown_count / total_count if total_count > 0 else 0

        return error_scores

    def compute_repetitive_words_count(self, texts, tokenized_ids):
        print("Computing repetitive words counts...")
        rep_counts = np.zeros(len(texts), dtype=int)

        for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
            if input_ids.sum() == 0:  # Empty input
                continue
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
            valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
            if valid_tokens:
                token_counts = {}
                for token in valid_tokens:
                    token_counts[token] = token_counts.get(token, 0) + 1
                rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)

        return rep_counts

    def preprocess_text_for_similarity(self, text):
        if pd.isna(text) or not text.strip():
            return []
        return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]

    def batch_encode_words(self, texts, batch_size=32, max_length=512):
        word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
        vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}

        encoded_batches = []
        for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
            batch_words = word_lists[i:i + batch_size]
            encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
            for j, words in enumerate(batch_words):
                if words:
                    word_ids = [vocab.get(w, 0) for w in words][:max_length]
                    encoded[j, :len(word_ids)] = word_ids
            encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
            encoded_batches.append(encoded_tensor)

        return torch.cat(encoded_batches, dim=0), vocab

    def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
        all_texts = self.df["review_text"].tolist()
        all_users = self.df["user_id"].tolist()
        all_review_ids = self.df["review_id"].tolist()

        encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)

        similarity_scores = {rid: 0.0 for rid in all_review_ids}  # Default scores
        for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
            if pd.isna(review_id) or pd.isna(user_id):
                continue

            current_words = encoded_words[i]
            if current_words.sum() == 0:
                continue

            other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
                                         dtype=torch.long).to(self.device)
            if not other_indices.numel():
                continue

            other_words = encoded_words[other_indices]
            current_set = torch.unique(current_words[current_words > 0])
            other_flat = other_words[other_words > 0]

            if other_flat.numel() == 0:
                continue

            other_set = torch.unique(other_flat)
            intersection = torch.sum(torch.isin(current_set, other_set)).float()
            union = torch.unique(torch.cat([current_set, other_set])).numel()
            # Jaccard similarity between this review's words and all other users' words
            similarity = intersection / union if union > 0 else 0.0

            # float() works whether similarity is a tensor or the plain 0.0 fallback
            similarity_scores[review_id] = float(similarity)
        return pd.Series(similarity_scores, index=all_review_ids)

    def calculate_friend_count(self):
        # Replace the raw friends string with a friend count
        friends = []
        for v in self.df["friends"]:
            if isinstance(v, str):
                friends.append(len(v.split(",")))
            else:
                # Numeric or missing values mean there is no parsable friend list
                friends.append(0)
        self.df["friends"] = friends

    def count_elite_years(self, elite):
        if pd.isna(elite):
            return 0
        return len(str(elite).split(","))

    def transform_elite_status(self):
        # Flag users with more than one elite year, then encode as 0/1
        self.df["elite"] = self.df["elite"].apply(lambda x: self.count_elite_years(x) > 1)
        self.df["elite"] = self.df["elite"].astype(int)

    def calculate_review_useful_funny_cool(self):
        self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
        self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
        self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
        self.df["review_useful_funny_cool"] = (
            self.df["review_useful"] +
            self.df["review_funny"] +
            self.df["review_cool"]
        )
        self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)

    def calculate_user_useful_funny_cool(self):
        self.df["user_useful_funny_cool"] = (
            self.df["user_useful"] +
            self.df["user_funny"] +
            self.df["user_cool"]
        )
        self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)

    def compute_fake_score(self, row):
        suspicion_points = 0

        # Linguistic Features
        if row["pronoun_density"] < 0.01:  # Low personal engagement
            suspicion_points += 1
        if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30:  # Extreme lengths
            suspicion_points += 1
        if row["grammar_error_score"] > 5:  # Many errors
            suspicion_points += 1
        if row["repetitive_words_count"] > 5:  # High repetition
            suspicion_points += 1
        if row["code_switching_flag"] == 1:  # Language mixing
            suspicion_points += 1
        if row["excessive_punctuation_count"] > 3:  # Overuse of punctuation
            suspicion_points += 1
        if abs(row["sentiment_polarity"]) > 0.8:  # Extreme sentiment
            suspicion_points += 1

        # Review Patterns
        if row["similarity_to_other_reviews"] > 0.8:  # High duplication
            suspicion_points += 1
        if row["user_review_burst_count"] > 5:  # Spammy bursts
            suspicion_points += 1
        if row["business_review_burst_count"] > 5:  # Targeted bursts
            suspicion_points += 1
        if abs(row["rating_deviation_from_business_average"]) > 2:  # Large rating deviation
            suspicion_points += 1
        if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1:  # Extreme like ratio
            suspicion_points += 1

        # User Behavior
        if row["user_account_age"] < 30:  # Very new account (days)
            suspicion_points += 1
        if row["average_time_between_reviews"] < 24:  # Rapid reviews (hours)
            suspicion_points += 1
        if row["user_degree"] < 2:  # Low business interaction
            suspicion_points += 1
        if row["time_since_last_review_user"] < 24:  # Recent burst (hours)
            suspicion_points += 1

        # Threshold: 3 or more points = fake
        return 1 if suspicion_points >= 3 else 0

    def dropping_unnecessary_columns(self):
        columns_to_drop = [
            "review_text", "review_date", "business_name", "address", "city", "state",
            "postal_code", "categories", "user_name", "yelping_since", "checkin_date",
            "review_useful", "review_funny", "review_cool",
            "user_useful", "user_funny", "user_cool", "is_open",
            "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute",
            "compliment_list", "compliment_note", "compliment_plain", "compliment_cool",
            "compliment_funny", "compliment_writer", "compliment_photos",
        ]
        self.df.drop(columns=columns_to_drop, inplace=True)

    def run_pipeline(self):

        logger.info("FINALIZING HOURS COLUMN ...")
        self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
        self.df["hours"] = self.df["hours"].fillna(0)
        print(self.df["hours"][:10])
        print(self.df["hours"].isnull().sum())

        logger.info("FINALIZING ATTRIBUTES COLUMN ...")
        self.df.drop("attributes", axis=1, inplace=True)

        logger.info("CREATING time_since_last_review_user COLUMN ...")
        self.calculate_time_since_last_review()
        print(np.unique(self.df["time_since_last_review_user"]))

        logger.info("CREATING time_since_last_review_business COLUMN ...")
        self.calculate_time_since_last_review_business()
        print(np.unique(self.df["time_since_last_review_business"]))

        logger.info("CREATING user_account_age COLUMN ...")
        self.calculate_user_account_age()
        print(np.unique(self.df["user_account_age"]))

        logger.info("CREATING average_time_between_reviews COLUMN ...")
        self.calculate_avg_time_between_reviews()
        print(np.unique(self.df["average_time_between_reviews"]))

        logger.info("CREATING user_degree COLUMN ...")
        self.calculate_user_degree()
        print(np.unique(self.df["user_degree"]))

        logger.info("CREATING business_degree COLUMN ...")
        self.calculate_business_degree()
        print(np.unique(self.df["business_degree"]))

        logger.info("CREATING rating_variance_user COLUMN ...")
        self.calculate_rating_variance_user()
        print(np.unique(self.df["rating_variance_user"]))

        logger.info("CREATING user_review_burst_count COLUMN ...")
        self.calculate_user_review_burst_count()
        print(np.unique(self.df["user_review_burst_count"]))

        logger.info("CREATING business_review_burst_count COLUMN ...")
        self.calculate_business_review_burst_count()
        print(np.unique(self.df["business_review_burst_count"]))

        logger.info("CREATING temporal_similarity COLUMN ...")
        self.calculate_temporal_similarity()
        print(np.unique(self.df["temporal_similarity"]))

        logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
        self.calculate_rating_deviation_from_business_average()
        print(np.unique(self.df["rating_deviation_from_business_average"]))

        logger.info("CREATING review_like_ratio COLUMN ...")
        self.calculate_review_like_ratio()
        print(np.unique(self.df["review_like_ratio"]))

        logger.info("CREATING latest_checkin_hours COLUMN ...")
        self.calculate_latest_checkin_hours()
        print(np.unique(self.df["latest_checkin_hours"]))

        logger.info("CREATING pronoun_density COLUMN ...")
        self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
        print(np.unique(self.df["pronoun_density"]))

        logger.info("CREATING avg_sentence_length COLUMN ...")
        self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
        print(np.unique(self.df["avg_sentence_length"]))

        logger.info("CREATING excessive_punctuation_count COLUMN ...")
        self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
        print(np.unique(self.df["excessive_punctuation_count"]))

        logger.info("CREATING sentiment_polarity COLUMN ...")
        self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
        print(np.unique(self.df["sentiment_polarity"]))

        logger.info("CREATING good_severity and bad_severity COLUMNS ...")
        severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
        self.df[["good_severity", "bad_severity"]] = severity_scores
        print(np.unique(self.df["good_severity"]))
        print(np.unique(self.df["bad_severity"]))

        logger.info("CREATING code_switching_flag COLUMN ...")
        self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
        print(np.unique(self.df["code_switching_flag"]))

        all_texts = self.df["review_text"].tolist()
        tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)

        logger.info("CREATING grammar_error_score COLUMN ...")
        self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
        print(np.unique(self.df["grammar_error_score"]))

        logger.info("CREATING repetitive_words_count COLUMN ...")
        self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
        print(np.unique(self.df["repetitive_words_count"]))

        logger.info("CREATING similarity_to_other_reviews COLUMN ...")
        similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
        self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
        print(np.unique(self.df["similarity_to_other_reviews"]))

        logger.info("CREATING friends COLUMN ...")
        self.calculate_friend_count()
        print(self.df["friends"].value_counts())

        logger.info("CREATING elite COLUMN ...")
        self.transform_elite_status()
        print(self.df["elite"].value_counts())

        logger.info("CREATING review_useful_funny_cool COLUMN ...")
        self.calculate_review_useful_funny_cool()
        print(self.df["review_useful_funny_cool"].value_counts())

        logger.info("CREATING user_useful_funny_cool COLUMN ...")
        self.calculate_user_useful_funny_cool()
        print(self.df["user_useful_funny_cool"].value_counts())

        # logger.info("CREATING LABEL COLUMN ...")
        # self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
        # print(self.df["fake"].value_counts())

        logger.info("DELETING THE UNWANTED COLUMNS ...")
        self.dropping_unnecessary_columns()

        logger.info("CHECKING NULL VALUES IN FINAL COLUMNS ...")
        print(set(self.df.isnull().sum().values))

        return self.df
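Usage note (not part of the committed file): a minimal sketch of how this Preprocessor might be driven end to end, assuming a joined reviews/users/businesses table with the columns the pipeline expects (review_text, review_date, user_id, business_id, and so on). The file names below are hypothetical placeholders.

import pandas as pd
from preprocessing_test import Preprocessor

# Hypothetical input file; any DataFrame with the expected columns works the same way.
df = pd.read_csv("joined_reviews_sample.csv")

# Run the full feature-engineering pipeline and persist the result.
preprocessor = Preprocessor(df)
features_df = preprocessor.run_pipeline()
features_df.to_csv("preprocessed_features.csv", index=False)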