OVH committed
Commit 67b1c6c · 1 Parent(s): 12ab6cd

Added all the files
- .ipynb_checkpoints/Dockerfile-checkpoint +0 -0
- .ipynb_checkpoints/app-checkpoint.py +281 -0
- .ipynb_checkpoints/main-checkpoint.py +132 -0
- Dockerfile +0 -0
- app.py +281 -0
- main.py +132 -0
- requirements.txt +19 -0
- src/.ipynb_checkpoints/analyze_yelp_data-checkpoint.py +320 -0
- src/.ipynb_checkpoints/clean_data-checkpoint.py +77 -0
- src/.ipynb_checkpoints/create_dataset-checkpoint.py +217 -0
- src/.ipynb_checkpoints/feature_analyzer-checkpoint.py +212 -0
- src/.ipynb_checkpoints/model-checkpoint.py +541 -0
- src/.ipynb_checkpoints/model_trainer-checkpoint.py +35 -0
- src/.ipynb_checkpoints/preprocessing-checkpoint.py +831 -0
- src/__pycache__/analyze_yelp_data.cpython-311.pyc +0 -0
- src/__pycache__/clean_data.cpython-311.pyc +0 -0
- src/__pycache__/clean_data.cpython-39.pyc +0 -0
- src/__pycache__/create_dataset.cpython-311.pyc +0 -0
- src/__pycache__/create_dataset.cpython-39.pyc +0 -0
- src/__pycache__/data_balancing.cpython-311.pyc +0 -0
- src/__pycache__/feature_analyzer.cpython-311.pyc +0 -0
- src/__pycache__/feature_analyzer.cpython-39.pyc +0 -0
- src/__pycache__/feature_importance.cpython-311.pyc +0 -0
- src/__pycache__/model.cpython-311.pyc +0 -0
- src/__pycache__/model.cpython-39.pyc +0 -0
- src/__pycache__/model1.cpython-311.pyc +0 -0
- src/__pycache__/model1.cpython-39.pyc +0 -0
- src/__pycache__/model3.cpython-311.pyc +0 -0
- src/__pycache__/model3.cpython-39.pyc +0 -0
- src/__pycache__/model_trainer.cpython-311.pyc +0 -0
- src/__pycache__/model_trainer.cpython-39.pyc +0 -0
- src/__pycache__/models.cpython-311.pyc +0 -0
- src/__pycache__/preprocessing.cpython-311.pyc +0 -0
- src/__pycache__/preprocessing.cpython-39.pyc +0 -0
- src/analyze_yelp_data.py +320 -0
- src/clean_data.py +83 -0
- src/create_dataset.py +217 -0
- src/feature_analyzer.py +212 -0
- src/model.py +540 -0
- src/model_trainer.py +35 -0
- src/preprocessing.py +832 -0
.ipynb_checkpoints/Dockerfile-checkpoint
ADDED
File without changes
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,281 @@
.ipynb_checkpoints/main-checkpoint.py
ADDED
@@ -0,0 +1,132 @@
Dockerfile
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,281 @@
from flask import Flask, request, jsonify
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from pathlib import Path
from datetime import datetime
from loguru import logger
from huggingface_hub import hf_hub_download
import json
from preprocessing_test import Preprocessor
from src.model import *
from main import start_pipelines

app = Flask(__name__)

# Define default values for each column
default_values = {
    'review_id': 'KU_O5udG6zpxOg-VcAEodg',
    'user_id': 'mh_-eMZ6K5RLWhZyISBhwA',
    'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw',
    'review_stars': 0,
    'review_useful': 0,
    'review_funny': 0,
    'review_cool': 0,
    'review_text': 'It was a moderate experience',
    'review_date': 1531001351000,
    'business_name': 'Coffe at LA',
    'address': '1460 LA',
    'city': 'LA',
    'state': 'CA',
    'postal_code': '00000',
    'latitude': 0.0,
    'longitude': 0.0,
    'business_stars': 0.0,
    'business_review_count': 0,
    'is_open': 0,
    'attributes': '{}',
    'categories': 'Restaurants',
    'hours': '{"Monday": "7:0-20:0", "Tuesday": "7:0-20:0", "Wednesday": "7:0-20:0", "Thursday": "7:0-20:0", "Friday": "7:0-21:0", "Saturday": "7:0-21:0", "Sunday": "7:0-21:0"}',
    'user_name': 'default_user',
    'user_review_count': 0,
    'yelping_since': '2023-01-01 00:00:00',
    'user_useful': 0,
    'user_funny': 0,
    'user_cool': 0,
    'elite': '2024,2025',
    'friends': '',
    'fans': 0,
    'average_stars': 0.0,
    'compliment_hot': 0,
    'compliment_more': 0,
    'compliment_profile': 0,
    'compliment_cute': 0,
    'compliment_list': 0,
    'compliment_note': 0,
    'compliment_plain': 0,
    'compliment_cool': 0,
    'compliment_funny': 0,
    'compliment_writer': 0,
    'compliment_photos': 0,
    'checkin_date': '2023-01-01 00:00:00',
    'tip_compliment_count': 0.0,
    'tip_count': 0.0
}

# Expected types for validation
expected_types = {
    'review_id': str,
    'user_id': str,
    'business_id': str,
    'review_stars': int,
    'review_useful': int,
    'review_funny': int,
    'review_cool': int,
    'review_text': str,
    'review_date': int,
    'business_name': str,
    'address': str,
    'city': str,
    'state': str,
    'postal_code': str,
    'latitude': float,
    'longitude': float,
    'business_stars': float,
    'business_review_count': int,
    'is_open': int,
    'attributes': dict,  # Assuming string representation of dict
    'categories': str,
    'hours': dict,  # Assuming string representation of dict
    'user_name': str,
    'user_review_count': int,
    'yelping_since': str,
    'user_useful': int,
    'user_funny': int,
    'user_cool': int,
    'elite': str,
    'friends': str,
    'fans': int,
    'average_stars': float,
    'compliment_hot': int,
    'compliment_more': int,
    'compliment_profile': int,
    'compliment_cute': int,
    'compliment_list': int,
    'compliment_note': int,
    'compliment_plain': int,
    'compliment_cool': int,
    'compliment_funny': int,
    'compliment_writer': int,
    'compliment_photos': int,
    'checkin_date': str,
    'tip_compliment_count': float,
    'tip_count': float
}

@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Check if request contains JSON data
        if not request.json:
            return jsonify({'error': 'Request must contain JSON data'}), 400

        data = request.json

        # Extract train, test, and test_size with defaults
        train = data.get('train', False)
        test = data.get('test', False)
        test_size = float(data.get('test_size', 0.1))

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Handle training mode
        if train in (True, 'true', 'True'):
            start_pipelines(test_size=test_size)
            logger.info("PIPELINES FINISHED SUCCESSFULLY")
            return jsonify({
                'message': 'Training pipelines executed successfully',
                'test_size': test_size
            }), 200

        # Handle testing/inference mode
        elif test in (True, 'test', 'True'):
            REPO_ID = "Askhedi/graphformermodel"
            MODEL_FILENAME = "model_GraphformerModel_latest.pth"
            model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)

            # Load model
            model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(device)
            model.load_state_dict(torch.load(model_path, map_location=device))
            model.eval()

            # Process input data from JSON
            row = {}
            warnings = []
            for col, expected_type in expected_types.items():
                value = data.get(col, default_values[col])
                try:
                    if value == "" or value is None:
                        row[col] = default_values[col]
                    elif col in ['attributes', 'hours']:
                        # Expect a valid JSON string that parses to a dict
                        if isinstance(value, str):
                            parsed = json.loads(value)
                            if not isinstance(parsed, dict):
                                raise ValueError
                            row[col] = value  # Keep as string for Preprocessor
                        else:
                            raise ValueError
                    else:
                        row[col] = expected_type(value)
                except (ValueError, TypeError, json.JSONDecodeError):
                    row[col] = default_values[col]
                    warnings.append(f"Invalid input for '{col}' (expected {expected_type.__name__}), using default value: {default_values[col]}")

            # Convert dictionaries to strings before passing to DataFrame
            for col in ['attributes', 'hours']:
                if isinstance(row[col], dict):
                    row[col] = json.dumps(row[col])

            # Create DataFrame from input
            input_df = pd.DataFrame([row])

            # Preprocess using Preprocessor
            preprocessor = Preprocessor(input_df)
            processed_df = preprocessor.run_pipeline()
            logger.info(f"PREPROCESSING COMPLETED VALUES ARE {processed_df}")

            # Build standalone graph from processed data
            num_users = 1
            num_businesses = 1
            num_rows = 1

            graph = HeteroData()
            features = torch.tensor(processed_df.drop(columns=['user_id', 'review_id', 'business_id']).values, dtype=torch.float, device=device)
            time_since_user = torch.tensor(processed_df['time_since_last_review_user'].values, dtype=torch.float, device=device)
            time_since_business = torch.tensor(processed_df['time_since_last_review_business'].values, dtype=torch.float, device=device)

            user_indices = torch.tensor([0], dtype=torch.long, device=device)
            business_indices = torch.tensor([0], dtype=torch.long, device=device)
            review_indices = torch.tensor([0], dtype=torch.long, device=device)

            user_feats = torch.zeros(num_users, 14, device=device)
            business_feats = torch.zeros(num_businesses, 8, device=device)
            review_feats = torch.zeros(num_rows, 16, device=device)

            user_feats[0] = features[0, :14]
            business_feats[0] = features[0, 14:22]
            review_feats[0] = features[0, 22:38]

            graph['user'].x = user_feats
            graph['business'].x = business_feats
            graph['review'].x = review_feats

            graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
            graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)

            # Compute encodings
            G = nx.DiGraph()
            node_type_map = {0: 'user', 1: 'business', 2: 'review'}
            G.add_nodes_from([0, 1, 2])
            G.add_edge(0, 2)  # user -> review
            G.add_edge(2, 1)  # review -> business

            num_nodes = 3
            spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=device)
            for i in range(num_nodes):
                for j in range(num_nodes):
                    if i == j:
                        spatial_encoding[i, j] = 0
                    elif nx.has_path(G, i, j):
                        spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)

            centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=device).view(-1, 1)

            edge_features_dict = {}
            user_writes_edge = graph['user', 'writes', 'review'].edge_index
            review_about_edge = graph['review', 'about', 'business'].edge_index

            edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
                time_since_user[user_writes_edge[0]], time_since_user[user_writes_edge[1]],
                user_indices[user_writes_edge[0]], user_indices[user_writes_edge[0]]
            )
            edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
                time_since_business[review_about_edge[0]], time_since_business[review_about_edge[1]],
                torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
            )

            time_since_dict = {
                'user': torch.tensor([time_since_user[0]], dtype=torch.float, device=device)
            }

            # Inference
            with torch.no_grad():
                out = model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
                pred_label = 1 if out.squeeze().item() > 0.5 else 0
                prob = out.squeeze().item()

            # Combine warnings and result
            result = {
                'warnings': warnings,
                'prediction': 'Fake' if pred_label == 1 else 'Not Fake',
                'probability': float(prob)
            }
            return jsonify(result), 200

        else:
            return jsonify({
                'error': 'Either "train" or "test" must be set to true'
            }), 400

    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
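
A minimal client call against the /predict route above could look like the sketch below. It assumes the server is running locally on port 5000 (as in the __main__ block) and uses the requests package, which is not listed in requirements.txt; any field left out of the payload falls back to default_values on the server.

# Hypothetical client sketch for the inference branch of /predict.
import requests

payload = {
    "test": True,  # select the inference branch
    "review_text": "Great coffee, friendly staff, would come again!",
    "review_stars": 5,
    "user_id": "mh_-eMZ6K5RLWhZyISBhwA",
    "business_id": "XQfwVwDr-v0ZS3_CbbE5Xw",
}

resp = requests.post("http://localhost:5000/predict", json=payload)
print(resp.status_code, resp.json())  # {'warnings': [...], 'prediction': 'Fake'/'Not Fake', 'probability': ...}
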
main.py
ADDED
@@ -0,0 +1,132 @@
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import sys
from datetime import datetime
import warnings
import gc
import json

from loguru import logger
from src.create_dataset import process_datasets
from src.preprocessing import Preprocessor
from src.clean_data import DataCleaner
from src.feature_analyzer import FeatureAnalyzer
from src.model_trainer import ModelTrainer
from pathlib import Path


def create_directories():
    """Create all necessary directories for the pipeline"""
    directories = {
        'combined_data': Path('output_files/combined_data'),
        'preprocessed': Path('output_files/cleaned_preprocessed_data'),
        'feature_analyzer': Path('output_files/feature_analysis'),
        'model_outputs': Path('output_files/model_outputs'),
    }

    for dir_path in directories.values():
        dir_path.mkdir(parents=True, exist_ok=True)

    return directories

def handle_memory():
    """Handle memory management"""
    gc.collect()
    warnings.filterwarnings('ignore')

def save_pipeline_metrics(metrics: dict, filepath: Path):
    """Save pipeline metrics to JSON file"""
    with open(filepath, 'w') as f:
        json.dump(metrics, f, indent=4, default=str)

def start_pipelines(train_size=0.25):
    # Setup logging
    logger.info("STARTING YELP DATA ANALYSIS PIPELINES...")
    dirs = create_directories()
    logger.info("Created necessary directories")

    logger.info("Pipeline 1: Creating initial dataset...")
    try:
        filename = "combined_merged_full.csv"
        df = process_datasets(output_path=dirs['combined_data'], filename=filename)
        logger.info(f"Dataset created successfully with shape: {df.shape}")
    except Exception as e:
        logger.error(f"Error in dataset creation: {str(e)}")

    try:
        logger.info("Pipeline 2: Preprocessing and Feature Engineering....")
        output_before_preprocess = Path(str(dirs['combined_data'])) / "combined_merged_full.csv"
        df = pd.read_csv(output_before_preprocess)
        prep = Preprocessor(df)
        feature_engineered_df = prep.run_pipeline()
    except Exception as e:
        logger.error(f"Error in Pipeline 2 Preprocessing and Feature Engineering as : {e}")

    try:
        logger.info("Pipeline 3: Cleaning data...")
        filename = "preprocessed_cleaned.csv"
        cleaner = DataCleaner(df=feature_engineered_df, output_path=str(dirs['preprocessed']), filename=filename)
        cleaner.run_pipeline()
        clean_output_file_path = Path(str(dirs['preprocessed'])) / filename
        print("Preprocessed and cleaned data saved in ", clean_output_file_path)
    except Exception as e:
        logger.error(f"Error in Pipeline 3 Cleaning Data : {str(e)}")

    try:
        logger.info("Pipeline 4: Analyzing features...")
        filename = "preprocessed_cleaned.csv"
        preprocessed_clean_output_file = Path(str(dirs['preprocessed'])) / filename
        preprocessed_clean_df = pd.read_csv(preprocessed_clean_output_file)
        analyzer = FeatureAnalyzer(df=preprocessed_clean_df, output_path=str(dirs['feature_analyzer']))
        analyzer.run_pipeline()
    except Exception as e:
        logger.error(f"Error in Feature analysis: {str(e)}")
        raise

    try:
        logger.info("Pipeline 5 : Training and Evaluating Models...")
        filename = "preprocessed_cleaned.csv"
        preprocessed_clean_output_file = Path(str(dirs['preprocessed'])) / filename
        preprocessed_clean_df = pd.read_csv(preprocessed_clean_output_file)
        preprocessed_clean_df = preprocessed_clean_df.sample(frac=1, random_state=42).reset_index(drop=True)
        size = int(train_size * len(preprocessed_clean_df))
        preprocessed_clean_df = preprocessed_clean_df.iloc[:size, :]

        trainer = ModelTrainer(df=preprocessed_clean_df, output_path=str(dirs['model_outputs']), epochs=50, test_size=0.3)
        trainer.train_and_evaluate()
        logger.info("Models training completed")
    except Exception as e:
        logger.error(f"Error in Model Trainer: {str(e)}")
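
For reference, a minimal driver for the pipeline above might be the sketch below; it assumes the MongoDB-backed source collections used by src.create_dataset are reachable and writes its artifacts under output_files/.

# Run all five pipelines end to end on 25% of the cleaned data.
from main import start_pipelines

if __name__ == "__main__":
    start_pipelines(train_size=0.25)
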
requirements.txt
ADDED
@@ -0,0 +1,19 @@
ujson
imblearn
scikit-learn==1.5.2
loguru
astropy
textblob
nltk
transformers
pandas
numpy
tqdm
pymongo
scikit-learn
torch
pathlib
torch-geometric
huggingface-hub
matplotlib
seaborn
src/.ipynb_checkpoints/analyze_yelp_data-checkpoint.py
ADDED
@@ -0,0 +1,320 @@
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import warnings
from typing import Dict, List, Tuple
import logging
from collections import Counter
from detoxify import Detoxify
import re
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import json

class AdvancedYelpAnalyzer:
    def __init__(self, df: pd.DataFrame):
        """Initialize the analyzer with necessary models and configurations"""
        self.df = df.copy()
        self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
        self.vader = SentimentIntensityAnalyzer()
        self.toxic_model = Detoxify('original')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.bert_model.to(self.device)

        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get_bert_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Generate BERT embeddings for text"""
        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            encoded = self.bert_tokenizer(batch_texts,
                                          padding=True,
                                          truncation=True,
                                          max_length=512,
                                          return_tensors='pt')

            with torch.no_grad():
                encoded = {k: v.to(self.device) for k, v in encoded.items()}
                outputs = self.bert_model(**encoded)
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(batch_embeddings)

        return np.vstack(embeddings)

    def analyze_sentiment(self) -> pd.DataFrame:
        """Perform comprehensive sentiment analysis using multiple tools"""
        self.logger.info("Starting sentiment analysis...")

        # Calculate BERT embeddings for reviews
        self.logger.info("Calculating BERT embeddings...")
        review_texts = self.df['review_text'].fillna('').tolist()
        bert_embeddings = self.get_bert_embeddings(review_texts)

        # Calculate review length using BERT tokenizer
        self.logger.info("Calculating tokenized lengths...")
        self.df['review_length'] = self.df['review_text'].apply(
            lambda x: len(self.bert_tokenizer.encode(str(x)))
        )

        # Store BERT embeddings mean and std as features
        self.df['bert_embedding_mean'] = np.mean(bert_embeddings, axis=1)
        self.df['bert_embedding_std'] = np.std(bert_embeddings, axis=1)

        # TextBlob sentiment and subjectivity
        self.df['textblob_polarity'] = self.df['review_text'].apply(
            lambda x: TextBlob(str(x)).sentiment.polarity
        )
        self.df['textblob_subjectivity'] = self.df['review_text'].apply(
            lambda x: TextBlob(str(x)).sentiment.subjectivity
        )

        # VADER sentiment with custom negative phrase handling
        def get_enhanced_vader_scores(text):
            # Custom negative phrases
            negative_phrases = [
                'too long', 'way too long', 'waiting', 'changed our minds',
                'too many', 'took forever', 'took too long', 'waste of time',
                'not worth', 'disappointing', 'mediocre', 'needs improvement'
            ]

            # Get base VADER scores
            base_scores = self.vader.polarity_scores(str(text))

            # Check for negative phrases
            text_lower = str(text).lower()
            neg_count = sum(1 for phrase in negative_phrases if phrase in text_lower)

            # Adjust scores if negative phrases are found
            if neg_count > 0:
                base_scores['neg'] = max(base_scores['neg'], min(0.7, neg_count * 0.2))
                base_scores['compound'] *= (1 - (neg_count * 0.15))
                # Readjust neutral score
                base_scores['neu'] = max(0, 1 - base_scores['neg'] - base_scores['pos'])

            return base_scores

        # Apply enhanced VADER scoring
        vader_scores = self.df['review_text'].apply(get_enhanced_vader_scores)
        self.df['vader_compound'] = vader_scores.apply(lambda x: x['compound'])
        self.df['vader_negative'] = vader_scores.apply(lambda x: x['neg'])
        self.df['vader_positive'] = vader_scores.apply(lambda x: x['pos'])
        self.df['vader_neutral'] = vader_scores.apply(lambda x: x['neu'])

        # Calculate sentiment extremity
        self.df['sentiment_extremity'] = self.df['vader_compound'].abs()

        return self.df

    def detect_anomalies(self) -> pd.DataFrame:
        """Detect anomalous reviews using Isolation Forest with BERT features"""
        self.logger.info("Detecting anomalies...")

        # Prepare features for anomaly detection
        features = [
            'review_stars',
            'textblob_polarity',
            'vader_compound',
            'sentiment_extremity',
            'review_length',
            'bert_embedding_mean',
            'bert_embedding_std'
        ]

        # Ensure all features exist
        missing_features = [f for f in features if f not in self.df.columns]
        if missing_features:
            self.analyze_sentiment()

        # Standardize features
        scaler = StandardScaler()
        X = scaler.fit_transform(self.df[features])

        # Apply Isolation Forest
        iso_forest = IsolationForest(
            contamination=0.1,
            random_state=42,
            n_jobs=-1
        )

        # Fit and predict
        self.df['is_anomaly'] = iso_forest.fit_predict(X)
        self.df['anomaly_score'] = iso_forest.score_samples(X)

        return self.df

    def detect_ai_generated_text(self) -> pd.DataFrame:
        """Estimate likelihood of AI-generated content"""
        self.logger.info("Detecting AI-generated content...")

        # Ensure sentiment analysis has been run
        if 'textblob_subjectivity' not in self.df.columns:
            self.analyze_sentiment()

        # Use detoxify model to get toxicity scores
        texts = self.df['review_text'].fillna('').tolist()
        toxic_scores = self.toxic_model.predict(texts)

        # Add scores to DataFrame
        toxic_score_types = ['toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
                             'insult', 'threat', 'sexual_explicit']
        for score_type in toxic_score_types:
            if score_type in toxic_scores:
                self.df[f'toxic_{score_type}'] = toxic_scores[score_type]

        # Calculate AI generation likelihood based on various factors
        self.df['ai_generated_likelihood'] = (
            (self.df['textblob_subjectivity'] < 0.3) &  # Low subjectivity
            (self.df['sentiment_extremity'] > 0.8) &  # Extreme sentiment
            (self.df['review_length'] > self.df['review_length'].quantile(0.95)) &  # Unusually long
            (self.df['bert_embedding_std'] < self.df['bert_embedding_std'].quantile(0.25))  # Unusual language patterns
        ).astype(int)

        # Add additional AI detection features
        self.df['ai_detection_score'] = (
            (self.df['textblob_subjectivity'] * -1) +  # Lower subjectivity increases score
            (self.df['sentiment_extremity'] * 0.5) +  # Extreme sentiment contributes somewhat
            (self.df['bert_embedding_std'] * -0.5)  # Lower variation in embeddings increases score
        ).clip(0, 1)  # Normalize between 0 and 1

        return self.df

    def analyze_business_categories(self) -> Dict:
        """Analyze trends and patterns specific to business categories"""
        self.logger.info("Analyzing business categories...")

        # Extract categories
        categories = self.df['categories'].fillna('').str.split(', ')
        all_categories = [cat for cats in categories if isinstance(cats, list) for cat in cats]
        category_counts = Counter(all_categories)

        # Analyze reviews by category
        category_analysis = {}
        for category in set(all_categories):
            category_reviews = self.df[self.df['categories'].str.contains(category, na=False)]

            category_analysis[category] = {
                'review_count': len(category_reviews),
                'avg_rating': category_reviews['review_stars'].mean() if not category_reviews.empty else None,
                'avg_sentiment': category_reviews['vader_compound'].mean() if 'vader_compound' in self.df.columns and not category_reviews.empty else None,
                'avg_subjectivity': category_reviews['textblob_subjectivity'].mean() if 'textblob_subjectivity' in self.df.columns and not category_reviews.empty else None
            }

        return category_analysis

    def visualize_results(self, output_dir: str):
        """Create visualizations for analysis results"""
        plt.figure(figsize=(15, 10))

        # Sentiment Distribution
        plt.subplot(2, 2, 1)
        sns.histplot(data=self.df, x='vader_compound', bins=50)
        plt.title('Sentiment Distribution')

        # Review Volume Over Time
        plt.subplot(2, 2, 2)
        daily_reviews = self.df.groupby('review_date').size()
        daily_reviews.plot()
        plt.title('Review Volume Over Time')

        # Anomaly Score Distribution
        plt.subplot(2, 2, 3)
        if 'anomaly_score' not in self.df.columns:
            self.detect_anomalies()
        sns.histplot(data=self.df, x='anomaly_score', bins=50)
        plt.title('Anomaly Score Distribution')

        # AI Generation Likelihood
        plt.subplot(2, 2, 4)
        if 'ai_generated_likelihood' not in self.df.columns:
            self.detect_ai_generated_text()
        sns.histplot(data=self.df, x='ai_generated_likelihood', bins=2)
        plt.title('AI Generation Likelihood')

        plt.tight_layout()
        plt.savefig(f'{output_dir}/analysis_results.png')
        plt.close()

    def run_full_analysis(self, output_dir: str) -> Tuple[pd.DataFrame, Dict]:
        """Run complete analysis pipeline with detailed outputs"""
        self.logger.info("Starting full analysis pipeline...")

        # Create output directory if it doesn't exist
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        try:
            # Run all analyses
            self.analyze_sentiment()
            self.detect_anomalies()
            self.detect_ai_generated_text()
            category_analysis = self.analyze_business_categories()

            # Create visualizations
            self.visualize_results(str(output_dir))

            # Compile results
            analysis_results = {
                'category_analysis': category_analysis,
                'sentiment_summary': {
                    'avg_sentiment': self.df['vader_compound'].mean(),
                    'positive_reviews': len(self.df[self.df['vader_compound'] > 0.5]),
                    'negative_reviews': len(self.df[self.df['vader_compound'] < -0.5]),
                    'neutral_reviews': len(self.df[abs(self.df['vader_compound']) <= 0.5])
                },
                'ai_detection_summary': {
                    'likely_ai_generated': len(self.df[self.df['ai_generated_likelihood'] == 1]),
                    'avg_ai_score': self.df['ai_detection_score'].mean()
                },
                'anomaly_summary': {
                    'anomalous_reviews': len(self.df[self.df['is_anomaly'] == -1]),
                    'avg_anomaly_score': self.df['anomaly_score'].mean()
                }
            }

            # Save results
            self.df.to_csv(output_dir / "analyzed_data.csv", index=False)
            with open(output_dir / "analysis_results.json", 'w') as f:
                json.dump(analysis_results, f, indent=4)

            return self.df, analysis_results

        except Exception as e:
            self.logger.error(f"Error during analysis: {str(e)}")
            raise

# For testing
if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    try:
        # Read test data
        df = pd.read_csv("test_data.csv")

        # Initialize analyzer
        analyzer = AdvancedYelpAnalyzer(df)

        # Run analysis
        output_dir = "output"
        analyzed_df, results = analyzer.run_full_analysis(output_dir)

        logger.info("Analysis completed successfully!")

    except Exception as e:
        logger.error(f"Error during testing: {str(e)}")
        raise
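
The enhanced VADER scoring in analyze_sentiment boosts the negative score by 0.2 per matched phrase (capped at 0.7), damps the compound score by 15% per match, and then renormalizes the neutral score. A standalone sketch of the same adjustment, outside the class and with a shortened phrase list, might look like this:

# Standalone sketch of the phrase-based VADER adjustment used above.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

_vader = SentimentIntensityAnalyzer()
NEGATIVE_PHRASES = ['too long', 'took forever', 'not worth', 'disappointing']

def enhanced_vader(text: str) -> dict:
    scores = _vader.polarity_scores(str(text))
    hits = sum(1 for phrase in NEGATIVE_PHRASES if phrase in str(text).lower())
    if hits > 0:
        scores['neg'] = max(scores['neg'], min(0.7, hits * 0.2))   # boost negative
        scores['compound'] *= (1 - hits * 0.15)                    # damp compound
        scores['neu'] = max(0, 1 - scores['neg'] - scores['pos'])  # renormalize neutral
    return scores

print(enhanced_vader("The wait took forever and the food was not worth it."))
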
src/.ipynb_checkpoints/clean_data-checkpoint.py
ADDED
@@ -0,0 +1,77 @@
# clean_yelp_data.py
from loguru import logger
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import json
from pathlib import Path
import logging
from scipy.stats import entropy
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
import os
from pathlib import Path

class DataCleaner:
    def __init__(self, df, output_path, filename="preprocessed_cleaned.csv"):
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)

        output_file = Path(self.output_path) / self.filename
        logger.info(f"Files saved in directory {output_file} as : {self.filename}")
        self.df.to_csv(output_file, index=False)

    def dropping_unncessary_columns(self):
        self.df.drop("review_text", axis=1, inplace=True)
        self.df.drop("review_date", axis=1, inplace=True)
        self.df.drop("business_name", axis=1, inplace=True)
        self.df.drop("city", axis=1, inplace=True)
        self.df.drop("state", axis=1, inplace=True)
        self.df.drop("postal_code", axis=1, inplace=True)
        self.df.drop("categories", axis=1, inplace=True)
        self.df.drop("user_name", axis=1, inplace=True)
        self.df.drop("yelping_since", axis=1, inplace=True)
        self.df.drop("checkin_date", axis=1, inplace=True)
        self.df.drop("review_useful", axis=1, inplace=True)
        self.df.drop("review_funny", axis=1, inplace=True)
        self.df.drop("review_cool", axis=1, inplace=True)
        self.df.drop("user_useful", axis=1, inplace=True)
        self.df.drop("user_funny", axis=1, inplace=True)
        self.df.drop("user_cool", axis=1, inplace=True)
        self.df.drop("is_open", axis=1, inplace=True)
        self.df.drop("compliment_hot", axis=1, inplace=True)
        self.df.drop("compliment_more", axis=1, inplace=True)
        self.df.drop("compliment_profile", axis=1, inplace=True)
        self.df.drop("compliment_cute", axis=1, inplace=True)
        self.df.drop("compliment_list", axis=1, inplace=True)
        self.df.drop("compliment_note", axis=1, inplace=True)
        self.df.drop("compliment_plain", axis=1, inplace=True)
        self.df.drop("compliment_cool", axis=1, inplace=True)
        self.df.drop("compliment_funny", axis=1, inplace=True)
        self.df.drop("compliment_writer", axis=1, inplace=True)
        self.df.drop("compliment_photos", axis=1, inplace=True)

    def run_pipeline(self):
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unncessary_columns()

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()
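
The per-column drop calls in dropping_unncessary_columns can also be expressed as a single pandas drop over a column list. A minimal equivalent sketch follows; the errors="ignore" flag is an addition not present in the original, used here so that already-missing columns do not raise.

# Same column set as DataCleaner.dropping_unncessary_columns, dropped in one call.
import pandas as pd

COLUMNS_TO_DROP = [
    "review_text", "review_date", "business_name", "city", "state", "postal_code",
    "categories", "user_name", "yelping_since", "checkin_date",
    "review_useful", "review_funny", "review_cool",
    "user_useful", "user_funny", "user_cool", "is_open",
    "compliment_hot", "compliment_more", "compliment_profile", "compliment_cute",
    "compliment_list", "compliment_note", "compliment_plain", "compliment_cool",
    "compliment_funny", "compliment_writer", "compliment_photos",
]

def drop_unnecessary_columns(df: pd.DataFrame) -> pd.DataFrame:
    # errors="ignore" is an assumption; it makes reruns on already-cleaned data safe
    return df.drop(columns=COLUMNS_TO_DROP, errors="ignore")
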
src/.ipynb_checkpoints/create_dataset-checkpoint.py
ADDED
@@ -0,0 +1,217 @@
import pandas as pd
import ujson as json
import gc
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp
from pymongo import MongoClient
from collections import defaultdict
from pathlib import Path

# def read_json_parallel(file_path, num_workers=None):
#     """Read JSON file using parallel processing"""
#     if num_workers is None:
#         num_workers = max(1, mp.cpu_count() - 1)
#     print(f"Reading {file_path}...")
#     # Read chunks and concatenate them into a single DataFrame
#     df = pd.read_json(file_path, lines=True, dtype_backend="pyarrow", chunksize=100000)
#     return next(df)


def read_data_mongo(file_path, num_workers=None):
    """Read a Yelp collection from MongoDB and return it as a DataFrame."""
    if num_workers is None:
        num_workers = max(1, mp.cpu_count() - 1)

    print(f"Reading {file_path}...")
    conn_str = "mongodb://Mtalha:[email protected]/"

    client = MongoClient(conn_str)
    databases = client.list_database_names()
    db_client = client["Yelp"]

    # Read the entire collection at once since chunksize isn't needed here
    try:
        collection = db_client[file_path]
        documents = collection.find({}, {"_id": 0})
        data = list(documents)
        final_dict = defaultdict(list)

        for dictt in data:
            for k, v in dictt.items():
                final_dict[k].append(v)
        df = pd.DataFrame(final_dict)

        # df = pd.read_json(file_path, orient='records', dtype_backend="pyarrow")
    except Exception as e:
        # Fallback comments kept from the original JSON-based reader
        # df = pd.read_json(file_path, dtype_backend="pyarrow")
        print("ERROR WHILE READING FILES FROM MONGODB AS : ", e)
        raise  # without this, `df` would be undefined below
    print(f"Finished reading. DataFrame shape: {df.shape}")
    return df

def process_datasets(output_path, filename):
    # File paths
    file_paths = {
        'business': "yelp_academic_dataset_business",
        'checkin': "yelp_academic_dataset_checkin",
        'review': "yelp_academic_dataset_review",
        'tip': "yelp_academic_dataset_tip",
        'user': "yelp_academic_dataset_user",
        'google': "google_review_dataset"
    }

    # Read datasets with progress tracking
    print("Reading datasets...")
    dfs = {}
    for name, path in file_paths.items():
        print(f"Processing {name} dataset...")
        dfs[name] = read_data_mongo(path)
        print(f"Finished reading {name} dataset. Shape: {dfs[name].shape}")

    print("All files read. Starting column renaming...")

    # Rename columns to avoid conflicts
    # Reviews
    dfs['review'] = dfs['review'].rename(columns={
        'date': 'review_date',
        'stars': 'review_stars',
        'text': 'review_text',
        'useful': 'review_useful',
        'funny': 'review_funny',
        'cool': 'review_cool'
    })

    # Tips
    dfs['tip'] = dfs['tip'].rename(columns={
        'date': 'tip_date',
        'text': 'tip_text',
        'compliment_count': 'tip_compliment_count'
    })

    # Checkins
    dfs['checkin'] = dfs['checkin'].rename(columns={
        'date': 'checkin_date'
    })

    # Users
    dfs['user'] = dfs['user'].rename(columns={
        'name': 'user_name',
        'review_count': 'user_review_count',
        'useful': 'user_useful',
        'funny': 'user_funny',
        'cool': 'user_cool'
    })

    # Business
    dfs['business'] = dfs['business'].rename(columns={
        'name': 'business_name',
        'stars': 'business_stars',
        'review_count': 'business_review_count'
    })
    dfs['google'] = dfs['google'].rename(columns={
        'name': 'business_name',
        'stars': 'business_stars',
        'review_count': 'business_review_count'
    })
    df_business_final = dfs['business']
    df_google_final = dfs['google']
    df_review_final = dfs['review']
    df_tip_final = dfs['tip']
    df_checkin_final = dfs['checkin']
    df_user_final = dfs['user']

    df_business_final = pd.concat([df_business_final, df_google_final], axis=0)
    df_business_final.reset_index(drop=True, inplace=True)

    print("Starting merge process...")

    # Merge process with memory management
    print("Step 1: Starting with reviews...")
    merged_df = df_review_final

    print("Step 2: Merging with business data...")
    merged_df = merged_df.merge(
        df_business_final,
        on='business_id',
        how='left'
    )

    print("Step 3: Merging with user data...")
    merged_df = merged_df.merge(
        df_user_final,
        on='user_id',
        how='left'
    )

    print("Step 4: Merging with checkin data...")
    merged_df = merged_df.merge(
        df_checkin_final,
        on='business_id',
        how='left'
    )

    print("Step 5: Aggregating and merging tip data...")
    tip_agg = df_tip_final.groupby('business_id').agg({
        'tip_compliment_count': 'sum',
        'tip_text': 'count'
    }).rename(columns={'tip_text': 'tip_count'})

    merged_df = merged_df.merge(
        tip_agg,
        on='business_id',
        how='left'
    )

    print("Filling NaN values...")
    merged_df['tip_count'] = merged_df['tip_count'].fillna(0)
    merged_df['tip_compliment_count'] = merged_df['tip_compliment_count'].fillna(0)
    merged_df['checkin_date'] = merged_df['checkin_date'].fillna('')
    merged_df["friends"].fillna(0, inplace=True)

    for col in merged_df.columns:
        if merged_df[col].isnull().sum() > 0:
            print(f" {col} has {merged_df[col].isnull().sum()} null values")

    print("Shape of Merged Dataset is : ", merged_df.shape)
    output_file = Path(output_path) / filename
    print("COLUMNS BEFORE PREPROCESSING")
    print()
    print(merged_df.info())
    for col in merged_df.columns:
        for v in merged_df[col]:
            print(f"Type of values in {col} is {type(v)} and values are like : {v}")
            break
    merged_df.to_csv(output_file, index=False)

    return merged_df

# if __name__ == "__main__":
#     process_datasets()
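A possible invocation of the merge step above is sketched below. It assumes the MongoDB instance referenced in read_data_mongo is reachable; the output directory and file name are illustrative, not values from this repository.

# Hypothetical call; directory and file name are assumptions
merged = process_datasets(output_path="data/merged", filename="yelp_merged.csv")
print(merged.shape)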
src/.ipynb_checkpoints/feature_analyzer-checkpoint.py
ADDED
@@ -0,0 +1,212 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from loguru import logger

class FeatureAnalyzer:
    # Numeric features compared across the fake / genuine classes in several plots
    # (the same list was repeated inside each plotting method in the original)
    KEY_FEATURES = [
        'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
        'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
        'time_since_last_review_user', 'user_account_age', 'pronoun_density',
        'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
        'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
    ]

    def __init__(self, df, output_path):
        self.df = df
        self.output_path = output_path

    def plot_correlation_heatmap(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake')
        correlation_matrix = self.df[numeric_cols].corr()
        plt.figure(figsize=(14, 12))
        sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
        plt.title('Correlation Heatmap of Numeric Features', fontsize=16)
        plt.tight_layout()
        output_file = Path(self.output_path) / 'correlation_heatmap.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved correlation heatmap to {output_file}")

    def plot_mean_by_fake_bar(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        mean_by_fake = self.df.groupby('fake')[self.KEY_FEATURES].mean().T
        mean_by_fake.columns = ['Genuine (0)', 'Fake (1)']
        plt.figure(figsize=(12, 8))
        mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
        plt.title('Mean Feature Values by Fake Label', fontsize=16)
        plt.xlabel('Features', fontsize=12)
        plt.ylabel('Mean Value', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Fake Label')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'mean_by_fake_bar.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved mean by fake bar plot to {output_file}")

    def plot_violin_plots(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(14, 10))
        for i, feature in enumerate(self.KEY_FEATURES[:6], 1):
            plt.subplot(2, 3, i)
            sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
            plt.title(f'{feature} Distribution', fontsize=12)
            plt.xlabel('Fake (0/1)', fontsize=10)
        plt.tight_layout()
        output_file = Path(self.output_path) / 'violin_plots.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved violin plots to {output_file}")

    def plot_box_plots(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(14, 10))
        for i, feature in enumerate(self.KEY_FEATURES[6:11], 1):
            plt.subplot(2, 3, i)
            sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
            plt.title(f'{feature} Distribution', fontsize=12)
            plt.xlabel('Fake (0/1)', fontsize=10)
        plt.tight_layout()
        output_file = Path(self.output_path) / 'box_plots.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved box plots to {output_file}")

    def plot_scatter_review_grammar(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5)
        plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
        plt.xlabel('Review Stars', fontsize=12)
        plt.ylabel('Grammar Error Score', fontsize=12)
        plt.legend(title='Fake')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'scatter_review_grammar.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved scatter plot to {output_file}")

    def plot_density_plots(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(14, 10))
        for i, feature in enumerate(self.KEY_FEATURES[:4], 1):
            plt.subplot(2, 2, i)
            for label in [0, 1]:
                subset = self.df[self.df['fake'] == label]
                sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5)
            plt.title(f'{feature} Density', fontsize=12)
            plt.xlabel(feature, fontsize=10)
            plt.legend()
        plt.tight_layout()
        output_file = Path(self.output_path) / 'density_plots.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved density plots to {output_file}")

    def plot_stacked_bar_similarity(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
        stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0)
        stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0)
        plt.figure(figsize=(12, 8))
        stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8)
        plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
        plt.xlabel('Similarity Bins', fontsize=12)
        plt.ylabel('Proportion', fontsize=12)
        plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'stacked_bar_similarity.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved stacked bar plot to {output_file}")

    def plot_pie_fake_distribution(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        fake_counts = self.df['fake'].value_counts()
        plt.figure(figsize=(8, 8))
        plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90)
        plt.title('Distribution of Fake Labels', fontsize=16)
        plt.axis('equal')
        output_file = Path(self.output_path) / 'pie_fake_distribution.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved pie chart to {output_file}")

    def plot_count_code_switching(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        plt.figure(figsize=(8, 6))
        sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon'])
        plt.title('Count of Fake by Code Switching Flag', fontsize=16)
        plt.xlabel('Code Switching Flag (0/1)', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.legend(title='Fake Label')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'count_code_switching.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved count plot to {output_file}")

    def plot_variance_by_fake_bar(self):
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        variance_by_fake = self.df.groupby('fake')[self.KEY_FEATURES].var().T
        variance_by_fake.columns = ['Genuine (0)', 'Fake (1)']
        plt.figure(figsize=(12, 8))
        variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
        plt.title('Feature Variance by Fake Label', fontsize=16)
        plt.xlabel('Features', fontsize=12)
        plt.ylabel('Variance', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Fake Label')
        plt.tight_layout()
        output_file = Path(self.output_path) / 'variance_by_fake_bar.png'
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        logger.info(f"Saved variance bar plot to {output_file}")

    def run_pipeline(self):
        sns.set(style="whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)
        self.plot_correlation_heatmap()
        self.plot_mean_by_fake_bar()
        self.plot_violin_plots()
        self.plot_box_plots()
        self.plot_scatter_review_grammar()
        self.plot_density_plots()
        self.plot_stacked_bar_similarity()
        self.plot_pie_fake_distribution()
        self.plot_count_code_switching()
        self.plot_variance_by_fake_bar()
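A short driver for the analyzer above might look like this sketch; the CSV path and report directory are assumptions for illustration only.

# Hypothetical usage sketch; paths are assumptions
import pandas as pd

df = pd.read_csv("data/preprocessed_cleaned.csv")
FeatureAnalyzer(df, output_path="reports/eda").run_pipeline()   # writes the ten PNGs listed above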
src/.ipynb_checkpoints/model-checkpoint.py
ADDED
@@ -0,0 +1,541 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from pathlib import Path
from datetime import datetime
from loguru import logger

# Temporal Edge Features Function
def create_temporal_edge_features(time_since_src, time_since_tgt, user_i, user_j):
    delta_t = torch.abs(time_since_src - time_since_tgt).float()
    hour_scale = torch.sin(delta_t / 3600)
    day_scale = torch.sin(delta_t / (24 * 3600))
    week_scale = torch.sin(delta_t / (7 * 24 * 3600))
    same_user = (user_i == user_j).float()
    burst_feature = same_user * torch.exp(-delta_t / (24 * 3600))
    return torch.stack([hour_scale, day_scale, week_scale, burst_feature], dim=-1)

# Custom Multihead Attention (unchanged)
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.scale = self.head_dim ** -0.5

    def forward(self, query, key, value, attn_bias=None):
        batch_size, seq_len, embed_dim = query.size()
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if attn_bias is not None:
            scores = scores + attn_bias.unsqueeze(1)
        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
        out = self.out_proj(out)
        return out, attn

# HeteroGraphormer (unchanged)
class HeteroGraphormer(nn.Module):
    def __init__(self, hidden_dim, output_dim, num_heads=4, edge_dim=4):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embed_dict = nn.ModuleDict({
            'user': nn.Linear(14, hidden_dim),
            'business': nn.Linear(8, hidden_dim),
            'review': nn.Linear(16, hidden_dim)
        })

        self.edge_proj = nn.Linear(edge_dim, hidden_dim)

        self.gru_user = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.gru_business = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.gru_review = nn.GRU(hidden_dim, hidden_dim, batch_first=True)

        self.attention1 = CustomMultiheadAttention(hidden_dim, num_heads)
        self.attention2 = CustomMultiheadAttention(hidden_dim, num_heads)

        self.ffn1 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim * 4, hidden_dim)
        )
        self.ffn2 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim * 4, hidden_dim)
        )

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)
        self.norm4 = nn.LayerNorm(hidden_dim)

        self.centrality_proj = nn.Linear(1, hidden_dim)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 3, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 1)
        )

        self.dropout = nn.Dropout(0.1)

    def time_aware_aggregation(self, x, time_since, decay_rate=0.1):
        weights = torch.exp(-decay_rate * time_since.unsqueeze(-1))
        return x * weights

    def forward(self, data, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict):
        x_dict = {}
        for node_type in data.x_dict:
            x = self.embed_dict[node_type](data[node_type].x)
            if node_type in time_since_dict:
                x = self.time_aware_aggregation(x, time_since_dict[node_type])
            x_dict[node_type] = x

        x = torch.cat([x_dict['user'], x_dict['business'], x_dict['review']], dim=0)

        centrality = self.centrality_proj(centrality_encoding)
        x = x + centrality

        x = x.unsqueeze(0)

        x_user = x[:, :data['user'].x.size(0), :]
        x_business = x[:, data['user'].x.size(0):data['user'].x.size(0) + data['business'].x.size(0), :]
        x_review = x[:, data['user'].x.size(0) + data['business'].x.size(0):, :]

        x_user, _ = self.gru_user(x_user)
        x_business, _ = self.gru_business(x_business)
        x_review, _ = self.gru_review(x_review)

        x = torch.cat([x_user, x_business, x_review], dim=1)

        total_nodes = x.size(1)
        attn_bias = torch.zeros(1, total_nodes, total_nodes, device=x.device)
        attn_bias[0] = -spatial_encoding

        for edge_type in edge_features_dict:
            edge_index = data[edge_type].edge_index
            edge_feats = self.edge_proj(edge_features_dict[edge_type])
            for i, (src, tgt) in enumerate(edge_index.t()):
                attn_bias[0, src, tgt] += edge_feats[i].sum()

        residual = x
        x, _ = self.attention1(x, x, x, attn_bias=attn_bias)
        x = self.norm1(x + residual)
        x = self.dropout(x)

        residual = x
        x = self.ffn1(x)
        x = self.norm2(x + residual)
        x = self.dropout(x)

        residual = x
        x, _ = self.attention2(x, x, x, attn_bias=attn_bias)
        x = self.norm3(x + residual)
        x = self.dropout(x)

        residual = x
        x = self.ffn2(x)
        x = self.norm4(x + residual)
        x = self.dropout(x)

        x = x.squeeze(0)

        user_start = 0
        business_start = data['user'].x.size(0)
        review_start = business_start + data['business'].x.size(0)

        h_user = x[user_start:business_start]
        h_business = x[business_start:review_start]
        h_review = x[review_start:]

        user_indices = data['user', 'writes', 'review'].edge_index[0]
        business_indices = data['review', 'about', 'business'].edge_index[1]
        review_indices = data['user', 'writes', 'review'].edge_index[1]

        h_user_mapped = h_user[user_indices]
        h_business_mapped = h_business[business_indices]
        h_review_mapped = h_review[review_indices]

        combined = torch.cat([h_review_mapped, h_user_mapped, h_business_mapped], dim=-1)

        logits = self.classifier(combined)
        return torch.sigmoid(logits)

# Updated GraphformerModel with Plotting
class GraphformerModel:
    def __init__(self, df, output_path, epochs, test_size=0.3):
        self.df_whole = df
        self.output_path = output_path
        self.output_path = Path(self.output_path) / "GraphformerModel"
        self.epochs = epochs
        self.df, self.test_df = train_test_split(self.df_whole, test_size=test_size, random_state=42)

        torch.manual_seed(42)
        np.random.seed(42)

        Path(self.output_path).mkdir(parents=True, exist_ok=True)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005)
        self.criterion = nn.BCELoss()

    def compute_graph_encodings(self, data):
        G = nx.DiGraph()
        node_offset = 0
        node_type_map = {}

        for node_type in ['user', 'business', 'review']:
            num_nodes = data[node_type].x.size(0)
            for i in range(num_nodes):
                G.add_node(node_offset + i)
                node_type_map[node_offset + i] = node_type
            node_offset += num_nodes

        edge_types = [('user', 'writes', 'review'), ('review', 'about', 'business')]
        for src_type, rel, tgt_type in edge_types:
            edge_index = data[src_type, rel, tgt_type].edge_index
            src_nodes = edge_index[0].tolist()
            tgt_nodes = edge_index[1].tolist()
            src_offset = 0 if src_type == 'user' else (self.num_users if src_type == 'business' else self.num_users + self.num_businesses)
            tgt_offset = 0 if tgt_type == 'user' else (self.num_users if tgt_type == 'business' else self.num_users + self.num_businesses)
            for src, tgt in zip(src_nodes, tgt_nodes):
                G.add_edge(src + src_offset, tgt + tgt_offset)

        num_nodes = G.number_of_nodes()
        spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=self.device)
        for i in range(num_nodes):
            for j in range(num_nodes):
                if i == j:
                    spatial_encoding[i, j] = 0
                elif nx.has_path(G, i, j):
                    spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)

        centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=self.device).view(-1, 1)

        return spatial_encoding, centrality_encoding, node_type_map

    def compute_metrics(self, y_true, y_pred, y_prob, prefix=""):
        metrics = {}
        metrics[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
        metrics[f"{prefix}precision"] = precision_score(y_true, y_pred, zero_division=0)
        metrics[f"{prefix}recall"] = recall_score(y_true, y_pred, zero_division=0)
        metrics[f"{prefix}f1"] = f1_score(y_true, y_pred, zero_division=0)
        metrics[f"{prefix}auc_roc"] = roc_auc_score(y_true, y_prob)
        metrics[f"{prefix}conf_matrix"] = confusion_matrix(y_true, y_pred)
        metrics[f"{prefix}class_report"] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        return metrics

    def run_model(self):
        features = torch.tensor(self.df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
        y = torch.tensor(self.df['fake'].values, dtype=torch.float, device=self.device)
        time_since_user = torch.tensor(self.df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
        time_since_business = torch.tensor(self.df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
        num_rows = len(self.df)

        graph = HeteroData()

        self.num_users = len(self.df['user_id'].unique())
        self.num_businesses = len(self.df['business_id'].unique())

        user_indices = torch.tensor(self.df['user_id'].map({uid: i for i, uid in enumerate(self.df['user_id'].unique())}).values, dtype=torch.long, device=self.device)
        business_indices = torch.tensor(self.df['business_id'].map({bid: i for i, bid in enumerate(self.df['business_id'].unique())}).values, dtype=torch.long, device=self.device)
        review_indices = torch.arange(num_rows, dtype=torch.long, device=self.device)

        user_feats = torch.zeros(self.num_users, 14, device=self.device)
        business_feats = torch.zeros(self.num_businesses, 8, device=self.device)
        review_feats = torch.zeros(num_rows, 16, device=self.device)

        user_cols = ['hours', 'user_review_count', 'elite', 'friends', 'fans', 'average_stars',
                     'time_since_last_review_user', 'user_account_age', 'user_degree',
                     'user_review_burst_count', 'review_like_ratio', 'latest_checkin_hours',
                     'user_useful_funny_cool', 'rating_variance_user']
        business_cols = ['latitude', 'longitude', 'business_stars', 'business_review_count',
                         'time_since_last_review_business', 'business_degree',
                         'business_review_burst_count', 'rating_deviation_from_business_average']
        review_cols = ['review_stars', 'tip_compliment_count', 'tip_count', 'average_time_between_reviews',
                       'temporal_similarity', 'pronoun_density', 'avg_sentence_length',
                       'excessive_punctuation_count', 'sentiment_polarity', 'good_severity',
                       'bad_severity', 'code_switching_flag', 'grammar_error_score',
                       'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool']

        for i in range(len(self.df)):
            user_idx = user_indices[i]
            business_idx = business_indices[i]
            user_feats[user_idx] += features[i, :14]
            business_feats[business_idx] += features[i, 14:22]
        review_feats = features[:, 22:38]

        graph['user'].x = user_feats
        graph['business'].x = business_feats
        graph['review'].x = review_feats
        graph['review'].y = y

        graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
        graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)

        edge_features_dict = {}
        user_writes_edge = graph['user', 'writes', 'review'].edge_index
        review_about_edge = graph['review', 'about', 'business'].edge_index

        src_users = user_indices[user_writes_edge[0]]
        tgt_reviews = review_indices[user_writes_edge[1]]
        edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
            time_since_user[src_users], time_since_user[tgt_reviews], src_users, src_users
        )

        src_reviews = review_indices[review_about_edge[0]]
        tgt_businesses = business_indices[review_about_edge[1]]
        edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
            time_since_business[src_reviews], time_since_business[tgt_businesses],
            torch.zeros_like(src_reviews), torch.zeros_like(src_reviews)
        )

        user_time_since = self.df.groupby('user_id')['time_since_last_review_user'].min().reindex(
            self.df['user_id'].unique(), fill_value=0).values
        time_since_dict = {
            'user': torch.tensor(user_time_since, dtype=torch.float, device=self.device)
        }

        spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)

        # Training with metrics history
        self.model.train()
        train_metrics_history = []
        for epoch in range(self.epochs):
            self.optimizer.zero_grad()
            out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
            loss = self.criterion(out.squeeze(), y)
            loss.backward()
            self.optimizer.step()

            pred_labels = (out.squeeze() > 0.5).float()
            logger.info(f"PREDICTED LABELS : {pred_labels}")
            probs = out.squeeze().detach().cpu().numpy()
            train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels.cpu().numpy(), probs, prefix="train_")
            train_metrics['loss'] = loss.item()
            train_metrics_history.append(train_metrics)

            if epoch % 10 == 0:
                logger.info(f"Epoch {epoch}, Loss: {loss.item():.4f}, Accuracy: {train_metrics['train_accuracy']:.4f}, F1: {train_metrics['train_f1']:.4f}")

        # Save model
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_save_path = Path(self.output_path) / "model_GraphformerModel_latest.pth"
        torch.save(self.model.state_dict(), model_save_path)

        # Testing
        if self.test_df is not None:
            test_features = torch.tensor(self.test_df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
            test_y = torch.tensor(self.test_df['fake'].values, dtype=torch.float, device=self.device)
            test_time_since_user = torch.tensor(self.test_df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
            test_time_since_business = torch.tensor(self.test_df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
            num_test_rows = len(self.test_df)

            new_user_unique = self.test_df['user_id'].unique()
            new_business_unique = self.test_df['business_id'].unique()

            existing_user_ids = list(self.df['user_id'].unique())
            user_mapping = {uid: i for i, uid in enumerate(existing_user_ids)}
            total_users = self.num_users
            for uid in new_user_unique:
                if uid not in user_mapping:
                    user_mapping[uid] = total_users
                    total_users += 1

            existing_business_ids = list(self.df['business_id'].unique())
            business_mapping = {bid: i for i, bid in enumerate(existing_business_ids)}
            total_businesses = self.num_businesses
            for bid in new_business_unique:
                if bid not in business_mapping:
                    business_mapping[bid] = total_businesses
                    total_businesses += 1

            new_user_indices = torch.tensor([user_mapping[uid] for uid in self.test_df['user_id']], dtype=torch.long, device=self.device)
            new_business_indices = torch.tensor([business_mapping[bid] for bid in self.test_df['business_id']], dtype=torch.long, device=self.device)
            new_review_indices = torch.arange(num_rows, num_rows + num_test_rows, device=self.device)

            if total_users > self.num_users:
                additional_user_feats = torch.zeros(total_users - self.num_users, 14, device=self.device)
                graph['user'].x = torch.cat([graph['user'].x, additional_user_feats], dim=0)
            if total_businesses > self.num_businesses:
                additional_business_feats = torch.zeros(total_businesses - self.num_businesses, 8, device=self.device)
                graph['business'].x = torch.cat([graph['business'].x, additional_business_feats], dim=0)

            for i in range(num_test_rows):
                user_idx = new_user_indices[i]
                business_idx = new_business_indices[i]
                if user_idx < graph['user'].x.size(0):
                    graph['user'].x[user_idx] += test_features[i, :14]
                if business_idx < graph['business'].x.size(0):
                    graph['business'].x[business_idx] += test_features[i, 14:22]
            graph['review'].x = torch.cat([graph['review'].x, test_features[:, 22:38]], dim=0)
            graph['review'].y = torch.cat([graph['review'].y, test_y], dim=0)

            graph['user', 'writes', 'review'].edge_index = torch.cat([
                graph['user', 'writes', 'review'].edge_index,
                torch.stack([new_user_indices, new_review_indices], dim=0)], dim=1)
            graph['review', 'about', 'business'].edge_index = torch.cat([
                graph['review', 'about', 'business'].edge_index,
                torch.stack([new_review_indices, new_business_indices], dim=0)], dim=1)

            all_time_since_user = torch.cat([time_since_user, test_time_since_user])
            all_time_since_business = torch.cat([time_since_business, test_time_since_business])
            all_user_indices = torch.cat([user_indices, new_user_indices])
            all_business_indices = torch.cat([business_indices, new_business_indices])
            all_review_indices = torch.cat([review_indices, new_review_indices])

            user_writes_edge = graph['user', 'writes', 'review'].edge_index
            review_about_edge = graph['review', 'about', 'business'].edge_index

            edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
                all_time_since_user[user_writes_edge[0]], all_time_since_user[user_writes_edge[1]],
                all_user_indices[user_writes_edge[0]], all_user_indices[user_writes_edge[0]]
            )
            edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
                all_time_since_business[review_about_edge[0]], all_time_since_business[review_about_edge[1]],
                torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
            )

            self.num_users = total_users
            self.num_businesses = total_businesses

            test_user_time_since = self.test_df.groupby('user_id')['time_since_last_review_user'].min().reindex(
                pd.Index(list(self.df['user_id'].unique()) + list(self.test_df['user_id'].unique())), fill_value=0).values
            time_since_dict['user'] = torch.tensor(test_user_time_since[:total_users], dtype=torch.float, device=self.device)

            spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)

            self.model.eval()
            with torch.no_grad():
                out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
                pred_labels = (out.squeeze() > 0.5).float()
                probs = out.squeeze().detach().cpu().numpy()
                test_metrics = self.compute_metrics(graph['review'].y[-num_test_rows:].cpu().numpy(), pred_labels[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:], prefix="test_")
                train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels[:num_rows].cpu().numpy(), probs[:num_rows], prefix="train_")
                logger.info(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1: {test_metrics['test_f1']:.4f}, AUC-ROC: {test_metrics['test_auc_roc']:.4f}")

            # Save metrics to file
            metrics_file = Path(self.output_path) / f"metrics_{timestamp}.txt"
            with open(metrics_file, 'w') as f:
                f.write("Training Metrics (Final Epoch):\n")
                for k, v in train_metrics.items():
                    f.write(f"{k}: {v}\n")
                f.write("\nTest Metrics:\n")
                for k, v in test_metrics.items():
                    f.write(f"{k}: {v}\n")

            # Plotting and saving to output_path
            plt.figure(figsize=(12, 8))
            plt.plot([m['loss'] for m in train_metrics_history], label='Training Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
            plt.title('Training Loss Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"loss_curve_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(12, 8))
            plt.plot([m['train_accuracy'] for m in train_metrics_history], label='Training Accuracy')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.title('Training Accuracy Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"accuracy_curve_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(12, 8))
            plt.plot([m['train_precision'] for m in train_metrics_history], label='Training Precision')
            plt.plot([m['train_recall'] for m in train_metrics_history], label='Training Recall')
            plt.plot([m['train_f1'] for m in train_metrics_history], label='Training F1-Score')
            plt.xlabel('Epoch')
            plt.ylabel('Score')
            plt.title('Training Precision, Recall, and F1-Score Curves')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"prf1_curves_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(12, 8))
            plt.plot([m['train_auc_roc'] for m in train_metrics_history], label='Training AUC-ROC')
            plt.xlabel('Epoch')
            plt.ylabel('AUC-ROC')
            plt.title('Training AUC-ROC Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"auc_roc_curve_train_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(8, 6))
            sns.heatmap(test_metrics['test_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Test Confusion Matrix')
            plt.savefig(Path(self.output_path) / f"confusion_matrix_test_{timestamp}.png")
            plt.close()

            fpr, tpr, _ = roc_curve(graph['review'].y[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:])
            plt.figure(figsize=(10, 6))
            plt.plot(fpr, tpr, label=f'Test ROC Curve (AUC = {test_metrics["test_auc_roc"]:.4f})')
            plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Test ROC Curve')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"roc_curve_test_{timestamp}.png")
            plt.close()

            plt.figure(figsize=(8, 6))
            sns.heatmap(train_metrics['train_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title('Training Confusion Matrix (Final Epoch)')
            plt.savefig(Path(self.output_path) / f"confusion_matrix_train_{timestamp}.png")
            plt.close()

            fpr_train, tpr_train, _ = roc_curve(graph['review'].y[:num_rows].cpu().numpy(), probs[:num_rows])
            plt.figure(figsize=(10, 6))
            plt.plot(fpr_train, tpr_train, label=f'Training ROC Curve (AUC = {train_metrics["train_auc_roc"]:.4f})')
            plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Training ROC Curve (Final Epoch)')
            plt.legend()
            plt.grid(True)
            plt.savefig(Path(self.output_path) / f"roc_curve_train_{timestamp}.png")
            plt.close()

            logger.info(f"All metrics, plots, and model saved to {self.output_path}")
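As a quick illustration of the temporal edge features used above, the toy check below calls create_temporal_edge_features on two made-up edges (one where source and target belong to the same user, one where they do not). All values are invented for illustration; only the shape and the burst term behaviour are being demonstrated.

# Toy check of create_temporal_edge_features; the tensors are made-up values
import torch

t_src = torch.tensor([0.0, 3600.0])     # time-since values for the source nodes (seconds assumed)
t_tgt = torch.tensor([1800.0, 0.0])     # time-since values for the target nodes
users_i = torch.tensor([7, 7])
users_j = torch.tensor([7, 9])          # second edge pairs two different users, so its burst term is 0
feats = create_temporal_edge_features(t_src, t_tgt, users_i, users_j)
print(feats.shape)                      # torch.Size([2, 4]): hour, day, week scales plus the burst feature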
src/.ipynb_checkpoints/model_trainer-checkpoint.py
ADDED
@@ -0,0 +1,35 @@
from src.model import GraphformerModel
from pathlib import Path
from loguru import logger


class ModelTrainer:
    def __init__(self, df, output_path, epochs=100, test_size=0.3):
        self.df = df
        self.output_path = output_path
        self.epochs = epochs
        self.test_size = test_size

        # Create output directory
        Path(self.output_path).mkdir(parents=True, exist_ok=True)

        # Initialize the HeteroGraphormerModel
        self.model = GraphformerModel(df=self.df, output_path=self.output_path, epochs=self.epochs, test_size=self.test_size)

        logger.info(f"Initialized ModelTrainer with output_path: {self.output_path} and epochs: {self.epochs}")

    def train_and_evaluate(self):
        try:
            logger.info("Starting model training and evaluation")
            self.model.run_model()
            logger.info("GraphformerModel training and evaluation completed successfully")
        except Exception as e:
            logger.error(f"Error during GraphformerModel training and evaluation: {e}")
            raise
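An end-to-end call through this wrapper might look like the sketch below; the feature CSV is an assumption about the upstream pipeline's output, and the artifacts directory is illustrative.

# Hypothetical end-to-end call; the CSV path and output directory are assumptions
import pandas as pd

df = pd.read_csv("data/preprocessed_cleaned.csv")
trainer = ModelTrainer(df, output_path="artifacts", epochs=100, test_size=0.3)
trainer.train_and_evaluate()   # metrics, curves and the .pth checkpoint land in artifacts/GraphformerModel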
src/.ipynb_checkpoints/preprocessing-checkpoint.py
ADDED
@@ -0,0 +1,831 @@
from loguru import logger
import pandas as pd
import json
from datetime import datetime
import ast
import numpy as np
from pymongo import MongoClient
from collections import defaultdict

from tqdm import tqdm
import time

import requests
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import re
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

class Preprocessor:
    def __init__(self, df):
        self.df = df
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaModel.from_pretrained('roberta-base')
        self.stop_words = set(stopwords.words('english'))
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def get_bert_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    def preprocess_text(self, text):
        return text if pd.notna(text) else ""

    def calculate_duration(self, time_range):
        if not isinstance(time_range, str) or "-" not in time_range:
            return None
        start_str, end_str = time_range.split('-')
        start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
        end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
        try:
            start = datetime.strptime(start_str, '%H:%M')
            end = datetime.strptime(end_str, '%H:%M')
            duration = (end - start).total_seconds() / 3600
            return duration if duration >= 0 else duration + 24
        except ValueError:
            return None

    def calculate_sentiment_severity(self, text):
        if pd.isna(text) or not text.strip():
            return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})

        # Get sentiment polarity (-1 to 1)
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity

        # Define severity weights
        good_weight = 0.7
        bad_weight = 0.3

        if polarity > 0:
            good_severity = good_weight * polarity
            bad_severity = 0.0
        elif polarity < 0:
            good_severity = 0.0
            bad_severity = bad_weight * abs(polarity)
        else:  # Neutral (polarity = 0)
            good_severity = 0.0
            bad_severity = 0.0

        return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})

    def get_avg_duration(self, hours_str):
        if pd.isna(hours_str) or not isinstance(hours_str, str):
            return pd.NA
        try:
            hours_dict = ast.literal_eval(hours_str)
            if not hours_dict:
                return pd.NA
            durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
            valid_durations = [d for d in durations if d is not None]
            return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
        except (ValueError, SyntaxError, ZeroDivisionError):
            return pd.NA

    def calculate_time_since_last_review(self):
        present_date = datetime.now()
        user_latest_timestamp = {}

        # Convert review_date to datetime
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Calculate hours difference for each user's latest review
        for user_id in self.df["user_id"].unique():
            latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()

            if not isinstance(latest_date, datetime):
                latest_date = latest_date.to_pydatetime()

            hours_difference = (present_date - latest_date).total_seconds() / 3600
            user_latest_timestamp[user_id] = hours_difference

        # Map the hours difference to a new column
        self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)

    def calculate_time_since_last_review_business(self):
        present_date = datetime.now()

        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Initialize dictionary to store hours since last review for each business
        business_latest_timestamp = {}

        # Iterate over unique business_ids
        for business_id in self.df["business_id"].unique():
            # Get the latest review date for this business
            latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()

            # Convert to datetime object if needed
            if not isinstance(latest_date, datetime):
                latest_date = latest_date.to_pydatetime()

            # Calculate hours difference (already in hours)
            hours_difference = (present_date - latest_date).total_seconds() / 3600
            business_latest_timestamp[business_id] = hours_difference

        # Map the hours difference to the new column
        self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)

    def calculate_user_account_age(self):
|
154 |
+
present_date = datetime.now()
|
155 |
+
|
156 |
+
# Convert yelping_since to datetime
|
157 |
+
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
|
158 |
+
|
159 |
+
# Calculate user account age in days
|
160 |
+
self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
|
161 |
+
|
162 |
+
|
163 |
+
def calculate_avg_time_between_reviews(self):
|
164 |
+
# Ensure review_date is in datetime format
|
165 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
166 |
+
|
167 |
+
# Sort the DataFrame by user_id and review_date to ensure chronological order
|
168 |
+
self.df = self.df.sort_values(["user_id", "review_date"])
|
169 |
+
|
170 |
+
# Define helper function to calculate average time between reviews
|
171 |
+
def calculate_avg_time(group):
|
172 |
+
if len(group) == 1:
|
173 |
+
return 0 # If only one review, assign 0
|
174 |
+
# Calculate differences in hours between consecutive reviews
|
175 |
+
diffs = group["review_date"].diff().dt.total_seconds() / 3600
|
176 |
+
# Drop the first NaN (from diff) and compute the mean
|
177 |
+
return diffs.dropna().mean()
|
178 |
+
|
179 |
+
# Apply the function to each user_id group and create a mapping
|
180 |
+
avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)
|
181 |
+
|
182 |
+
# Map the average time back to the original DataFrame
|
183 |
+
self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)
|
184 |
+
|
185 |
+
|
186 |
+
def calculate_user_degree(self):
|
187 |
+
# Calculate the number of unique businesses per user
|
188 |
+
user_business_counts = self.df.groupby("user_id")["business_id"].nunique()
|
189 |
+
|
190 |
+
# Map the counts back to the original DataFrame
|
191 |
+
self.df["user_degree"] = self.df["user_id"].map(user_business_counts)
|
192 |
+
|
193 |
+
|
194 |
+
def calculate_business_degree(self):
|
195 |
+
# Calculate the number of unique users per business
|
196 |
+
business_user_counts = self.df.groupby("business_id")["user_id"].nunique()
|
197 |
+
|
198 |
+
# Map the counts back to the original DataFrame
|
199 |
+
self.df["business_degree"] = self.df["business_id"].map(business_user_counts)
|
200 |
+
|
201 |
+
|
202 |
+
def calculate_rating_variance_user(self):
|
203 |
+
# Calculate the mode (most frequent rating) per user
|
204 |
+
user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])
|
205 |
+
|
206 |
+
# Map the most frequent rating back to the original DataFrame
|
207 |
+
self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)
|
208 |
+
|
209 |
+
|
210 |
+
def calculate_user_review_burst_count(self):
|
211 |
+
# Ensure review_date is in datetime format
|
212 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
213 |
+
|
214 |
+
# Sort by user_id and review_date for chronological order
|
215 |
+
self.df = self.df.sort_values(["user_id", "review_date"])
|
216 |
+
|
217 |
+
# Function to calculate the max number of reviews in any 20-day window
|
218 |
+
def calculate_burst_count(group):
|
219 |
+
if len(group) <= 1:
|
220 |
+
return 0 # No burst if 1 or fewer reviews
|
221 |
+
|
222 |
+
# Convert review_date to a Series for rolling window
|
223 |
+
dates = group["review_date"]
|
224 |
+
|
225 |
+
# Calculate the number of reviews within 20 days of each review
|
226 |
+
burst_counts = []
|
227 |
+
for i, date in enumerate(dates):
|
228 |
+
# Count reviews within 20 days after this date
|
229 |
+
window_end = date + pd.Timedelta(days=20)
|
230 |
+
count = ((dates >= date) & (dates <= window_end)).sum()
|
231 |
+
burst_counts.append(count)
|
232 |
+
|
233 |
+
# Return the maximum burst count for this user
|
234 |
+
return max(burst_counts)
|
235 |
+
|
236 |
+
# Calculate the burst count per user
|
237 |
+
user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)
|
238 |
+
|
239 |
+
# Map the burst count back to the original DataFrame
|
240 |
+
self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)
|
241 |
+
|
242 |
+
|
243 |
+
def calculate_business_review_burst_count(self):
|
244 |
+
# Ensure review_date is in datetime format
|
245 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
246 |
+
|
247 |
+
# Sort by business_id and review_date for chronological order
|
248 |
+
self.df = self.df.sort_values(["business_id", "review_date"])
|
249 |
+
|
250 |
+
# Function to calculate the max number of reviews in any 10-day window
|
251 |
+
def calculate_burst_count(group):
|
252 |
+
if len(group) <= 1:
|
253 |
+
return 0 # No burst if 1 or fewer reviews
|
254 |
+
|
255 |
+
# Convert review_date to a Series for rolling window
|
256 |
+
dates = group["review_date"]
|
257 |
+
|
258 |
+
# Calculate the number of reviews within 10 days of each review
|
259 |
+
burst_counts = []
|
260 |
+
for i, date in enumerate(dates):
|
261 |
+
# Count reviews within 10 days after this date
|
262 |
+
window_end = date + pd.Timedelta(days=10)
|
263 |
+
count = ((dates >= date) & (dates <= window_end)).sum()
|
264 |
+
burst_counts.append(count)
|
265 |
+
|
266 |
+
# Return the maximum burst count for this business
|
267 |
+
return max(burst_counts)
|
268 |
+
|
269 |
+
# Calculate the burst count per business
|
270 |
+
business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)
|
271 |
+
|
272 |
+
# Map the burst count back to the original DataFrame
|
273 |
+
self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)
|
274 |
+
|
275 |
+
|
276 |
+
def calculate_temporal_similarity(self):
|
277 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
278 |
+
|
279 |
+
# Extract the day of the week (0 = Monday, 6 = Sunday)
|
280 |
+
self.df["day_of_week"] = self.df["review_date"].dt.dayofweek
|
281 |
+
|
282 |
+
# Function to calculate avg hours between reviews on frequent days
|
283 |
+
def calculate_avg_hours_on_frequent_days(group):
|
284 |
+
frequent_days = group["day_of_week"].mode().tolist()
|
285 |
+
|
286 |
+
if len(group) <= 1:
|
287 |
+
return 0
|
288 |
+
|
289 |
+
frequent_reviews = group[group["day_of_week"].isin(frequent_days)]
|
290 |
+
|
291 |
+
if len(frequent_reviews) <= 1:
|
292 |
+
return 0
|
293 |
+
|
294 |
+
frequent_reviews = frequent_reviews.sort_values("review_date")
|
295 |
+
diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600
|
296 |
+
|
297 |
+
return diffs.dropna().mean()
|
298 |
+
|
299 |
+
# Calculate average hours for each user
|
300 |
+
avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)
|
301 |
+
|
302 |
+
# Map the average hours to the new column
|
303 |
+
self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)
|
304 |
+
|
305 |
+
# Drop temporary column
|
306 |
+
self.df = self.df.drop(columns=["day_of_week"])
|
307 |
+
|
308 |
+
|
309 |
+
def calculate_rating_deviation_from_business_average(self):
|
310 |
+
# Calculate the average rating per business
|
311 |
+
business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()
|
312 |
+
|
313 |
+
# Map the average rating to each row
|
314 |
+
self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)
|
315 |
+
|
316 |
+
# Calculate the deviation from the business average
|
317 |
+
self.df["rating_deviation_from_business_average"] = (
|
318 |
+
self.df["review_stars"] - self.df["business_avg_rating"]
|
319 |
+
)
|
320 |
+
|
321 |
+
# Drop the temporary column
|
322 |
+
self.df = self.df.drop(columns=["business_avg_rating"])
|
323 |
+
|
324 |
+
def calculate_review_like_ratio(self):
|
325 |
+
# Create a binary column for liked reviews (stars >= 4)
|
326 |
+
self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)
|
327 |
+
|
328 |
+
# Calculate the like ratio per user
|
329 |
+
user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()
|
330 |
+
|
331 |
+
# Map the like ratio back to the DataFrame
|
332 |
+
self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)
|
333 |
+
|
334 |
+
# Drop the temporary column
|
335 |
+
self.df = self.df.drop(columns=["is_liked"])
|
336 |
+
|
337 |
+
def calculate_latest_checkin_hours(self):
|
338 |
+
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
|
339 |
+
|
340 |
+
# Function to get the latest check-in date from a list of strings
|
341 |
+
def get_latest_checkin(checkin_list):
|
342 |
+
if not checkin_list or pd.isna(checkin_list): # Handle empty or NaN
|
343 |
+
return None
|
344 |
+
if isinstance(checkin_list, str):
|
345 |
+
checkin_dates = checkin_list.split(", ")
|
346 |
+
else:
|
347 |
+
checkin_dates = checkin_list
|
348 |
+
return pd.to_datetime(checkin_dates).max()
|
349 |
+
|
350 |
+
# Apply the function to get the latest check-in date per row
|
351 |
+
self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)
|
352 |
+
|
353 |
+
# Calculate the hours difference between latest check-in and yelping_since
|
354 |
+
self.df["latest_checkin_hours"] = (
|
355 |
+
(self.df["latest_checkin_date"] - self.df["yelping_since"])
|
356 |
+
.dt.total_seconds() / 3600
|
357 |
+
)
|
358 |
+
|
359 |
+
# Drop the temporary column
|
360 |
+
self.df = self.df.drop(columns=["latest_checkin_date"])
|
361 |
+
self.df["latest_checkin_hours"].fillna(0,inplace=True)
|
362 |
+
|
363 |
+
|
364 |
+
def compute_pronoun_density(self, text):
|
365 |
+
text = self.preprocess_text(text)
|
366 |
+
if not text:
|
367 |
+
return 0
|
368 |
+
words = word_tokenize(text.lower())
|
369 |
+
pos_tags = nltk.pos_tag(words)
|
370 |
+
pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
|
371 |
+
return pronouns / len(words) if words else 0
|
372 |
+
|
373 |
+
def compute_avg_sentence_length(self, text):
|
374 |
+
text = self.preprocess_text(text)
|
375 |
+
if not text:
|
376 |
+
return 0
|
377 |
+
sentences = sent_tokenize(text)
|
378 |
+
return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0
|
379 |
+
|
380 |
+
def compute_excessive_punctuation(self, text):
|
381 |
+
text = self.preprocess_text(text)
|
382 |
+
return len(re.findall(r'[!?.]{2,}', text))
|
383 |
+
|
384 |
+
def compute_sentiment_polarity(self, text):
|
385 |
+
text = self.preprocess_text(text)
|
386 |
+
return TextBlob(text).sentiment.polarity if text else 0
|
387 |
+
|
388 |
+
def compute_code_switching_flag(self, text):
|
389 |
+
text = self.preprocess_text(text)
|
390 |
+
if not text:
|
391 |
+
return 0
|
392 |
+
|
393 |
+
tokens = self.tokenizer.tokenize(text.lower())
|
394 |
+
if not tokens:
|
395 |
+
return 0
|
396 |
+
|
397 |
+
english_words = self.stop_words # Use self.stop_words from __init__
|
398 |
+
token_set = set(tokens)
|
399 |
+
english_count = sum(1 for token in tokens if token in english_words)
|
400 |
+
|
401 |
+
non_english_pattern = re.compile(r'[^\x00-\x7F]')
|
402 |
+
has_non_ascii = 1 if non_english_pattern.search(text) else 0
|
403 |
+
|
404 |
+
english_ratio = english_count / len(tokens) if tokens else 0
|
405 |
+
|
406 |
+
non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)
|
407 |
+
|
408 |
+
# Flag as code-switching if:
|
409 |
+
# 1. Mixed English presence (ratio between 0.1 and 0.9)
|
410 |
+
# 2. Non-ASCII characters present OR some non-English subword tokens
|
411 |
+
if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
|
412 |
+
return 1
|
413 |
+
return 0
|
414 |
+
|
415 |
+
|
416 |
+
def batch_tokenize(self, texts, batch_size=32, max_length=512):
|
417 |
+
tokenized_outputs = []
|
418 |
+
for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
|
419 |
+
batch_texts = texts[i:i + batch_size]
|
420 |
+
valid_texts = [self.preprocess_text(t) for t in batch_texts]
|
421 |
+
# Tokenize with fixed max_length to ensure consistent tensor sizes
|
422 |
+
inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
|
423 |
+
tokenized_outputs.append(inputs['input_ids'].to(self.device)) # Move to GPU
|
424 |
+
# Concatenate on GPU with consistent sizes
|
425 |
+
return torch.cat(tokenized_outputs, dim=0)
|
426 |
+
|
427 |
+
def compute_grammar_error_score(self, texts, tokenized_ids):
|
428 |
+
print("Computing grammar error scores...")
|
429 |
+
error_scores = np.zeros(len(texts), dtype=float)
|
430 |
+
|
431 |
+
vocab_set = set(self.tokenizer.get_vocab().keys())
|
432 |
+
for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
|
433 |
+
if input_ids.sum() == 0: # Empty input
|
434 |
+
continue
|
435 |
+
tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
|
436 |
+
unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
|
437 |
+
total_count = len([t for t in tokens if t not in self.stop_words])
|
438 |
+
error_scores[i] = unknown_count / total_count if total_count > 0 else 0
|
439 |
+
|
440 |
+
return error_scores
|
441 |
+
|
442 |
+
def compute_repetitive_words_count(self, texts, tokenized_ids):
|
443 |
+
print("Computing repetitive words counts...")
|
444 |
+
rep_counts = np.zeros(len(texts), dtype=int)
|
445 |
+
|
446 |
+
for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
|
447 |
+
if input_ids.sum() == 0: # Empty input
|
448 |
+
continue
|
449 |
+
tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
|
450 |
+
valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
|
451 |
+
if valid_tokens:
|
452 |
+
token_counts = {}
|
453 |
+
for token in valid_tokens:
|
454 |
+
token_counts[token] = token_counts.get(token, 0) + 1
|
455 |
+
rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)
|
456 |
+
|
457 |
+
return rep_counts
|
458 |
+
|
459 |
+
def preprocess_text_for_similarity(self, text):
|
460 |
+
if pd.isna(text) or not text.strip():
|
461 |
+
return []
|
462 |
+
return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]
|
463 |
+
|
464 |
+
def batch_encode_words(self, texts, batch_size=32, max_length=512):
|
465 |
+
word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
|
466 |
+
vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}
|
467 |
+
|
468 |
+
encoded_batches = []
|
469 |
+
for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
|
470 |
+
batch_words = word_lists[i:i + batch_size]
|
471 |
+
encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
|
472 |
+
for j, words in enumerate(batch_words):
|
473 |
+
if words:
|
474 |
+
word_ids = [vocab.get(w, 0) for w in words][:max_length]
|
475 |
+
encoded[j, :len(word_ids)] = word_ids
|
476 |
+
encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
|
477 |
+
encoded_batches.append(encoded_tensor)
|
478 |
+
|
479 |
+
return torch.cat(encoded_batches, dim=0), vocab
|
480 |
+
|
481 |
+
def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
|
482 |
+
all_texts = self.df["review_text"].tolist()
|
483 |
+
all_users = self.df["user_id"].tolist()
|
484 |
+
all_review_ids = self.df["review_id"].tolist()
|
485 |
+
|
486 |
+
encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)
|
487 |
+
|
488 |
+
similarity_scores = {rid: 0.0 for rid in all_review_ids} # Default scores
|
489 |
+
for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
|
490 |
+
if pd.isna(review_id) or pd.isna(user_id):
|
491 |
+
continue
|
492 |
+
|
493 |
+
current_words = encoded_words[i]
|
494 |
+
if current_words.sum() == 0:
|
495 |
+
continue
|
496 |
+
|
497 |
+
other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
|
498 |
+
dtype=torch.long).to(self.device)
|
499 |
+
if not other_indices.numel():
|
500 |
+
continue
|
501 |
+
|
502 |
+
other_words = encoded_words[other_indices]
|
503 |
+
current_set = torch.unique(current_words[current_words > 0])
|
504 |
+
other_flat = other_words[other_words > 0]
|
505 |
+
|
506 |
+
if other_flat.numel() == 0:
|
507 |
+
continue
|
508 |
+
|
509 |
+
other_set = torch.unique(other_flat)
|
510 |
+
intersection = torch.sum(torch.isin(current_set, other_set)).float()
|
511 |
+
union = torch.unique(torch.cat([current_set, other_set])).numel()
|
512 |
+
similarity = intersection / union if union > 0 else 0.0
|
513 |
+
|
514 |
+
similarity_scores[review_id] = float(similarity)  # works for both the tensor result and the 0.0 fallback
|
515 |
+
return pd.Series(similarity_scores, index=all_review_ids)
|
516 |
+
|
517 |
+
def calculate_friend_count(self):
|
518 |
+
friends = []
|
519 |
+
for v in self.df["friends"]:
|
520 |
+
if isinstance(v, str):
|
521 |
+
friends.append(len(v.split(",")))
|
522 |
+
else:  # NaN or any non-string value contributes zero friends, keeping the list aligned with the DataFrame
|
523 |
+
friends.append(0)
|
524 |
+
self.df["friends"] = friends
|
525 |
+
|
526 |
+
def count_elite_years(self, elite):
|
527 |
+
if pd.isna(elite):
|
528 |
+
return 0
|
529 |
+
return len(str(elite).split(","))
|
530 |
+
|
531 |
+
def transform_elite_status(self):
|
532 |
+
self.df["elite"] = self.df["elite"].apply(lambda x: True if self.count_elite_years(x) > 1 else False)
|
533 |
+
|
534 |
+
def calculate_review_useful_funny_cool(self):
|
535 |
+
self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
|
536 |
+
self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
|
537 |
+
self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
|
538 |
+
self.df["review_useful_funny_cool"] = (
|
539 |
+
self.df["review_useful"] +
|
540 |
+
self.df["review_funny"] +
|
541 |
+
self.df["review_cool"]
|
542 |
+
)
|
543 |
+
self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)
|
544 |
+
|
545 |
+
|
546 |
+
def calculate_user_useful_funny_cool(self):
|
547 |
+
self.df["user_useful_funny_cool"] = (
|
548 |
+
self.df["user_useful"] +
|
549 |
+
self.df["user_funny"] +
|
550 |
+
self.df["user_cool"]
|
551 |
+
)
|
552 |
+
self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)
|
553 |
+
|
554 |
+
def compute_fake_score(self, row):
|
555 |
+
suspicion_points = 0
|
556 |
+
|
557 |
+
# Linguistic Features
|
558 |
+
if row["pronoun_density"] < 0.01: # Low personal engagement
|
559 |
+
suspicion_points += 1
|
560 |
+
if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30: # Extreme lengths
|
561 |
+
suspicion_points += 1
|
562 |
+
if row["grammar_error_score"] > 5: # Many errors
|
563 |
+
suspicion_points += 1
|
564 |
+
if row["repetitive_words_count"] > 5: # High repetition
|
565 |
+
suspicion_points += 1
|
566 |
+
if row["code_switching_flag"] == 1: # Language mixing
|
567 |
+
suspicion_points += 1
|
568 |
+
if row["excessive_punctuation_count"] > 3: # Overuse of punctuation
|
569 |
+
suspicion_points += 1
|
570 |
+
if abs(row["sentiment_polarity"]) > 0.8: # Extreme sentiment
|
571 |
+
suspicion_points += 1
|
572 |
+
|
573 |
+
# Review Patterns
|
574 |
+
if row["similarity_to_other_reviews"] > 0.8: # High duplication
|
575 |
+
suspicion_points += 1
|
576 |
+
if row["user_review_burst_count"] > 5: # Spammy bursts
|
577 |
+
suspicion_points += 1
|
578 |
+
if row["business_review_burst_count"] > 5: # Targeted bursts
|
579 |
+
suspicion_points += 1
|
580 |
+
if abs(row["rating_deviation_from_business_average"]) > 2: # Large rating deviation
|
581 |
+
suspicion_points += 1
|
582 |
+
if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1: # Extreme like ratio
|
583 |
+
suspicion_points += 1
|
584 |
+
|
585 |
+
# User Behavior
|
586 |
+
if row["user_account_age"] < 30: # Very new account (days)
|
587 |
+
suspicion_points += 1
|
588 |
+
if row["average_time_between_reviews"] < 24: # Rapid reviews (hours)
|
589 |
+
suspicion_points += 1
|
590 |
+
if row["user_degree"] < 2: # Low business interaction
|
591 |
+
suspicion_points += 1
|
592 |
+
if row["time_since_last_review_user"] < 24: # Recent burst (hours)
|
593 |
+
suspicion_points += 1
|
594 |
+
|
595 |
+
# Threshold: 3 or more points = fake
|
596 |
+
return 1 if suspicion_points >= 3 else 0
|
597 |
+
|
598 |
+
|
599 |
+
def run_pipeline(self):
|
600 |
+
|
601 |
+
|
602 |
+
|
603 |
+
logger.info("FINALYZING HOURS COLUMN ...")
|
604 |
+
self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
|
605 |
+
self.df["hours"] = self.df["hours"].fillna(0)
|
606 |
+
print(self.df["hours"][:10])
|
607 |
+
print(self.df["hours"].isnull().sum())
|
608 |
+
|
609 |
+
|
610 |
+
|
611 |
+
|
612 |
+
logger.info("FINALYZING ATTRIBUTES COLUMN ...")
|
613 |
+
self.df.drop("attributes",axis=1,inplace=True)
|
614 |
+
|
615 |
+
|
616 |
+
|
617 |
+
logger.info("CREATING time_since_last_review_user COLUMN ...")
|
618 |
+
self.calculate_time_since_last_review()
|
619 |
+
print(np.unique(self.df["time_since_last_review_user"] ))
|
620 |
+
|
621 |
+
|
622 |
+
logger.info("CREATING time_since_last_review_business COLUMN ...")
|
623 |
+
self.calculate_time_since_last_review_business()
|
624 |
+
print(np.unique(self.df["time_since_last_review_business"] ))
|
625 |
+
|
626 |
+
|
627 |
+
|
628 |
+
logger.info("CREATING user_account_age COLUMN ...")
|
629 |
+
self.calculate_user_account_age()
|
630 |
+
print(np.unique(self.df["user_account_age"] ))
|
631 |
+
|
632 |
+
|
633 |
+
|
634 |
+
logger.info("CREATING average_time_between_reviews COLUMN ...")
|
635 |
+
self.calculate_avg_time_between_reviews()
|
636 |
+
print(np.unique(self.df["average_time_between_reviews"] ))
|
637 |
+
|
638 |
+
|
639 |
+
|
640 |
+
logger.info("CREATING user_degree COLUMN ...")
|
641 |
+
self.calculate_user_degree()
|
642 |
+
print(np.unique(self.df["user_degree"] ))
|
643 |
+
|
644 |
+
|
645 |
+
logger.info("CREATING business_degree COLUMN ...")
|
646 |
+
self.calculate_business_degree()
|
647 |
+
print(np.unique(self.df["business_degree"] ))
|
648 |
+
|
649 |
+
|
650 |
+
logger.info("CREATING rating_variance_user COLUMN ...")
|
651 |
+
self.calculate_rating_variance_user()
|
652 |
+
print(np.unique(self.df["rating_variance_user"] ))
|
653 |
+
|
654 |
+
|
655 |
+
|
656 |
+
logger.info("CREATING user_review_burst_count COLUMN ...")
|
657 |
+
self.calculate_user_review_burst_count()
|
658 |
+
print(np.unique(self.df["user_review_burst_count"] ))
|
659 |
+
|
660 |
+
|
661 |
+
logger.info("CREATING business_review_burst_count COLUMN ...")
|
662 |
+
self.calculate_business_review_burst_count()
|
663 |
+
print(np.unique(self.df["business_review_burst_count"] ))
|
664 |
+
|
665 |
+
|
666 |
+
|
667 |
+
logger.info("CREATING temporal_similarity COLUMN ...")
|
668 |
+
self.calculate_temporal_similarity()
|
669 |
+
print(np.unique(self.df["temporal_similarity"] ))
|
670 |
+
|
671 |
+
|
672 |
+
|
673 |
+
logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
|
674 |
+
self.calculate_rating_deviation_from_business_average()
|
675 |
+
print(np.unique(self.df["rating_deviation_from_business_average"] ))
|
676 |
+
|
677 |
+
|
678 |
+
|
679 |
+
logger.info("CREATING review_like_ratio COLUMN ...")
|
680 |
+
self.calculate_review_like_ratio()
|
681 |
+
print(np.unique(self.df["review_like_ratio"] ))
|
682 |
+
|
683 |
+
|
684 |
+
|
685 |
+
logger.info("CREATING latest_checkin_hours COLUMN ...")
|
686 |
+
self.calculate_latest_checkin_hours()
|
687 |
+
print(np.unique(self.df["latest_checkin_hours"] ))
|
688 |
+
|
689 |
+
|
690 |
+
|
691 |
+
|
692 |
+
logger.info("CREATING pronoun_density COLUMN ...")
|
693 |
+
self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
|
694 |
+
print(np.unique(self.df["pronoun_density"] ))
|
695 |
+
|
696 |
+
logger.info("CREATING avg_sentence_length COLUMN ...")
|
697 |
+
self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
|
698 |
+
print(np.unique(self.df["avg_sentence_length"] ))
|
699 |
+
|
700 |
+
logger.info("CREATING excessive_punctuation_count COLUMN ...")
|
701 |
+
self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
|
702 |
+
print(np.unique(self.df["excessive_punctuation_count"] ))
|
703 |
+
|
704 |
+
logger.info("CREATING sentiment_polarity COLUMN ...")
|
705 |
+
self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
|
706 |
+
print(np.unique(self.df["sentiment_polarity"] ))
|
707 |
+
|
708 |
+
logger.info("CREATING good_severity and bad_severity COLUMNS ...")
|
709 |
+
severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
|
710 |
+
self.df[["good_severity", "bad_severity"]] = severity_scores
|
711 |
+
print(np.unique(self.df["good_severity"] ))
|
712 |
+
print(np.unique(self.df["bad_severity"] ))
|
713 |
+
|
714 |
+
|
715 |
+
logger.info("CREATING code_switching_flag COLUMN ...")
|
716 |
+
self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
|
717 |
+
print(np.unique(self.df["code_switching_flag"] ))
|
718 |
+
|
719 |
+
|
720 |
+
all_texts = self.df["review_text"].tolist()
|
721 |
+
tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
|
722 |
+
|
723 |
+
logger.info("CREATING grammar_error_score COLUMN ...")
|
724 |
+
self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
|
725 |
+
print(np.unique(self.df["grammar_error_score"] ))
|
726 |
+
|
727 |
+
|
728 |
+
logger.info("CREATING repetitive_words_count COLUMN ...")
|
729 |
+
self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
|
730 |
+
print(np.unique(self.df["repetitive_words_count"] ))
|
731 |
+
|
732 |
+
|
733 |
+
|
734 |
+
logger.info("CREATING similarity_to_other_reviews COLUMN ...")
|
735 |
+
similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
|
736 |
+
self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
|
737 |
+
|
738 |
+
print(np.unique(self.df["similarity_to_other_reviews"] ))
|
739 |
+
|
740 |
+
|
741 |
+
|
742 |
+
logger.info("CREATING friends COLUMN ...")
|
743 |
+
self.calculate_friend_count()
|
744 |
+
print(self.df["friends"].value_counts())
|
745 |
+
|
746 |
+
logger.info("CREATING elite COLUMN ...")
|
747 |
+
self.transform_elite_status()
|
748 |
+
print(self.df["elite"].value_counts())
|
749 |
+
|
750 |
+
|
751 |
+
logger.info("CREATING review_useful_funny_cool COLUMN ...")
|
752 |
+
self.calculate_review_useful_funny_cool()
|
753 |
+
print(self.df["review_useful_funny_cool"].value_counts())
|
754 |
+
|
755 |
+
|
756 |
+
logger.info("CREATING user_useful_funny_cool COLUMN ...")
|
757 |
+
self.calculate_user_useful_funny_cool()
|
758 |
+
print(self.df["user_useful_funny_cool"].value_counts())
|
759 |
+
|
760 |
+
|
761 |
+
logger.info("CREATING LABEL COLUMN ...")
|
762 |
+
self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
|
763 |
+
print(self.df["fake"].value_counts())
|
764 |
+
|
765 |
+
|
766 |
+
logger.info("SEEING NULL VALUES IN FINAL COLUMNS.....")
|
767 |
+
print(set(self.df.isnull().sum().values))
|
768 |
+
for col in self.df.columns:
|
769 |
+
if self.df[col].isnull().sum()>0:
|
770 |
+
print(f" {col} has {self.df[col].isnull().sum()} null values")
|
771 |
+
|
772 |
+
|
773 |
+
|
774 |
+
return self.df
|
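Taken together, run_pipeline turns the merged review/business/user table into behavioural, linguistic and graph features plus the heuristic "fake" label. A minimal usage sketch under assumed paths (the CSV name and module import are illustrative, not part of the commit):

import pandas as pd
from src.preprocessing import Preprocessor  # assumed module path within this repo

merged_df = pd.read_csv("data/merged_dataset.csv")    # hypothetical output of create_dataset.py
features_df = Preprocessor(merged_df).run_pipeline()  # adds engineered columns and the heuristic "fake" label
print(features_df["fake"].value_counts())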
src/__pycache__/analyze_yelp_data.cpython-311.pyc ADDED - Binary file (20.6 kB)
src/__pycache__/clean_data.cpython-311.pyc ADDED - Binary file (6.21 kB)
src/__pycache__/clean_data.cpython-39.pyc ADDED - Binary file (3.03 kB)
src/__pycache__/create_dataset.cpython-311.pyc ADDED - Binary file (7.58 kB)
src/__pycache__/create_dataset.cpython-39.pyc ADDED - Binary file (3.98 kB)
src/__pycache__/data_balancing.cpython-311.pyc ADDED - Binary file (12.6 kB)
src/__pycache__/feature_analyzer.cpython-311.pyc ADDED - Binary file (17.3 kB)
src/__pycache__/feature_analyzer.cpython-39.pyc ADDED - Binary file (8.73 kB)
src/__pycache__/feature_importance.cpython-311.pyc ADDED - Binary file (10.6 kB)
src/__pycache__/model.cpython-311.pyc ADDED - Binary file (13.9 kB)
src/__pycache__/model.cpython-39.pyc ADDED - Binary file (17.6 kB)
src/__pycache__/model1.cpython-311.pyc ADDED - Binary file (42.9 kB)
src/__pycache__/model1.cpython-39.pyc ADDED - Binary file (17.2 kB)
src/__pycache__/model3.cpython-311.pyc ADDED - Binary file (44 kB)
src/__pycache__/model3.cpython-39.pyc ADDED - Binary file (17.6 kB)
src/__pycache__/model_trainer.cpython-311.pyc ADDED - Binary file (2.31 kB)
src/__pycache__/model_trainer.cpython-39.pyc ADDED - Binary file (1.32 kB)
src/__pycache__/models.cpython-311.pyc ADDED - Binary file (45.6 kB)
src/__pycache__/preprocessing.cpython-311.pyc ADDED - Binary file (50.7 kB)
src/__pycache__/preprocessing.cpython-39.pyc ADDED - Binary file (24.4 kB)
src/analyze_yelp_data.py
ADDED
@@ -0,0 +1,320 @@
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from transformers import AutoTokenizer, AutoModel
|
4 |
+
import torch
|
5 |
+
from sklearn.ensemble import IsolationForest
|
6 |
+
from sklearn.preprocessing import StandardScaler
|
7 |
+
from textblob import TextBlob
|
8 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
9 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
+
from sklearn.decomposition import PCA
|
11 |
+
import warnings
|
12 |
+
from typing import Dict, List, Tuple
|
13 |
+
import logging
|
14 |
+
from collections import Counter
|
15 |
+
from detoxify import Detoxify
|
16 |
+
import re
|
17 |
+
from datetime import datetime
|
18 |
+
import seaborn as sns
|
19 |
+
import matplotlib.pyplot as plt
|
20 |
+
from pathlib import Path
|
21 |
+
import json
|
22 |
+
|
23 |
+
class AdvancedYelpAnalyzer:
|
24 |
+
def __init__(self, df: pd.DataFrame):
|
25 |
+
"""Initialize the analyzer with necessary models and configurations"""
|
26 |
+
self.df = df.copy()
|
27 |
+
self.bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
|
28 |
+
self.bert_model = AutoModel.from_pretrained('bert-base-uncased')
|
29 |
+
self.vader = SentimentIntensityAnalyzer()
|
30 |
+
self.toxic_model = Detoxify('original')
|
31 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
32 |
+
self.bert_model.to(self.device)
|
33 |
+
|
34 |
+
# Configure logging
|
35 |
+
logging.basicConfig(level=logging.INFO)
|
36 |
+
self.logger = logging.getLogger(__name__)
|
37 |
+
|
38 |
+
def get_bert_embeddings(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
|
39 |
+
"""Generate BERT embeddings for text"""
|
40 |
+
embeddings = []
|
41 |
+
|
42 |
+
for i in range(0, len(texts), batch_size):
|
43 |
+
batch_texts = texts[i:i + batch_size]
|
44 |
+
encoded = self.bert_tokenizer(batch_texts,
|
45 |
+
padding=True,
|
46 |
+
truncation=True,
|
47 |
+
max_length=512,
|
48 |
+
return_tensors='pt')
|
49 |
+
|
50 |
+
with torch.no_grad():
|
51 |
+
encoded = {k: v.to(self.device) for k, v in encoded.items()}
|
52 |
+
outputs = self.bert_model(**encoded)
|
53 |
+
batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
54 |
+
embeddings.append(batch_embeddings)
|
55 |
+
|
56 |
+
return np.vstack(embeddings)
|
57 |
+
|
58 |
+
def analyze_sentiment(self) -> pd.DataFrame:
|
59 |
+
"""Perform comprehensive sentiment analysis using multiple tools"""
|
60 |
+
self.logger.info("Starting sentiment analysis...")
|
61 |
+
|
62 |
+
# Calculate BERT embeddings for reviews
|
63 |
+
self.logger.info("Calculating BERT embeddings...")
|
64 |
+
review_texts = self.df['review_text'].fillna('').tolist()
|
65 |
+
bert_embeddings = self.get_bert_embeddings(review_texts)
|
66 |
+
|
67 |
+
# Calculate review length using BERT tokenizer
|
68 |
+
self.logger.info("Calculating tokenized lengths...")
|
69 |
+
self.df['review_length'] = self.df['review_text'].apply(
|
70 |
+
lambda x: len(self.bert_tokenizer.encode(str(x)))
|
71 |
+
)
|
72 |
+
|
73 |
+
# Store BERT embeddings mean and std as features
|
74 |
+
self.df['bert_embedding_mean'] = np.mean(bert_embeddings, axis=1)
|
75 |
+
self.df['bert_embedding_std'] = np.std(bert_embeddings, axis=1)
|
76 |
+
|
77 |
+
# TextBlob sentiment and subjectivity
|
78 |
+
self.df['textblob_polarity'] = self.df['review_text'].apply(
|
79 |
+
lambda x: TextBlob(str(x)).sentiment.polarity
|
80 |
+
)
|
81 |
+
self.df['textblob_subjectivity'] = self.df['review_text'].apply(
|
82 |
+
lambda x: TextBlob(str(x)).sentiment.subjectivity
|
83 |
+
)
|
84 |
+
|
85 |
+
# VADER sentiment with custom negative phrase handling
|
86 |
+
def get_enhanced_vader_scores(text):
|
87 |
+
# Custom negative phrases
|
88 |
+
negative_phrases = [
|
89 |
+
'too long', 'way too long', 'waiting', 'changed our minds',
|
90 |
+
'too many', 'took forever', 'took too long', 'waste of time',
|
91 |
+
'not worth', 'disappointing', 'mediocre', 'needs improvement'
|
92 |
+
]
|
93 |
+
|
94 |
+
# Get base VADER scores
|
95 |
+
base_scores = self.vader.polarity_scores(str(text))
|
96 |
+
|
97 |
+
# Check for negative phrases
|
98 |
+
text_lower = str(text).lower()
|
99 |
+
neg_count = sum(1 for phrase in negative_phrases if phrase in text_lower)
|
100 |
+
|
101 |
+
# Adjust scores if negative phrases are found
|
102 |
+
if neg_count > 0:
|
103 |
+
base_scores['neg'] = max(base_scores['neg'], min(0.7, neg_count * 0.2))
|
104 |
+
base_scores['compound'] *= (1 - (neg_count * 0.15))
|
105 |
+
# Readjust neutral score
|
106 |
+
base_scores['neu'] = max(0, 1 - base_scores['neg'] - base_scores['pos'])
|
107 |
+
|
108 |
+
return base_scores
|
109 |
+
|
110 |
+
# Apply enhanced VADER scoring
|
111 |
+
vader_scores = self.df['review_text'].apply(get_enhanced_vader_scores)
|
112 |
+
self.df['vader_compound'] = vader_scores.apply(lambda x: x['compound'])
|
113 |
+
self.df['vader_negative'] = vader_scores.apply(lambda x: x['neg'])
|
114 |
+
self.df['vader_positive'] = vader_scores.apply(lambda x: x['pos'])
|
115 |
+
self.df['vader_neutral'] = vader_scores.apply(lambda x: x['neu'])
|
116 |
+
|
117 |
+
# Calculate sentiment extremity
|
118 |
+
self.df['sentiment_extremity'] = self.df['vader_compound'].abs()
|
119 |
+
|
120 |
+
return self.df
|
121 |
+
|
122 |
+
def detect_anomalies(self) -> pd.DataFrame:
|
123 |
+
"""Detect anomalous reviews using Isolation Forest with BERT features"""
|
124 |
+
self.logger.info("Detecting anomalies...")
|
125 |
+
|
126 |
+
# Prepare features for anomaly detection
|
127 |
+
features = [
|
128 |
+
'review_stars',
|
129 |
+
'textblob_polarity',
|
130 |
+
'vader_compound',
|
131 |
+
'sentiment_extremity',
|
132 |
+
'review_length',
|
133 |
+
'bert_embedding_mean',
|
134 |
+
'bert_embedding_std'
|
135 |
+
]
|
136 |
+
|
137 |
+
# Ensure all features exist
|
138 |
+
missing_features = [f for f in features if f not in self.df.columns]
|
139 |
+
if missing_features:
|
140 |
+
self.analyze_sentiment()
|
141 |
+
|
142 |
+
# Standardize features
|
143 |
+
scaler = StandardScaler()
|
144 |
+
X = scaler.fit_transform(self.df[features])
|
145 |
+
|
146 |
+
# Apply Isolation Forest
|
147 |
+
iso_forest = IsolationForest(
|
148 |
+
contamination=0.1,
|
149 |
+
random_state=42,
|
150 |
+
n_jobs=-1
|
151 |
+
)
|
152 |
+
|
153 |
+
# Fit and predict
|
154 |
+
self.df['is_anomaly'] = iso_forest.fit_predict(X)
|
155 |
+
self.df['anomaly_score'] = iso_forest.score_samples(X)
|
156 |
+
|
157 |
+
return self.df
|
158 |
+
|
159 |
+
def detect_ai_generated_text(self) -> pd.DataFrame:
|
160 |
+
"""Estimate likelihood of AI-generated content"""
|
161 |
+
self.logger.info("Detecting AI-generated content...")
|
162 |
+
|
163 |
+
# Ensure sentiment analysis has been run
|
164 |
+
if 'textblob_subjectivity' not in self.df.columns:
|
165 |
+
self.analyze_sentiment()
|
166 |
+
|
167 |
+
# Use detoxify model to get toxicity scores
|
168 |
+
texts = self.df['review_text'].fillna('').tolist()
|
169 |
+
toxic_scores = self.toxic_model.predict(texts)
|
170 |
+
|
171 |
+
# Add scores to DataFrame
|
172 |
+
toxic_score_types = ['toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
|
173 |
+
'insult', 'threat', 'sexual_explicit']
|
174 |
+
for score_type in toxic_score_types:
|
175 |
+
if score_type in toxic_scores:
|
176 |
+
self.df[f'toxic_{score_type}'] = toxic_scores[score_type]
|
177 |
+
|
178 |
+
# Calculate AI generation likelihood based on various factors
|
179 |
+
self.df['ai_generated_likelihood'] = (
|
180 |
+
(self.df['textblob_subjectivity'] < 0.3) & # Low subjectivity
|
181 |
+
(self.df['sentiment_extremity'] > 0.8) & # Extreme sentiment
|
182 |
+
(self.df['review_length'] > self.df['review_length'].quantile(0.95)) & # Unusually long
|
183 |
+
(self.df['bert_embedding_std'] < self.df['bert_embedding_std'].quantile(0.25)) # Unusual language patterns
|
184 |
+
).astype(int)
|
185 |
+
|
186 |
+
# Add additional AI detection features
|
187 |
+
self.df['ai_detection_score'] = (
|
188 |
+
(self.df['textblob_subjectivity'] * -1) + # Lower subjectivity increases score
|
189 |
+
(self.df['sentiment_extremity'] * 0.5) + # Extreme sentiment contributes somewhat
|
190 |
+
(self.df['bert_embedding_std'] * -0.5) # Lower variation in embeddings increases score
|
191 |
+
).clip(0, 1) # Normalize between 0 and 1
|
192 |
+
|
193 |
+
return self.df
|
194 |
+
|
195 |
+
def analyze_business_categories(self) -> Dict:
|
196 |
+
"""Analyze trends and patterns specific to business categories"""
|
197 |
+
self.logger.info("Analyzing business categories...")
|
198 |
+
|
199 |
+
# Extract categories
|
200 |
+
categories = self.df['categories'].fillna('').str.split(', ')
|
201 |
+
all_categories = [cat for cats in categories if isinstance(cats, list) for cat in cats]
|
202 |
+
category_counts = Counter(all_categories)
|
203 |
+
|
204 |
+
# Analyze reviews by category
|
205 |
+
category_analysis = {}
|
206 |
+
for category in set(all_categories):
|
207 |
+
category_reviews = self.df[self.df['categories'].str.contains(category, na=False, regex=False)]  # regex=False: category names may contain regex metacharacters
|
208 |
+
|
209 |
+
category_analysis[category] = {
|
210 |
+
'review_count': len(category_reviews),
|
211 |
+
'avg_rating': category_reviews['review_stars'].mean() if not category_reviews.empty else None,
|
212 |
+
'avg_sentiment': category_reviews['vader_compound'].mean() if 'vader_compound' in self.df.columns and not category_reviews.empty else None,
|
213 |
+
'avg_subjectivity': category_reviews['textblob_subjectivity'].mean() if 'textblob_subjectivity' in self.df.columns and not category_reviews.empty else None
|
214 |
+
}
|
215 |
+
|
216 |
+
return category_analysis
|
217 |
+
|
218 |
+
def visualize_results(self, output_dir: str):
|
219 |
+
"""Create visualizations for analysis results"""
|
220 |
+
plt.figure(figsize=(15, 10))
|
221 |
+
|
222 |
+
# Sentiment Distribution
|
223 |
+
plt.subplot(2, 2, 1)
|
224 |
+
sns.histplot(data=self.df, x='vader_compound', bins=50)
|
225 |
+
plt.title('Sentiment Distribution')
|
226 |
+
|
227 |
+
# Review Volume Over Time
|
228 |
+
plt.subplot(2, 2, 2)
|
229 |
+
daily_reviews = self.df.groupby('review_date').size()
|
230 |
+
daily_reviews.plot()
|
231 |
+
plt.title('Review Volume Over Time')
|
232 |
+
|
233 |
+
# Anomaly Score Distribution
|
234 |
+
plt.subplot(2, 2, 3)
|
235 |
+
if 'anomaly_score' not in self.df.columns:
|
236 |
+
self.detect_anomalies()
|
237 |
+
sns.histplot(data=self.df, x='anomaly_score', bins=50)
|
238 |
+
plt.title('Anomaly Score Distribution')
|
239 |
+
|
240 |
+
# AI Generation Likelihood
|
241 |
+
plt.subplot(2, 2, 4)
|
242 |
+
if 'ai_generated_likelihood' not in self.df.columns:
|
243 |
+
self.detect_ai_generated_text()
|
244 |
+
sns.histplot(data=self.df, x='ai_generated_likelihood', bins=2)
|
245 |
+
plt.title('AI Generation Likelihood')
|
246 |
+
|
247 |
+
plt.tight_layout()
|
248 |
+
plt.savefig(f'{output_dir}/analysis_results.png')
|
249 |
+
plt.close()
|
250 |
+
|
251 |
+
def run_full_analysis(self, output_dir: str) -> Tuple[pd.DataFrame, Dict]:
|
252 |
+
"""Run complete analysis pipeline with detailed outputs"""
|
253 |
+
self.logger.info("Starting full analysis pipeline...")
|
254 |
+
|
255 |
+
# Create output directory if it doesn't exist
|
256 |
+
output_dir = Path(output_dir)
|
257 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
258 |
+
|
259 |
+
try:
|
260 |
+
# Run all analyses
|
261 |
+
self.analyze_sentiment()
|
262 |
+
self.detect_anomalies()
|
263 |
+
self.detect_ai_generated_text()
|
264 |
+
category_analysis = self.analyze_business_categories()
|
265 |
+
|
266 |
+
# Create visualizations
|
267 |
+
self.visualize_results(str(output_dir))
|
268 |
+
|
269 |
+
# Compile results
|
270 |
+
analysis_results = {
|
271 |
+
'category_analysis': category_analysis,
|
272 |
+
'sentiment_summary': {
|
273 |
+
'avg_sentiment': self.df['vader_compound'].mean(),
|
274 |
+
'positive_reviews': len(self.df[self.df['vader_compound'] > 0.5]),
|
275 |
+
'negative_reviews': len(self.df[self.df['vader_compound'] < -0.5]),
|
276 |
+
'neutral_reviews': len(self.df[abs(self.df['vader_compound']) <= 0.5])
|
277 |
+
},
|
278 |
+
'ai_detection_summary': {
|
279 |
+
'likely_ai_generated': len(self.df[self.df['ai_generated_likelihood'] == 1]),
|
280 |
+
'avg_ai_score': self.df['ai_detection_score'].mean()
|
281 |
+
},
|
282 |
+
'anomaly_summary': {
|
283 |
+
'anomalous_reviews': len(self.df[self.df['is_anomaly'] == -1]),
|
284 |
+
'avg_anomaly_score': self.df['anomaly_score'].mean()
|
285 |
+
}
|
286 |
+
}
|
287 |
+
|
288 |
+
# Save results
|
289 |
+
self.df.to_csv(output_dir / "analyzed_data.csv", index=False)
|
290 |
+
with open(output_dir / "analysis_results.json", 'w') as f:
|
291 |
+
json.dump(analysis_results, f, indent=4, default=str)  # default=str: summary values may be numpy scalars, which json cannot serialize natively
|
292 |
+
|
293 |
+
return self.df, analysis_results
|
294 |
+
|
295 |
+
except Exception as e:
|
296 |
+
self.logger.error(f"Error during analysis: {str(e)}")
|
297 |
+
raise
|
298 |
+
|
299 |
+
# For testing
|
300 |
+
if __name__ == "__main__":
|
301 |
+
# Set up logging
|
302 |
+
logging.basicConfig(level=logging.INFO)
|
303 |
+
logger = logging.getLogger(__name__)
|
304 |
+
|
305 |
+
try:
|
306 |
+
# Read test data
|
307 |
+
df = pd.read_csv("test_data.csv")
|
308 |
+
|
309 |
+
# Initialize analyzer
|
310 |
+
analyzer = AdvancedYelpAnalyzer(df)
|
311 |
+
|
312 |
+
# Run analysis
|
313 |
+
output_dir = "output"
|
314 |
+
analyzed_df, results = analyzer.run_full_analysis(output_dir)
|
315 |
+
|
316 |
+
logger.info("Analysis completed successfully!")
|
317 |
+
|
318 |
+
except Exception as e:
|
319 |
+
logger.error(f"Error during testing: {str(e)}")
|
320 |
+
raise
|
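The enhanced VADER scoring in analyze_sentiment floors the negative share and damps the compound score whenever service-complaint phrases appear. A standalone sketch of that adjustment, assuming vaderSentiment is installed (the shortened phrase list and function name are illustrative only):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

NEGATIVE_PHRASES = ["too long", "waiting", "took forever", "not worth", "disappointing"]

def adjusted_vader(text: str) -> dict:
    # Base VADER scores, then the heuristic adjustment mirroring get_enhanced_vader_scores above
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    neg_count = sum(phrase in text.lower() for phrase in NEGATIVE_PHRASES)
    if neg_count:
        scores["neg"] = max(scores["neg"], min(0.7, neg_count * 0.2))  # floor the negative share
        scores["compound"] *= (1 - neg_count * 0.15)                   # damp the compound score
        scores["neu"] = max(0, 1 - scores["neg"] - scores["pos"])      # renormalise the neutral share
    return scores

# "Fine food, but it took forever and was not worth the price" matches two phrases,
# so neg is floored at 0.4 and the compound score shrinks by 30% relative to plain VADER.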
src/clean_data.py
ADDED
@@ -0,0 +1,83 @@
# clean_yelp_data.py
from loguru import logger
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import json
from pathlib import Path
import logging
from scipy.stats import entropy
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
import os


class DataCleaner:
    def __init__(self, df, output_path, filename="preprocessed_cleaned.csv"):
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self):
        # Persist the cleaned DataFrame to <output_path>/<filename>
        Path(self.output_path).mkdir(parents=True, exist_ok=True)
        output_file = Path(self.output_path) / self.filename
        logger.info(f"Files saved in directory {output_file} as: {self.filename}")
        self.df.to_csv(output_file, index=False)

    def dropping_unnecessary_columns(self):
        # Drop identifier, free-text and compliment columns that are not used as model features
        columns_to_drop = [
            "review_text", "review_date", "business_name", "address", "city", "state",
            "postal_code", "categories", "user_name", "yelping_since", "checkin_date",
            "review_useful", "review_funny", "review_cool", "user_useful", "user_funny",
            "user_cool", "is_open", "compliment_hot", "compliment_more", "compliment_profile",
            "compliment_cute", "compliment_list", "compliment_note", "compliment_plain",
            "compliment_cool", "compliment_funny", "compliment_writer", "compliment_photos",
        ]
        self.df.drop(columns=columns_to_drop, inplace=True)

    def run_pipeline(self):
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unnecessary_columns()

        logger.info("Checking Again for NULL values in Columns")
        for col in self.df.columns:
            if self.df[col].isnull().sum() > 0:
                print(f" {col} has {self.df[col].isnull().sum()} null values")

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()
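A minimal sketch of how DataCleaner appears intended to be used on the feature table produced by the Preprocessor (the import path and file names are hypothetical):

import pandas as pd
from src.clean_data import DataCleaner  # assumed module path within this repo

features_df = pd.read_csv("data/preprocessed.csv")  # hypothetical feature table from preprocessing.py
DataCleaner(features_df, output_path="data", filename="preprocessed_cleaned.csv").run_pipeline()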
src/create_dataset.py
ADDED
@@ -0,0 +1,217 @@
1 |
+
import pandas as pd
|
2 |
+
import ujson as json
|
3 |
+
import gc
|
4 |
+
import numpy as np
|
5 |
+
from concurrent.futures import ProcessPoolExecutor
|
6 |
+
import multiprocessing as mp
|
7 |
+
from pymongo import MongoClient
|
8 |
+
from collections import defaultdict
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
# def read_json_parallel(file_path, num_workers=None):
|
12 |
+
# """Read JSON file using parallel processing"""
|
13 |
+
# if num_workers is None:
|
14 |
+
# num_workers = max(1, mp.cpu_count() - 1)
|
15 |
+
|
16 |
+
# print(f"Reading {file_path}...")
|
17 |
+
# # Read chunks and concatenate them into a single DataFrame
|
18 |
+
# df = pd.read_json(file_path, lines=True, dtype_backend="pyarrow", chunksize=100000)
|
19 |
+
# return next(df)
|
20 |
+
|
21 |
+
|
22 |
+
def read_data_mongo(file_path, num_workers=None):
|
23 |
+
"""Read JSON file using parallel processing"""
|
24 |
+
if num_workers is None:
|
25 |
+
num_workers = max(1, mp.cpu_count() - 1)
|
26 |
+
|
27 |
+
print(f"Reading {file_path}...")
|
28 |
+
conn_str = "mongodb://Mtalha:[email protected]/"
|
29 |
+
|
30 |
+
client = MongoClient(conn_str)
|
31 |
+
databases = client.list_database_names()
|
32 |
+
db_client=client["Yelp"]
|
33 |
+
|
34 |
+
# Read the entire file at once since chunksize isn't needed for parallel reading here
|
35 |
+
# Use 'records' orient if your JSON was saved with this format
|
36 |
+
try:
|
37 |
+
|
38 |
+
collection = db_client[file_path]
|
39 |
+
documents = collection.find({}, {"_id": 0})
|
40 |
+
data = list(documents)
|
41 |
+
final_dict=defaultdict(list)
|
42 |
+
|
43 |
+
for dictt in data:
|
44 |
+
for k,v in dictt.items():
|
45 |
+
final_dict[k].append(v)
|
46 |
+
df=pd.DataFrame(final_dict)
|
47 |
+
|
48 |
+
# df = pd.read_json(file_path, orient='records', dtype_backend="pyarrow")
|
49 |
+
except Exception as e:
|
50 |
+
# If 'records' doesn't work, try without specifying orient or with 'split'
|
51 |
+
# This is a fallback for different JSON structures
|
52 |
+
# df = pd.read_json(file_path, dtype_backend="pyarrow")
|
53 |
+
print("ERROR WHILE READING FILES FORM MONGODB AS : ",e)
|
54 |
+
print(f"Finished reading. DataFrame shape: {df.shape}")
|
55 |
+
return df
|
56 |
+
|
57 |
+
def process_datasets(output_path,filename):
|
58 |
+
# File paths
|
59 |
+
file_paths = {
|
60 |
+
'business': "yelp_academic_dataset_business",
|
61 |
+
'checkin': "yelp_academic_dataset_checkin",
|
62 |
+
'review': "yelp_academic_dataset_review",
|
63 |
+
'tip': "yelp_academic_dataset_tip",
|
64 |
+
'user': "yelp_academic_dataset_user",
|
65 |
+
'google': "google_review_dataset"
|
66 |
+
}
|
67 |
+
|
68 |
+
# Read datasets with progress tracking
|
69 |
+
print("Reading datasets...")
|
70 |
+
dfs = {}
|
71 |
+
for name, path in file_paths.items():
|
72 |
+
print(f"Processing {name} dataset...")
|
73 |
+
dfs[name] = read_data_mongo(path)
|
74 |
+
print(f"Finished reading {name} dataset. Shape: {dfs[name].shape}")
|
75 |
+
|
76 |
+
print("All files read. Starting column renaming...")
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
# Rename columns to avoid conflicts
|
87 |
+
# Reviews
|
88 |
+
dfs['review'] = dfs['review'].rename(columns={
|
89 |
+
'date': 'review_date',
|
90 |
+
'stars': 'review_stars',
|
91 |
+
'text': 'review_text',
|
92 |
+
'useful': 'review_useful',
|
93 |
+
'funny': 'review_funny',
|
94 |
+
'cool': 'review_cool'
|
95 |
+
})
|
96 |
+
# print("COLUMNS IN REVIEW DAFRA)
|
97 |
+
|
98 |
+
# Tips
|
99 |
+
dfs['tip'] = dfs['tip'].rename(columns={
|
100 |
+
'date': 'tip_date',
|
101 |
+
'text': 'tip_text',
|
102 |
+
'compliment_count': 'tip_compliment_count'
|
103 |
+
})
|
104 |
+
|
105 |
+
# Checkins
|
106 |
+
dfs['checkin'] = dfs['checkin'].rename(columns={
|
107 |
+
'date': 'checkin_date'
|
108 |
+
})
|
109 |
+
|
110 |
+
# Users
|
111 |
+
dfs['user'] = dfs['user'].rename(columns={
|
112 |
+
'name': 'user_name',
|
113 |
+
'review_count': 'user_review_count',
|
114 |
+
'useful': 'user_useful',
|
115 |
+
'funny': 'user_funny',
|
116 |
+
'cool': 'user_cool'
|
117 |
+
})
|
118 |
+
|
119 |
+
# Business
|
120 |
+
dfs['business'] = dfs['business'].rename(columns={
|
121 |
+
'name': 'business_name',
|
122 |
+
'stars': 'business_stars',
|
123 |
+
'review_count': 'business_review_count'
|
124 |
+
})
|
125 |
+
dfs['google'] = dfs['google'].rename(columns={
|
126 |
+
'name': 'business_name',
|
127 |
+
'stars': 'business_stars',
|
128 |
+
'review_count': 'business_review_count'
|
129 |
+
})
|
130 |
+
df_business_final= dfs['business']
|
131 |
+
df_google_final=dfs['google']
|
132 |
+
df_review_final=dfs['review']
|
133 |
+
df_tip_final=dfs['tip']
|
134 |
+
df_checkin_final=dfs['checkin']
|
135 |
+
df_user_final=dfs['user']
|
136 |
+
|
137 |
+
|
138 |
+
df_business_final=pd.concat([df_business_final,df_google_final],axis=0)
|
139 |
+
df_business_final.reset_index(drop=True,inplace=True)
|
140 |
+
|
141 |
+
|
142 |
+
|
143 |
+
|
144 |
+
print("Starting merge process...")
|
145 |
+
|
146 |
+
# Merge process with memory management
|
147 |
+
print("Step 1: Starting with reviews...")
|
148 |
+
merged_df = df_review_final
|
149 |
+
|
150 |
+
|
151 |
+
print("Step 2: Merging with business data...")
|
152 |
+
merged_df = merged_df.merge(
|
153 |
+
df_business_final,
|
154 |
+
on='business_id',
|
155 |
+
how='left'
|
156 |
+
)
|
157 |
+
|
158 |
+
|
159 |
+
print("Step 3: Merging with user data...")
|
160 |
+
merged_df = merged_df.merge(
|
161 |
+
df_user_final,
|
162 |
+
on='user_id',
|
163 |
+
how='left'
|
164 |
+
)
|
165 |
+
|
166 |
+
|
167 |
+
print("Step 4: Merging with checkin data...")
|
168 |
+
merged_df = merged_df.merge(
|
169 |
+
df_checkin_final,
|
170 |
+
on='business_id',
|
171 |
+
how='left'
|
172 |
+
)
|
173 |
+
|
174 |
+
|
175 |
+
print("Step 5: Aggregating and merging tip data...")
|
176 |
+
tip_agg = df_tip_final.groupby('business_id').agg({
|
177 |
+
'tip_compliment_count': 'sum',
|
178 |
+
'tip_text': 'count'
|
179 |
+
}).rename(columns={'tip_text': 'tip_count'})
|
180 |
+
|
181 |
+
merged_df = merged_df.merge(
|
182 |
+
tip_agg,
|
183 |
+
on='business_id',
|
184 |
+
how='left'
|
185 |
+
)
|
186 |
+
|
187 |
+
|
188 |
+
|
189 |
+
print("Filling NaN values...")
|
190 |
+
merged_df['tip_count'] = merged_df['tip_count'].fillna(0)
|
191 |
+
merged_df['tip_compliment_count'] = merged_df['tip_compliment_count'].fillna(0)
|
192 |
+
merged_df['checkin_date'] = merged_df['checkin_date'].fillna('')
|
193 |
+
merged_df["friends"].fillna(0,inplace=True)
|
194 |
+
|
195 |
+
for col in merged_df.columns:
|
196 |
+
if merged_df[col].isnull().sum()>0:
|
197 |
+
print(f" {col} has {merged_df[col].isnull().sum()} null values")
|
198 |
+
|
199 |
+
|
200 |
+
print("Shape of Merged Dataset is : ",merged_df.shape)
|
201 |
+
output_file = Path(output_path) / filename
|
202 |
+
print("COLUMNS BEFORE PREPROCESING")
|
203 |
+
print()
|
204 |
+
print(merged_df.info())
|
205 |
+
for col in merged_df.columns:
|
206 |
+
for v in merged_df[col]:
|
207 |
+
print(f"Type of values in {col} is {type(v)} and values are like : {v}")
|
208 |
+
break
|
209 |
+
merged_df.to_csv(output_file,index=False)
|
210 |
+
|
211 |
+
|
212 |
+
|
213 |
+
|
214 |
+
return merged_df
|
215 |
+
|
216 |
+
# if __name__ == "__main__":
|
217 |
+
# process_datasets()
|
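For reference, a minimal usage sketch for the dataset builder above; the ./data output directory and merged_dataset.csv filename are placeholders, and it assumes the MongoDB collections listed in file_paths are reachable:

from src.create_dataset import process_datasets

if __name__ == "__main__":
    # Reads the Yelp/Google collections from MongoDB, merges them, and writes one CSV.
    merged_df = process_datasets(output_path="./data", filename="merged_dataset.csv")
    print(merged_df.shape)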
src/feature_analyzer.py
ADDED
@@ -0,0 +1,212 @@
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import seaborn as sns
|
5 |
+
from pathlib import Path
|
6 |
+
from loguru import logger
|
7 |
+
|
8 |
+
class FeatureAnalyzer:
|
9 |
+
def __init__(self,df,output_path):
|
10 |
+
self.df=df
|
11 |
+
self.output_path=output_path
|
12 |
+
|
13 |
+
|
14 |
+
def plot_correlation_heatmap(self):
|
15 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
16 |
+
numeric_cols = self.df.select_dtypes(include=[np.number]).columns.drop('fake')
|
17 |
+
correlation_matrix = self.df[numeric_cols].corr()
|
18 |
+
plt.figure(figsize=(14, 12))
|
19 |
+
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, center=0)
|
20 |
+
plt.title('Correlation Heatmap of Numeric Features', fontsize=16)
|
21 |
+
plt.tight_layout()
|
22 |
+
output_file = Path(self.output_path) / 'correlation_heatmap.png'
|
23 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
24 |
+
plt.close()
|
25 |
+
logger.info(f"Saved correlation heatmap to {output_file}")
|
26 |
+
|
27 |
+
def plot_mean_by_fake_bar(self):
|
28 |
+
key_features = [
|
29 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
30 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
31 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
32 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
33 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
34 |
+
]
|
35 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
36 |
+
mean_by_fake = self.df.groupby('fake')[key_features].mean().T
|
37 |
+
mean_by_fake.columns = ['Genuine (0)', 'Fake (1)']
|
38 |
+
plt.figure(figsize=(12, 8))
|
39 |
+
mean_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
|
40 |
+
plt.title('Mean Feature Values by Fake Label', fontsize=16)
|
41 |
+
plt.xlabel('Features', fontsize=12)
|
42 |
+
plt.ylabel('Mean Value', fontsize=12)
|
43 |
+
plt.xticks(rotation=45, ha='right')
|
44 |
+
plt.legend(title='Fake Label')
|
45 |
+
plt.tight_layout()
|
46 |
+
output_file = Path(self.output_path) / 'mean_by_fake_bar.png'
|
47 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
48 |
+
plt.close()
|
49 |
+
logger.info(f"Saved mean by fake bar plot to {output_file}")
|
50 |
+
|
51 |
+
def plot_violin_plots(self):
|
52 |
+
key_features = [
|
53 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
54 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
55 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
56 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
57 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
58 |
+
]
|
59 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
60 |
+
plt.figure(figsize=(14, 10))
|
61 |
+
for i, feature in enumerate(key_features[:6], 1):
|
62 |
+
plt.subplot(2, 3, i)
|
63 |
+
sns.violinplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
|
64 |
+
plt.title(f'{feature} Distribution', fontsize=12)
|
65 |
+
plt.xlabel('Fake (0/1)', fontsize=10)
|
66 |
+
plt.tight_layout()
|
67 |
+
output_file = Path(self.output_path) / 'violin_plots.png'
|
68 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
69 |
+
plt.close()
|
70 |
+
logger.info(f"Saved violin plots to {output_file}")
|
71 |
+
|
72 |
+
def plot_box_plots(self):
|
73 |
+
key_features = [
|
74 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
75 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
76 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
77 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
78 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
79 |
+
]
|
80 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
81 |
+
plt.figure(figsize=(14, 10))
|
82 |
+
for i, feature in enumerate(key_features[6:11], 1):
|
83 |
+
plt.subplot(2, 3, i)
|
84 |
+
sns.boxplot(x='fake', y=feature, data=self.df, palette=['skyblue', 'salmon'])
|
85 |
+
plt.title(f'{feature} Distribution', fontsize=12)
|
86 |
+
plt.xlabel('Fake (0/1)', fontsize=10)
|
87 |
+
plt.tight_layout()
|
88 |
+
output_file = Path(self.output_path) / 'box_plots.png'
|
89 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
90 |
+
plt.close()
|
91 |
+
logger.info(f"Saved box plots to {output_file}")
|
92 |
+
|
93 |
+
def plot_scatter_review_grammar(self):
|
94 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
95 |
+
plt.figure(figsize=(10, 6))
|
96 |
+
sns.scatterplot(x='review_stars', y='grammar_error_score', hue='fake', data=self.df, palette=['blue', 'red'], alpha=0.5)
|
97 |
+
plt.title('Review Stars vs Grammar Error Score by Fake Label', fontsize=16)
|
98 |
+
plt.xlabel('Review Stars', fontsize=12)
|
99 |
+
plt.ylabel('Grammar Error Score', fontsize=12)
|
100 |
+
plt.legend(title='Fake')
|
101 |
+
plt.tight_layout()
|
102 |
+
output_file = Path(self.output_path) / 'scatter_review_grammar.png'
|
103 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
104 |
+
plt.close()
|
105 |
+
logger.info(f"Saved scatter plot to {output_file}")
|
106 |
+
|
107 |
+
def plot_density_plots(self):
|
108 |
+
key_features = [
|
109 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
110 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
111 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
112 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
113 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
114 |
+
]
|
115 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
116 |
+
plt.figure(figsize=(14, 10))
|
117 |
+
for i, feature in enumerate(key_features[:4], 1):
|
118 |
+
plt.subplot(2, 2, i)
|
119 |
+
for label in [0, 1]:
|
120 |
+
subset = self.df[self.df['fake'] == label]
|
121 |
+
sns.kdeplot(subset[feature], label=f'Fake={label}', fill=True, alpha=0.5)
|
122 |
+
plt.title(f'{feature} Density', fontsize=12)
|
123 |
+
plt.xlabel(feature, fontsize=10)
|
124 |
+
plt.legend()
|
125 |
+
plt.tight_layout()
|
126 |
+
output_file = Path(self.output_path) / 'density_plots.png'
|
127 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
128 |
+
plt.close()
|
129 |
+
logger.info(f"Saved density plots to {output_file}")
|
130 |
+
|
131 |
+
def plot_stacked_bar_similarity(self):
|
132 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
133 |
+
bins = pd.cut(self.df['similarity_to_other_reviews'], bins=10)
|
134 |
+
stacked_data = self.df.groupby([bins, 'fake']).size().unstack(fill_value=0)
|
135 |
+
stacked_data = stacked_data.div(stacked_data.sum(axis=1), axis=0)
|
136 |
+
plt.figure(figsize=(12, 8))
|
137 |
+
stacked_data.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], width=0.8)
|
138 |
+
plt.title('Proportion of Fake by Similarity to Other Reviews Bins', fontsize=16)
|
139 |
+
plt.xlabel('Similarity Bins', fontsize=12)
|
140 |
+
plt.ylabel('Proportion', fontsize=12)
|
141 |
+
plt.legend(['Genuine (0)', 'Fake (1)'], title='Fake Label')
|
142 |
+
plt.xticks(rotation=45, ha='right')
|
143 |
+
plt.tight_layout()
|
144 |
+
output_file = Path(self.output_path) / 'stacked_bar_similarity.png'
|
145 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
146 |
+
plt.close()
|
147 |
+
logger.info(f"Saved stacked bar plot to {output_file}")
|
148 |
+
|
149 |
+
def plot_pie_fake_distribution(self):
|
150 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
151 |
+
fake_counts = self.df['fake'].value_counts()
|
152 |
+
plt.figure(figsize=(8, 8))
|
153 |
+
plt.pie(fake_counts, labels=['Genuine (0)', 'Fake (1)'], colors=['skyblue', 'salmon'], autopct='%1.1f%%', startangle=90)
|
154 |
+
plt.title('Distribution of Fake Labels', fontsize=16)
|
155 |
+
plt.axis('equal')
|
156 |
+
output_file = Path(self.output_path) / 'pie_fake_distribution.png'
|
157 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
158 |
+
plt.close()
|
159 |
+
logger.info(f"Saved pie chart to {output_file}")
|
160 |
+
|
161 |
+
def plot_count_code_switching(self):
|
162 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
163 |
+
plt.figure(figsize=(8, 6))
|
164 |
+
sns.countplot(x='code_switching_flag', hue='fake', data=self.df, palette=['skyblue', 'salmon'])
|
165 |
+
plt.title('Count of Fake by Code Switching Flag', fontsize=16)
|
166 |
+
plt.xlabel('Code Switching Flag (0/1)', fontsize=12)
|
167 |
+
plt.ylabel('Count', fontsize=12)
|
168 |
+
plt.legend(title='Fake Label')
|
169 |
+
plt.tight_layout()
|
170 |
+
output_file = Path(self.output_path) / 'count_code_switching.png'
|
171 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
172 |
+
plt.close()
|
173 |
+
logger.info(f"Saved count plot to {output_file}")
|
174 |
+
|
175 |
+
def plot_variance_by_fake_bar(self):
|
176 |
+
key_features = [
|
177 |
+
'review_stars', 'business_stars', 'business_review_count', 'user_review_count',
|
178 |
+
'friends', 'fans', 'average_stars', 'tip_compliment_count', 'tip_count',
|
179 |
+
'time_since_last_review_user', 'user_account_age', 'pronoun_density',
|
180 |
+
'grammar_error_score', 'repetitive_words_count', 'similarity_to_other_reviews',
|
181 |
+
'review_useful_funny_cool', 'user_useful_funny_cool', 'sentiment_polarity'
|
182 |
+
]
|
183 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
184 |
+
variance_by_fake = self.df.groupby('fake')[key_features].var().T
|
185 |
+
variance_by_fake.columns = ['Genuine (0)', 'Fake (1)']
|
186 |
+
plt.figure(figsize=(12, 8))
|
187 |
+
variance_by_fake.plot(kind='bar', color=['skyblue', 'salmon'], width=0.8)
|
188 |
+
plt.title('Feature Variance by Fake Label', fontsize=16)
|
189 |
+
plt.xlabel('Features', fontsize=12)
|
190 |
+
plt.ylabel('Variance', fontsize=12)
|
191 |
+
plt.xticks(rotation=45, ha='right')
|
192 |
+
plt.legend(title='Fake Label')
|
193 |
+
plt.tight_layout()
|
194 |
+
output_file = Path(self.output_path) / 'variance_by_fake_bar.png'
|
195 |
+
plt.savefig(output_file, dpi=300, bbox_inches='tight')
|
196 |
+
plt.close()
|
197 |
+
logger.info(f"Saved variance bar plot to {output_file}")
|
198 |
+
|
199 |
+
def run_pipeline(self):
|
200 |
+
|
201 |
+
sns.set(style="whitegrid")
|
202 |
+
plt.rcParams['figure.figsize'] = (12, 8)
|
203 |
+
self.plot_correlation_heatmap()
|
204 |
+
self.plot_mean_by_fake_bar()
|
205 |
+
self.plot_violin_plots()
|
206 |
+
self.plot_box_plots()
|
207 |
+
self.plot_scatter_review_grammar()
|
208 |
+
self.plot_density_plots()
|
209 |
+
self.plot_stacked_bar_similarity()
|
210 |
+
self.plot_pie_fake_distribution()
|
211 |
+
self.plot_count_code_switching()
|
212 |
+
self.plot_variance_by_fake_bar()
|
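A minimal sketch of how the FeatureAnalyzer above might be driven; the CSV path and plot directory are placeholders, and the DataFrame must contain the 'fake' label plus the engineered feature columns referenced in key_features:

import pandas as pd
from src.feature_analyzer import FeatureAnalyzer

# Load the merged/preprocessed dataset and write all EDA plots to ./plots.
df = pd.read_csv("./data/preprocessed_dataset.csv")
FeatureAnalyzer(df, output_path="./plots").run_pipeline()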
src/model.py
ADDED
@@ -0,0 +1,540 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torch_geometric.data import HeteroData
|
5 |
+
import numpy as np
|
6 |
+
import pandas as pd
|
7 |
+
import networkx as nx
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
import seaborn as sns
|
10 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
|
11 |
+
from sklearn.model_selection import train_test_split
|
12 |
+
from pathlib import Path
|
13 |
+
from datetime import datetime
|
14 |
+
from loguru import logger
|
15 |
+
|
16 |
+
# Temporal Edge Features Function
|
17 |
+
def create_temporal_edge_features(time_since_src, time_since_tgt, user_i, user_j):
|
18 |
+
delta_t = torch.abs(time_since_src - time_since_tgt).float()
|
19 |
+
hour_scale = torch.sin(delta_t / 3600)
|
20 |
+
day_scale = torch.sin(delta_t / (24 * 3600))
|
21 |
+
week_scale = torch.sin(delta_t / (7 * 24 * 3600))
|
22 |
+
same_user = (user_i == user_j).float()
|
23 |
+
burst_feature = same_user * torch.exp(-delta_t / (24 * 3600))
|
24 |
+
return torch.stack([hour_scale, day_scale, week_scale, burst_feature], dim=-1)
|
25 |
+
|
26 |
+
# Custom Multihead Attention (unchanged)
|
27 |
+
class CustomMultiheadAttention(nn.Module):
|
28 |
+
def __init__(self, embed_dim, num_heads):
|
29 |
+
super().__init__()
|
30 |
+
self.embed_dim = embed_dim
|
31 |
+
self.num_heads = num_heads
|
32 |
+
self.head_dim = embed_dim // num_heads
|
33 |
+
|
34 |
+
assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
|
35 |
+
|
36 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim)
|
37 |
+
self.k_proj = nn.Linear(embed_dim, embed_dim)
|
38 |
+
self.v_proj = nn.Linear(embed_dim, embed_dim)
|
39 |
+
self.out_proj = nn.Linear(embed_dim, embed_dim)
|
40 |
+
|
41 |
+
self.scale = self.head_dim ** -0.5
|
42 |
+
|
43 |
+
def forward(self, query, key, value, attn_bias=None):
|
44 |
+
batch_size, seq_len, embed_dim = query.size()
|
45 |
+
q = self.q_proj(query)
|
46 |
+
k = self.k_proj(key)
|
47 |
+
v = self.v_proj(value)
|
48 |
+
q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
49 |
+
k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
50 |
+
v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
51 |
+
scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
|
52 |
+
if attn_bias is not None:
|
53 |
+
scores = scores + attn_bias.unsqueeze(1)
|
54 |
+
attn = F.softmax(scores, dim=-1)
|
55 |
+
out = torch.matmul(attn, v)
|
56 |
+
out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)
|
57 |
+
out = self.out_proj(out)
|
58 |
+
return out, attn
|
59 |
+
|
60 |
+
# HeteroGraphormer (unchanged)
|
61 |
+
class HeteroGraphormer(nn.Module):
|
62 |
+
def __init__(self, hidden_dim, output_dim, num_heads=4, edge_dim=4):
|
63 |
+
super().__init__()
|
64 |
+
self.hidden_dim = hidden_dim
|
65 |
+
|
66 |
+
self.embed_dict = nn.ModuleDict({
|
67 |
+
'user': nn.Linear(14, hidden_dim),
|
68 |
+
'business': nn.Linear(8, hidden_dim),
|
69 |
+
'review': nn.Linear(16, hidden_dim)
|
70 |
+
})
|
71 |
+
|
72 |
+
self.edge_proj = nn.Linear(edge_dim, hidden_dim)
|
73 |
+
|
74 |
+
self.gru_user = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
|
75 |
+
self.gru_business = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
|
76 |
+
self.gru_review = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
|
77 |
+
|
78 |
+
self.attention1 = CustomMultiheadAttention(hidden_dim, num_heads)
|
79 |
+
self.attention2 = CustomMultiheadAttention(hidden_dim, num_heads)
|
80 |
+
|
81 |
+
self.ffn1 = nn.Sequential(
|
82 |
+
nn.Linear(hidden_dim, hidden_dim * 4),
|
83 |
+
nn.ReLU(),
|
84 |
+
nn.Dropout(0.1),
|
85 |
+
nn.Linear(hidden_dim * 4, hidden_dim)
|
86 |
+
)
|
87 |
+
self.ffn2 = nn.Sequential(
|
88 |
+
nn.Linear(hidden_dim, hidden_dim * 4),
|
89 |
+
nn.ReLU(),
|
90 |
+
nn.Dropout(0.1),
|
91 |
+
nn.Linear(hidden_dim * 4, hidden_dim)
|
92 |
+
)
|
93 |
+
|
94 |
+
self.norm1 = nn.LayerNorm(hidden_dim)
|
95 |
+
self.norm2 = nn.LayerNorm(hidden_dim)
|
96 |
+
self.norm3 = nn.LayerNorm(hidden_dim)
|
97 |
+
self.norm4 = nn.LayerNorm(hidden_dim)
|
98 |
+
|
99 |
+
self.centrality_proj = nn.Linear(1, hidden_dim)
|
100 |
+
|
101 |
+
self.classifier = nn.Sequential(
|
102 |
+
nn.Linear(hidden_dim * 3, hidden_dim),
|
103 |
+
nn.ReLU(),
|
104 |
+
nn.Dropout(0.1),
|
105 |
+
nn.Linear(hidden_dim, 1)
|
106 |
+
)
|
107 |
+
|
108 |
+
self.dropout = nn.Dropout(0.1)
|
109 |
+
|
110 |
+
def time_aware_aggregation(self, x, time_since, decay_rate=0.1):
|
111 |
+
weights = torch.exp(-decay_rate * time_since.unsqueeze(-1))
|
112 |
+
return x * weights
|
113 |
+
|
114 |
+
def forward(self, data, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict):
|
115 |
+
x_dict = {}
|
116 |
+
for node_type in data.x_dict:
|
117 |
+
x = self.embed_dict[node_type](data[node_type].x)
|
118 |
+
if node_type in time_since_dict:
|
119 |
+
x = self.time_aware_aggregation(x, time_since_dict[node_type])
|
120 |
+
x_dict[node_type] = x
|
121 |
+
|
122 |
+
x = torch.cat([x_dict['user'], x_dict['business'], x_dict['review']], dim=0)
|
123 |
+
|
124 |
+
centrality = self.centrality_proj(centrality_encoding)
|
125 |
+
x = x + centrality
|
126 |
+
|
127 |
+
x = x.unsqueeze(0)
|
128 |
+
|
129 |
+
x_user = x[:, :data['user'].x.size(0), :]
|
130 |
+
x_business = x[:, data['user'].x.size(0):data['user'].x.size(0) + data['business'].x.size(0), :]
|
131 |
+
x_review = x[:, data['user'].x.size(0) + data['business'].x.size(0):, :]
|
132 |
+
|
133 |
+
x_user, _ = self.gru_user(x_user)
|
134 |
+
x_business, _ = self.gru_business(x_business)
|
135 |
+
x_review, _ = self.gru_review(x_review)
|
136 |
+
|
137 |
+
x = torch.cat([x_user, x_business, x_review], dim=1)
|
138 |
+
|
139 |
+
total_nodes = x.size(1)
|
140 |
+
attn_bias = torch.zeros(1, total_nodes, total_nodes, device=x.device)
|
141 |
+
attn_bias[0] = -spatial_encoding
|
142 |
+
|
143 |
+
for edge_type in edge_features_dict:
|
144 |
+
edge_index = data[edge_type].edge_index
|
145 |
+
edge_feats = self.edge_proj(edge_features_dict[edge_type])
|
146 |
+
for i, (src, tgt) in enumerate(edge_index.t()):
|
147 |
+
attn_bias[0, src, tgt] += edge_feats[i].sum()
|
148 |
+
|
149 |
+
residual = x
|
150 |
+
x, _ = self.attention1(x, x, x, attn_bias=attn_bias)
|
151 |
+
x = self.norm1(x + residual)
|
152 |
+
x = self.dropout(x)
|
153 |
+
|
154 |
+
residual = x
|
155 |
+
x = self.ffn1(x)
|
156 |
+
x = self.norm2(x + residual)
|
157 |
+
x = self.dropout(x)
|
158 |
+
|
159 |
+
residual = x
|
160 |
+
x, _ = self.attention2(x, x, x, attn_bias=attn_bias)
|
161 |
+
x = self.norm3(x + residual)
|
162 |
+
x = self.dropout(x)
|
163 |
+
|
164 |
+
residual = x
|
165 |
+
x = self.ffn2(x)
|
166 |
+
x = self.norm4(x + residual)
|
167 |
+
x = self.dropout(x)
|
168 |
+
|
169 |
+
x = x.squeeze(0)
|
170 |
+
|
171 |
+
user_start = 0
|
172 |
+
business_start = data['user'].x.size(0)
|
173 |
+
review_start = business_start + data['business'].x.size(0)
|
174 |
+
|
175 |
+
h_user = x[user_start:business_start]
|
176 |
+
h_business = x[business_start:review_start]
|
177 |
+
h_review = x[review_start:]
|
178 |
+
|
179 |
+
user_indices = data['user', 'writes', 'review'].edge_index[0]
|
180 |
+
business_indices = data['review', 'about', 'business'].edge_index[1]
|
181 |
+
review_indices = data['user', 'writes', 'review'].edge_index[1]
|
182 |
+
|
183 |
+
h_user_mapped = h_user[user_indices]
|
184 |
+
h_business_mapped = h_business[business_indices]
|
185 |
+
h_review_mapped = h_review[review_indices]
|
186 |
+
|
187 |
+
combined = torch.cat([h_review_mapped, h_user_mapped, h_business_mapped], dim=-1)
|
188 |
+
|
189 |
+
logits = self.classifier(combined)
|
190 |
+
return torch.sigmoid(logits)
|
191 |
+
|
192 |
+
# Updated GraphformerModel with Plotting
|
193 |
+
class GraphformerModel:
|
194 |
+
def __init__(self, df, output_path, epochs, test_size=0.3):
|
195 |
+
self.df_whole = df
|
196 |
+
self.output_path = output_path
|
197 |
+
self.output_path = Path(self.output_path) / "GraphformerModel"
|
198 |
+
self.epochs = epochs
|
199 |
+
self.df, self.test_df = train_test_split(self.df_whole, test_size=test_size, random_state=42)
|
200 |
+
|
201 |
+
torch.manual_seed(42)
|
202 |
+
np.random.seed(42)
|
203 |
+
|
204 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
205 |
+
|
206 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
207 |
+
self.model = HeteroGraphormer(hidden_dim=64, output_dim=1, edge_dim=4).to(self.device)
|
208 |
+
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.005)
|
209 |
+
self.criterion = nn.BCELoss()
|
210 |
+
|
211 |
+
def compute_graph_encodings(self, data):
|
212 |
+
G = nx.DiGraph()
|
213 |
+
node_offset = 0
|
214 |
+
node_type_map = {}
|
215 |
+
|
216 |
+
for node_type in ['user', 'business', 'review']:
|
217 |
+
num_nodes = data[node_type].x.size(0)
|
218 |
+
for i in range(num_nodes):
|
219 |
+
G.add_node(node_offset + i)
|
220 |
+
node_type_map[node_offset + i] = node_type
|
221 |
+
node_offset += num_nodes
|
222 |
+
|
223 |
+
edge_types = [('user', 'writes', 'review'), ('review', 'about', 'business')]
|
224 |
+
for src_type, rel, tgt_type in edge_types:
|
225 |
+
edge_index = data[src_type, rel, tgt_type].edge_index
|
226 |
+
src_nodes = edge_index[0].tolist()
|
227 |
+
tgt_nodes = edge_index[1].tolist()
|
228 |
+
src_offset = 0 if src_type == 'user' else (self.num_users if src_type == 'business' else self.num_users + self.num_businesses)
|
229 |
+
tgt_offset = 0 if tgt_type == 'user' else (self.num_users if tgt_type == 'business' else self.num_users + self.num_businesses)
|
230 |
+
for src, tgt in zip(src_nodes, tgt_nodes):
|
231 |
+
G.add_edge(src + src_offset, tgt + tgt_offset)
|
232 |
+
|
233 |
+
num_nodes = G.number_of_nodes()
|
234 |
+
spatial_encoding = torch.full((num_nodes, num_nodes), float('inf'), device=self.device)
|
235 |
+
for i in range(num_nodes):
|
236 |
+
for j in range(num_nodes):
|
237 |
+
if i == j:
|
238 |
+
spatial_encoding[i, j] = 0
|
239 |
+
elif nx.has_path(G, i, j):
|
240 |
+
spatial_encoding[i, j] = nx.shortest_path_length(G, i, j)
|
241 |
+
|
242 |
+
centrality_encoding = torch.tensor([G.degree(i) for i in range(num_nodes)], dtype=torch.float, device=self.device).view(-1, 1)
|
243 |
+
|
244 |
+
return spatial_encoding, centrality_encoding, node_type_map
|
245 |
+
|
246 |
+
def compute_metrics(self, y_true, y_pred, y_prob, prefix=""):
|
247 |
+
metrics = {}
|
248 |
+
metrics[f"{prefix}accuracy"] = accuracy_score(y_true, y_pred)
|
249 |
+
metrics[f"{prefix}precision"] = precision_score(y_true, y_pred, zero_division=0)
|
250 |
+
metrics[f"{prefix}recall"] = recall_score(y_true, y_pred, zero_division=0)
|
251 |
+
metrics[f"{prefix}f1"] = f1_score(y_true, y_pred, zero_division=0)
|
252 |
+
metrics[f"{prefix}auc_roc"] = roc_auc_score(y_true, y_prob)
|
253 |
+
metrics[f"{prefix}conf_matrix"] = confusion_matrix(y_true, y_pred)
|
254 |
+
metrics[f"{prefix}class_report"] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
|
255 |
+
return metrics
|
256 |
+
|
257 |
+
def run_model(self):
|
258 |
+
features = torch.tensor(self.df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
|
259 |
+
y = torch.tensor(self.df['fake'].values, dtype=torch.float, device=self.device)
|
260 |
+
time_since_user = torch.tensor(self.df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
|
261 |
+
time_since_business = torch.tensor(self.df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
|
262 |
+
num_rows = len(self.df)
|
263 |
+
|
264 |
+
graph = HeteroData()
|
265 |
+
|
266 |
+
self.num_users = len(self.df['user_id'].unique())
|
267 |
+
self.num_businesses = len(self.df['business_id'].unique())
|
268 |
+
|
269 |
+
user_indices = torch.tensor(self.df['user_id'].map({uid: i for i, uid in enumerate(self.df['user_id'].unique())}).values, dtype=torch.long, device=self.device)
|
270 |
+
business_indices = torch.tensor(self.df['business_id'].map({bid: i for i, bid in enumerate(self.df['business_id'].unique())}).values, dtype=torch.long, device=self.device)
|
271 |
+
review_indices = torch.arange(num_rows, dtype=torch.long, device=self.device)
|
272 |
+
|
273 |
+
user_feats = torch.zeros(self.num_users, 14, device=self.device)
|
274 |
+
business_feats = torch.zeros(self.num_businesses, 8, device=self.device)
|
275 |
+
review_feats = torch.zeros(num_rows, 16, device=self.device)
|
276 |
+
|
277 |
+
user_cols = ['hours', 'user_review_count', 'elite', 'friends', 'fans', 'average_stars',
|
278 |
+
'time_since_last_review_user', 'user_account_age', 'user_degree',
|
279 |
+
'user_review_burst_count', 'review_like_ratio', 'latest_checkin_hours',
|
280 |
+
'user_useful_funny_cool', 'rating_variance_user']
|
281 |
+
business_cols = ['latitude', 'longitude', 'business_stars', 'business_review_count',
|
282 |
+
'time_since_last_review_business', 'business_degree',
|
283 |
+
'business_review_burst_count', 'rating_deviation_from_business_average']
|
284 |
+
review_cols = ['review_stars', 'tip_compliment_count', 'tip_count', 'average_time_between_reviews',
|
285 |
+
'temporal_similarity', 'pronoun_density', 'avg_sentence_length',
|
286 |
+
'excessive_punctuation_count', 'sentiment_polarity', 'good_severity',
|
287 |
+
'bad_severity', 'code_switching_flag', 'grammar_error_score',
|
288 |
+
'repetitive_words_count', 'similarity_to_other_reviews', 'review_useful_funny_cool']
|
289 |
+
|
290 |
+
for i in range(len(self.df)):
|
291 |
+
user_idx = user_indices[i]
|
292 |
+
business_idx = business_indices[i]
|
293 |
+
user_feats[user_idx] += features[i, :14]
|
294 |
+
business_feats[business_idx] += features[i, 14:22]
|
295 |
+
review_feats = features[:, 22:38]
|
296 |
+
|
297 |
+
graph['user'].x = user_feats
|
298 |
+
graph['business'].x = business_feats
|
299 |
+
graph['review'].x = review_feats
|
300 |
+
graph['review'].y = y
|
301 |
+
|
302 |
+
graph['user', 'writes', 'review'].edge_index = torch.stack([user_indices, review_indices], dim=0)
|
303 |
+
graph['review', 'about', 'business'].edge_index = torch.stack([review_indices, business_indices], dim=0)
|
304 |
+
|
305 |
+
edge_features_dict = {}
|
306 |
+
user_writes_edge = graph['user', 'writes', 'review'].edge_index
|
307 |
+
review_about_edge = graph['review', 'about', 'business'].edge_index
|
308 |
+
|
309 |
+
src_users = user_indices[user_writes_edge[0]]
|
310 |
+
tgt_reviews = review_indices[user_writes_edge[1]]
|
311 |
+
edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
|
312 |
+
time_since_user[src_users], time_since_user[tgt_reviews], src_users, src_users
|
313 |
+
)
|
314 |
+
|
315 |
+
src_reviews = review_indices[review_about_edge[0]]
|
316 |
+
tgt_businesses = business_indices[review_about_edge[1]]
|
317 |
+
edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
|
318 |
+
time_since_business[src_reviews], time_since_business[tgt_businesses],
|
319 |
+
torch.zeros_like(src_reviews), torch.zeros_like(src_reviews)
|
320 |
+
)
|
321 |
+
|
322 |
+
user_time_since = self.df.groupby('user_id')['time_since_last_review_user'].min().reindex(
|
323 |
+
self.df['user_id'].unique(), fill_value=0).values
|
324 |
+
time_since_dict = {
|
325 |
+
'user': torch.tensor(user_time_since, dtype=torch.float, device=self.device)
|
326 |
+
}
|
327 |
+
|
328 |
+
spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
|
329 |
+
|
330 |
+
# Training with metrics history
|
331 |
+
self.model.train()
|
332 |
+
train_metrics_history = []
|
333 |
+
for epoch in range(self.epochs):
|
334 |
+
self.optimizer.zero_grad()
|
335 |
+
out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
|
336 |
+
loss = self.criterion(out.squeeze(), y)
|
337 |
+
loss.backward()
|
338 |
+
self.optimizer.step()
|
339 |
+
|
340 |
+
pred_labels = (out.squeeze() > 0.5).float()
|
341 |
+
|
342 |
+
probs = out.squeeze().detach().cpu().numpy()
|
343 |
+
train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels.cpu().numpy(), probs, prefix="train_")
|
344 |
+
train_metrics['loss'] = loss.item()
|
345 |
+
train_metrics_history.append(train_metrics)
|
346 |
+
|
347 |
+
if epoch % 10 == 0:
|
348 |
+
logger.info(f"Epoch {epoch}, Loss: {loss.item():.4f}, Accuracy: {train_metrics['train_accuracy']:.4f}, F1: {train_metrics['train_f1']:.4f}")
|
349 |
+
|
350 |
+
# Save model
|
351 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
352 |
+
model_save_path = Path(self.output_path) / "model_GraphformerModel_latest.pth"
|
353 |
+
torch.save(self.model.state_dict(), model_save_path)
|
354 |
+
|
355 |
+
# Testing
|
356 |
+
if self.test_df is not None:
|
357 |
+
test_features = torch.tensor(self.test_df.drop(columns=['user_id', 'review_id', 'business_id', 'fake']).values, dtype=torch.float, device=self.device)
|
358 |
+
test_y = torch.tensor(self.test_df['fake'].values, dtype=torch.float, device=self.device)
|
359 |
+
test_time_since_user = torch.tensor(self.test_df['time_since_last_review_user'].values, dtype=torch.float, device=self.device)
|
360 |
+
test_time_since_business = torch.tensor(self.test_df['time_since_last_review_business'].values, dtype=torch.float, device=self.device)
|
361 |
+
num_test_rows = len(self.test_df)
|
362 |
+
|
363 |
+
new_user_unique = self.test_df['user_id'].unique()
|
364 |
+
new_business_unique = self.test_df['business_id'].unique()
|
365 |
+
|
366 |
+
existing_user_ids = list(self.df['user_id'].unique())
|
367 |
+
user_mapping = {uid: i for i, uid in enumerate(existing_user_ids)}
|
368 |
+
total_users = self.num_users
|
369 |
+
for uid in new_user_unique:
|
370 |
+
if uid not in user_mapping:
|
371 |
+
user_mapping[uid] = total_users
|
372 |
+
total_users += 1
|
373 |
+
|
374 |
+
existing_business_ids = list(self.df['business_id'].unique())
|
375 |
+
business_mapping = {bid: i for i, bid in enumerate(existing_business_ids)}
|
376 |
+
total_businesses = self.num_businesses
|
377 |
+
for bid in new_business_unique:
|
378 |
+
if bid not in business_mapping:
|
379 |
+
business_mapping[bid] = total_businesses
|
380 |
+
total_businesses += 1
|
381 |
+
|
382 |
+
new_user_indices = torch.tensor([user_mapping[uid] for uid in self.test_df['user_id']], dtype=torch.long, device=self.device)
|
383 |
+
new_business_indices = torch.tensor([business_mapping[bid] for bid in self.test_df['business_id']], dtype=torch.long, device=self.device)
|
384 |
+
new_review_indices = torch.arange(num_rows, num_rows + num_test_rows, device=self.device)
|
385 |
+
|
386 |
+
if total_users > self.num_users:
|
387 |
+
additional_user_feats = torch.zeros(total_users - self.num_users, 14, device=self.device)
|
388 |
+
graph['user'].x = torch.cat([graph['user'].x, additional_user_feats], dim=0)
|
389 |
+
if total_businesses > self.num_businesses:
|
390 |
+
additional_business_feats = torch.zeros(total_businesses - self.num_businesses, 8, device=self.device)
|
391 |
+
graph['business'].x = torch.cat([graph['business'].x, additional_business_feats], dim=0)
|
392 |
+
|
393 |
+
for i in range(num_test_rows):
|
394 |
+
user_idx = new_user_indices[i]
|
395 |
+
business_idx = new_business_indices[i]
|
396 |
+
if user_idx < graph['user'].x.size(0):
|
397 |
+
graph['user'].x[user_idx] += test_features[i, :14]
|
398 |
+
if business_idx < graph['business'].x.size(0):
|
399 |
+
graph['business'].x[business_idx] += test_features[i, 14:22]
|
400 |
+
graph['review'].x = torch.cat([graph['review'].x, test_features[:, 22:38]], dim=0)
|
401 |
+
graph['review'].y = torch.cat([graph['review'].y, test_y], dim=0)
|
402 |
+
|
403 |
+
graph['user', 'writes', 'review'].edge_index = torch.cat([
|
404 |
+
graph['user', 'writes', 'review'].edge_index,
|
405 |
+
torch.stack([new_user_indices, new_review_indices], dim=0)], dim=1)
|
406 |
+
graph['review', 'about', 'business'].edge_index = torch.cat([
|
407 |
+
graph['review', 'about', 'business'].edge_index,
|
408 |
+
torch.stack([new_review_indices, new_business_indices], dim=0)], dim=1)
|
409 |
+
|
410 |
+
all_time_since_user = torch.cat([time_since_user, test_time_since_user])
|
411 |
+
all_time_since_business = torch.cat([time_since_business, test_time_since_business])
|
412 |
+
all_user_indices = torch.cat([user_indices, new_user_indices])
|
413 |
+
all_business_indices = torch.cat([business_indices, new_business_indices])
|
414 |
+
all_review_indices = torch.cat([review_indices, new_review_indices])
|
415 |
+
|
416 |
+
user_writes_edge = graph['user', 'writes', 'review'].edge_index
|
417 |
+
review_about_edge = graph['review', 'about', 'business'].edge_index
|
418 |
+
|
419 |
+
edge_features_dict[('user', 'writes', 'review')] = create_temporal_edge_features(
|
420 |
+
all_time_since_user[user_writes_edge[0]], all_time_since_user[user_writes_edge[1]],
|
421 |
+
all_user_indices[user_writes_edge[0]], all_user_indices[user_writes_edge[0]]
|
422 |
+
)
|
423 |
+
edge_features_dict[('review', 'about', 'business')] = create_temporal_edge_features(
|
424 |
+
all_time_since_business[review_about_edge[0]], all_time_since_business[review_about_edge[1]],
|
425 |
+
torch.zeros_like(review_about_edge[0]), torch.zeros_like(review_about_edge[0])
|
426 |
+
)
|
427 |
+
|
428 |
+
self.num_users = total_users
|
429 |
+
self.num_businesses = total_businesses
|
430 |
+
|
431 |
+
test_user_time_since = self.test_df.groupby('user_id')['time_since_last_review_user'].min().reindex(
|
432 |
+
pd.Index(list(self.df['user_id'].unique()) + list(self.test_df['user_id'].unique())), fill_value=0).values
|
433 |
+
time_since_dict['user'] = torch.tensor(test_user_time_since[:total_users], dtype=torch.float, device=self.device)
|
434 |
+
|
435 |
+
spatial_encoding, centrality_encoding, node_type_map = self.compute_graph_encodings(graph)
|
436 |
+
|
437 |
+
self.model.eval()
|
438 |
+
with torch.no_grad():
|
439 |
+
out = self.model(graph, spatial_encoding, centrality_encoding, node_type_map, time_since_dict, edge_features_dict)
|
440 |
+
pred_labels = (out.squeeze() > 0.5).float()
|
441 |
+
probs = out.squeeze().detach().cpu().numpy()
|
442 |
+
test_metrics = self.compute_metrics(graph['review'].y[-num_test_rows:].cpu().numpy(), pred_labels[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:], prefix="test_")
|
443 |
+
train_metrics = self.compute_metrics(y.cpu().numpy(), pred_labels[:num_rows].cpu().numpy(), probs[:num_rows], prefix="train_")
|
444 |
+
logger.info(f"Test Accuracy: {test_metrics['test_accuracy']:.4f}, F1: {test_metrics['test_f1']:.4f}, AUC-ROC: {test_metrics['test_auc_roc']:.4f}")
|
445 |
+
|
446 |
+
# Save metrics to file
|
447 |
+
metrics_file = Path(self.output_path) / f"metrics_{timestamp}.txt"
|
448 |
+
with open(metrics_file, 'w') as f:
|
449 |
+
f.write("Training Metrics (Final Epoch):\n")
|
450 |
+
for k, v in train_metrics.items():
|
451 |
+
f.write(f"{k}: {v}\n")
|
452 |
+
f.write("\nTest Metrics:\n")
|
453 |
+
for k, v in test_metrics.items():
|
454 |
+
f.write(f"{k}: {v}\n")
|
455 |
+
|
456 |
+
# Plotting and saving to output_path
|
457 |
+
plt.figure(figsize=(12, 8))
|
458 |
+
plt.plot([m['loss'] for m in train_metrics_history], label='Training Loss')
|
459 |
+
plt.xlabel('Epoch')
|
460 |
+
plt.ylabel('Loss')
|
461 |
+
plt.title('Training Loss Curve')
|
462 |
+
plt.legend()
|
463 |
+
plt.grid(True)
|
464 |
+
plt.savefig(Path(self.output_path) / f"loss_curve_{timestamp}.png")
|
465 |
+
plt.close()
|
466 |
+
|
467 |
+
plt.figure(figsize=(12, 8))
|
468 |
+
plt.plot([m['train_accuracy'] for m in train_metrics_history], label='Training Accuracy')
|
469 |
+
plt.xlabel('Epoch')
|
470 |
+
plt.ylabel('Accuracy')
|
471 |
+
plt.title('Training Accuracy Curve')
|
472 |
+
plt.legend()
|
473 |
+
plt.grid(True)
|
474 |
+
plt.savefig(Path(self.output_path) / f"accuracy_curve_{timestamp}.png")
|
475 |
+
plt.close()
|
476 |
+
|
477 |
+
plt.figure(figsize=(12, 8))
|
478 |
+
plt.plot([m['train_precision'] for m in train_metrics_history], label='Training Precision')
|
479 |
+
plt.plot([m['train_recall'] for m in train_metrics_history], label='Training Recall')
|
480 |
+
plt.plot([m['train_f1'] for m in train_metrics_history], label='Training F1-Score')
|
481 |
+
plt.xlabel('Epoch')
|
482 |
+
plt.ylabel('Score')
|
483 |
+
plt.title('Training Precision, Recall, and F1-Score Curves')
|
484 |
+
plt.legend()
|
485 |
+
plt.grid(True)
|
486 |
+
plt.savefig(Path(self.output_path) / f"prf1_curves_{timestamp}.png")
|
487 |
+
plt.close()
|
488 |
+
|
489 |
+
plt.figure(figsize=(12, 8))
|
490 |
+
plt.plot([m['train_auc_roc'] for m in train_metrics_history], label='Training AUC-ROC')
|
491 |
+
plt.xlabel('Epoch')
|
492 |
+
plt.ylabel('AUC-ROC')
|
493 |
+
plt.title('Training AUC-ROC Curve')
|
494 |
+
plt.legend()
|
495 |
+
plt.grid(True)
|
496 |
+
plt.savefig(Path(self.output_path) / f"auc_roc_curve_train_{timestamp}.png")
|
497 |
+
plt.close()
|
498 |
+
|
499 |
+
plt.figure(figsize=(8, 6))
|
500 |
+
sns.heatmap(test_metrics['test_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
|
501 |
+
plt.xlabel('Predicted')
|
502 |
+
plt.ylabel('True')
|
503 |
+
plt.title('Test Confusion Matrix')
|
504 |
+
plt.savefig(Path(self.output_path) / f"confusion_matrix_test_{timestamp}.png")
|
505 |
+
plt.close()
|
506 |
+
|
507 |
+
fpr, tpr, _ = roc_curve(graph['review'].y[-num_test_rows:].cpu().numpy(), probs[-num_test_rows:])
|
508 |
+
plt.figure(figsize=(10, 6))
|
509 |
+
plt.plot(fpr, tpr, label=f'Test ROC Curve (AUC = {test_metrics["test_auc_roc"]:.4f})')
|
510 |
+
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
|
511 |
+
plt.xlabel('False Positive Rate')
|
512 |
+
plt.ylabel('True Positive Rate')
|
513 |
+
plt.title('Test ROC Curve')
|
514 |
+
plt.legend()
|
515 |
+
plt.grid(True)
|
516 |
+
plt.savefig(Path(self.output_path) / f"roc_curve_test_{timestamp}.png")
|
517 |
+
plt.close()
|
518 |
+
|
519 |
+
plt.figure(figsize=(8, 6))
|
520 |
+
sns.heatmap(train_metrics['train_conf_matrix'], annot=True, fmt='d', cmap='Blues', cbar=False)
|
521 |
+
plt.xlabel('Predicted')
|
522 |
+
plt.ylabel('True')
|
523 |
+
plt.title('Training Confusion Matrix (Final Epoch)')
|
524 |
+
plt.savefig(Path(self.output_path) / f"confusion_matrix_train_{timestamp}.png")
|
525 |
+
plt.close()
|
526 |
+
|
527 |
+
fpr_train, tpr_train, _ = roc_curve(graph['review'].y[:num_rows].cpu().numpy(), probs[:num_rows])
|
528 |
+
plt.figure(figsize=(10, 6))
|
529 |
+
plt.plot(fpr_train, tpr_train, label=f'Training ROC Curve (AUC = {train_metrics["train_auc_roc"]:.4f})')
|
530 |
+
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
|
531 |
+
plt.xlabel('False Positive Rate')
|
532 |
+
plt.ylabel('True Positive Rate')
|
533 |
+
plt.title('Training ROC Curve (Final Epoch)')
|
534 |
+
plt.legend()
|
535 |
+
plt.grid(True)
|
536 |
+
plt.savefig(Path(self.output_path) / f"roc_curve_train_{timestamp}.png")
|
537 |
+
plt.close()
|
538 |
+
|
539 |
+
logger.info(f"All metrics, plots, and model saved to {self.output_path}")
|
540 |
+
|
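A small, self-contained illustration of the temporal edge features used by the model above; the timestamp and user-id tensors below are made up and only demonstrate the expected [num_edges, 4] output shape:

import torch
from src.model import create_temporal_edge_features

# Hypothetical "time since last review" values for the source and target node of
# three edges, plus the user ids on each side of those edges.
time_src = torch.tensor([3600.0, 86400.0, 120.0])
time_tgt = torch.tensor([7200.0, 90000.0, 60.0])
user_i = torch.tensor([0, 1, 1])
user_j = torch.tensor([0, 2, 1])

edge_feats = create_temporal_edge_features(time_src, time_tgt, user_i, user_j)
print(edge_feats.shape)  # torch.Size([3, 4]): hour, day, week, and same-user burst terms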
src/model_trainer.py
ADDED
@@ -0,0 +1,35 @@
1 |
+
from src.model import GraphformerModel
|
2 |
+
from pathlib import Path
|
3 |
+
from loguru import logger
|
4 |
+
|
5 |
+
|
6 |
+
class ModelTrainer:
|
7 |
+
def __init__(self, df, output_path, epochs=100,test_size=0.3):
|
8 |
+
self.df = df
|
9 |
+
self.output_path = output_path
|
10 |
+
self.epochs = epochs
|
11 |
+
self.test_size=test_size
|
12 |
+
|
13 |
+
# Create output directory
|
14 |
+
Path(self.output_path).mkdir(parents=True, exist_ok=True)
|
15 |
+
|
16 |
+
# Initialize the HeteroGraphormerModel
|
17 |
+
|
18 |
+
self.model = GraphformerModel(df=self.df, output_path=self.output_path, epochs=self.epochs,test_size=self.test_size)
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
logger.info(f"Initialized ModelTrainer with output_path: {self.output_path} and epochs: {self.epochs}")
|
23 |
+
|
24 |
+
|
25 |
+
def train_and_evaluate(self):
|
26 |
+
|
27 |
+
try:
|
28 |
+
logger.info("Starting model training and evaluation")
|
29 |
+
self.model.run_model()
|
30 |
+
logger.info("GraphformerModel training and evaluation completed successfully")
|
31 |
+
except Exception as e:
|
32 |
+
logger.error(f"Error during GraphformerModel training and evaluation: {e}")
|
33 |
+
raise
|
34 |
+
|
35 |
+
|
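A minimal usage sketch for the trainer wrapper above; the CSV path, output directory, and epoch count are placeholders, and the DataFrame is expected to hold the preprocessed numeric features plus the user_id, business_id, review_id, and fake columns that GraphformerModel drops or uses as labels:

import pandas as pd
from src.model_trainer import ModelTrainer

df = pd.read_csv("./data/preprocessed_dataset.csv")
trainer = ModelTrainer(df, output_path="./outputs", epochs=100, test_size=0.3)
trainer.train_and_evaluate()  # trains, evaluates on the held-out split, saves plots and metrics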
src/preprocessing.py
ADDED
@@ -0,0 +1,832 @@
1 |
+
from loguru import logger
|
2 |
+
import pandas as pd
|
3 |
+
import json
|
4 |
+
from datetime import datetime
|
5 |
+
import ast
|
6 |
+
import numpy as np
|
7 |
+
from pymongo import MongoClient
|
8 |
+
from collections import defaultdict
|
9 |
+
|
10 |
+
from tqdm import tqdm
|
11 |
+
import time
|
12 |
+
|
13 |
+
import requests
|
14 |
+
import json
|
15 |
+
import os
|
16 |
+
import pandas as pd
|
17 |
+
import nltk
|
18 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
19 |
+
from nltk.corpus import stopwords
|
20 |
+
from textblob import TextBlob
|
21 |
+
import re
|
22 |
+
from transformers import BertTokenizer, BertModel
|
23 |
+
from transformers import RobertaTokenizer, RobertaModel
|
24 |
+
import torch
|
25 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
26 |
+
import numpy as np
|
27 |
+
|
28 |
+
# Download NLTK resources
|
29 |
+
nltk.download('punkt')
|
30 |
+
nltk.download('averaged_perceptron_tagger')
|
31 |
+
nltk.download('stopwords')
|
32 |
+
nltk.download('punkt_tab')
|
33 |
+
nltk.download('averaged_perceptron_tagger_eng')
|
34 |
+
|
35 |
+
class Preprocessor:
|
36 |
+
def __init__(self,df):
|
37 |
+
self.df=df
|
38 |
+
self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
|
39 |
+
self.model = RobertaModel.from_pretrained('roberta-base')
|
40 |
+
self.stop_words = set(stopwords.words('english'))
|
41 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Add this line
|
42 |
+
|
43 |
+
|
44 |
+
|
45 |
+
def get_bert_embedding(self, text):
|
46 |
+
inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
|
47 |
+
with torch.no_grad():
|
48 |
+
outputs = self.model(**inputs)
|
49 |
+
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
50 |
+
|
51 |
+
def preprocess_text(self,text):
|
52 |
+
return text if pd.notna(text) else ""
|
53 |
+
|
54 |
+
|
55 |
+
def calculate_duration(self, time_range):
|
56 |
+
if not isinstance(time_range, str) or "-" not in time_range:
|
57 |
+
return None
|
58 |
+
start_str, end_str = time_range.split('-')
|
59 |
+
start_str = start_str.strip() + ':00' if len(start_str.split(':')) == 1 else start_str.strip()
|
60 |
+
end_str = end_str.strip() + ':00' if len(end_str.split(':')) == 1 else end_str.strip()
|
61 |
+
try:
|
62 |
+
start = datetime.strptime(start_str, '%H:%M')
|
63 |
+
end = datetime.strptime(end_str, '%H:%M')
|
64 |
+
duration = (end - start).total_seconds() / 3600
|
65 |
+
return duration if duration >= 0 else duration + 24
|
66 |
+
except ValueError:
|
67 |
+
return None
|
68 |
+
def calculate_sentiment_severity(self, text):
|
69 |
+
if pd.isna(text) or not text.strip():
|
70 |
+
return pd.Series({"good_severity": 0.0, "bad_severity": 0.0})
|
71 |
+
|
72 |
+
# Get sentiment polarity (-1 to 1)
|
73 |
+
blob = TextBlob(text)
|
74 |
+
polarity = blob.sentiment.polarity
|
75 |
+
|
76 |
+
# Define severity weights
|
77 |
+
good_weight = 0.7
|
78 |
+
bad_weight = 0.3
|
79 |
+
|
80 |
+
if polarity > 0:
|
81 |
+
good_severity = good_weight * polarity
|
82 |
+
bad_severity = 0.0
|
83 |
+
elif polarity < 0:
|
84 |
+
good_severity = 0.0
|
85 |
+
bad_severity = bad_weight * abs(polarity)
|
86 |
+
else: # Neutral (polarity = 0)
|
87 |
+
good_severity = 0.0
|
88 |
+
bad_severity = 0.0
|
89 |
+
|
90 |
+
return pd.Series({"good_severity": good_severity, "bad_severity": bad_severity})
|
91 |
+
|
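# Worked example of the severity weighting above, using hypothetical polarity values:
# a review with TextBlob polarity +0.8 gets good_severity = 0.7 * 0.8 = 0.56 and
# bad_severity = 0.0; a polarity of -0.5 gives good_severity = 0.0 and
# bad_severity = 0.3 * 0.5 = 0.15; neutral text yields (0.0, 0.0).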
92 |
+
|
93 |
+
def get_avg_duration(self, hours_str):
|
94 |
+
if pd.isna(hours_str) or not isinstance(hours_str, str):
|
95 |
+
return pd.NA
|
96 |
+
try:
|
97 |
+
hours_dict = ast.literal_eval(hours_str)
|
98 |
+
if not hours_dict:
|
99 |
+
return pd.NA
|
100 |
+
durations = [self.calculate_duration(time_range) for time_range in hours_dict.values()]
|
101 |
+
valid_durations = [d for d in durations if d is not None]
|
102 |
+
return sum(valid_durations) / len(valid_durations) if valid_durations else pd.NA
|
103 |
+
except (ValueError, SyntaxError, ZeroDivisionError):
|
104 |
+
return pd.NA
|
105 |
+
|
106 |
+
|
107 |
+
def calculate_time_since_last_review(self):
|
108 |
+
present_date = datetime.now()
|
109 |
+
user_latest_timestamp = {}
|
110 |
+
|
111 |
+
# Convert review_date to datetime
|
112 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
113 |
+
|
114 |
+
# Calculate hours difference for each user's latest review
|
115 |
+
for user_id in self.df["user_id"].unique():
|
116 |
+
latest_date = self.df[self.df["user_id"] == user_id]["review_date"].max()
|
117 |
+
|
118 |
+
if not isinstance(latest_date, datetime):
|
119 |
+
latest_date = latest_date.to_pydatetime()
|
120 |
+
|
121 |
+
hours_difference = (present_date - latest_date).total_seconds() / 3600
|
122 |
+
user_latest_timestamp[user_id] = hours_difference
|
123 |
+
|
124 |
+
# Map the hours difference to a new column
|
125 |
+
self.df["time_since_last_review_user"] = self.df["user_id"].map(user_latest_timestamp)
|
126 |
+
|
127 |
+
def calculate_time_since_last_review_business(self):
|
128 |
+
present_date = datetime.now()
|
129 |
+
|
130 |
+
# Ensure review_date is in datetime format
|
131 |
+
self.df["review_date"] = pd.to_datetime(self.df["review_date"])
|
132 |
+
|
133 |
+
# Initialize dictionary to store hours since last review for each business
|
134 |
+
business_latest_timestamp = {}
|
135 |
+
|
136 |
+
# Iterate over unique business_ids
|
137 |
+
for business_id in self.df["business_id"].unique():
|
138 |
+
# Get the latest review date for this business
|
139 |
+
latest_date = self.df[self.df["business_id"] == business_id]["review_date"].max()
|
140 |
+
|
141 |
+
# Convert to datetime object if needed
|
142 |
+
if not isinstance(latest_date, datetime):
|
143 |
+
latest_date = latest_date.to_pydatetime()
|
144 |
+
|
145 |
+
# Calculate hours difference (already in hours)
|
146 |
+
hours_difference = (present_date - latest_date).total_seconds() / 3600
|
147 |
+
business_latest_timestamp[business_id] = hours_difference
|
148 |
+
|
149 |
+
# Map the hours difference to the new column
|
150 |
+
self.df["time_since_last_review_business"] = self.df["business_id"].map(business_latest_timestamp)
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
def calculate_user_account_age(self):
|
155 |
+
present_date = datetime.now()
|
156 |
+
|
157 |
+
# Convert yelping_since to datetime
|
158 |
+
self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])
|
159 |
+
|
160 |
+
# Calculate user account age in days
|
161 |
+
self.df["user_account_age"] = (present_date - self.df["yelping_since"]).dt.days
|
162 |
+
|
163 |
+
|
164 |
+
    def calculate_avg_time_between_reviews(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by user_id and review_date to ensure chronological order
        self.df = self.df.sort_values(["user_id", "review_date"])

        # Helper to calculate the average time (in hours) between consecutive reviews
        def calculate_avg_time(group):
            if len(group) == 1:
                return 0  # If there is only one review, assign 0
            # Differences in hours between consecutive reviews
            diffs = group["review_date"].diff().dt.total_seconds() / 3600
            # Drop the first NaN (from diff) and compute the mean
            return diffs.dropna().mean()

        # Apply the helper to each user_id group and create a mapping
        avg_time_per_user = self.df.groupby("user_id").apply(calculate_avg_time)

        # Map the average time back to the original DataFrame
        self.df["average_time_between_reviews"] = self.df["user_id"].map(avg_time_per_user)

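    # Worked example for the helper above (hypothetical dates, not part of the original file):
    # reviews at 2021-01-01 00:00, 2021-01-02 00:00 and 2021-01-04 00:00 give gaps of 24 h and
    # 48 h, so average_time_between_reviews = 36.0 for that user.
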
    def calculate_user_degree(self):
        # Number of unique businesses reviewed per user
        user_business_counts = self.df.groupby("user_id")["business_id"].nunique()

        # Map the counts back to the original DataFrame
        self.df["user_degree"] = self.df["user_id"].map(user_business_counts)

    def calculate_business_degree(self):
        # Number of unique users per business
        business_user_counts = self.df.groupby("business_id")["user_id"].nunique()

        # Map the counts back to the original DataFrame
        self.df["business_degree"] = self.df["business_id"].map(business_user_counts)

    def calculate_rating_variance_user(self):
        # Calculate the mode (most frequent rating) per user; note that despite the column
        # name, the value stored is the mode, not a variance
        user_rating_mode = self.df.groupby("user_id")["review_stars"].agg(lambda x: x.mode()[0])

        # Map the most frequent rating back to the original DataFrame
        self.df["rating_variance_user"] = self.df["user_id"].map(user_rating_mode)

    def calculate_user_review_burst_count(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by user_id and review_date for chronological order
        self.df = self.df.sort_values(["user_id", "review_date"])

        # Max number of reviews by one user inside any 20-day window
        def calculate_burst_count(group):
            if len(group) <= 1:
                return 0  # No burst if 1 or fewer reviews

            dates = group["review_date"]

            # Count the reviews that fall within 20 days after each review
            burst_counts = []
            for i, date in enumerate(dates):
                window_end = date + pd.Timedelta(days=20)
                count = ((dates >= date) & (dates <= window_end)).sum()
                burst_counts.append(count)

            # Return the maximum burst count for this user
            return max(burst_counts)

        # Calculate the burst count per user
        user_burst_counts = self.df.groupby("user_id").apply(calculate_burst_count)

        # Map the burst count back to the original DataFrame
        self.df["user_review_burst_count"] = self.df["user_id"].map(user_burst_counts)

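    # Illustrative alternative (a sketch, not part of the original file): the same forward-looking
    # 20-day window count can be computed without the inner Python loop by using numpy.searchsorted
    # on the sorted timestamps, e.g.
    #
    #     ts = np.sort(group["review_date"].values)
    #     counts = np.searchsorted(ts, ts + np.timedelta64(20, "D"), side="right") - np.arange(len(ts))
    #     return int(counts.max()) if len(ts) > 1 else 0
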
    def calculate_business_review_burst_count(self):
        # Ensure review_date is in datetime format
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Sort by business_id and review_date for chronological order
        self.df = self.df.sort_values(["business_id", "review_date"])

        # Max number of reviews for one business inside any 10-day window
        def calculate_burst_count(group):
            if len(group) <= 1:
                return 0  # No burst if 1 or fewer reviews

            dates = group["review_date"]

            # Count the reviews that fall within 10 days after each review
            burst_counts = []
            for i, date in enumerate(dates):
                window_end = date + pd.Timedelta(days=10)
                count = ((dates >= date) & (dates <= window_end)).sum()
                burst_counts.append(count)

            # Return the maximum burst count for this business
            return max(burst_counts)

        # Calculate the burst count per business
        business_burst_counts = self.df.groupby("business_id").apply(calculate_burst_count)

        # Map the burst count back to the original DataFrame
        self.df["business_review_burst_count"] = self.df["business_id"].map(business_burst_counts)

    def calculate_temporal_similarity(self):
        self.df["review_date"] = pd.to_datetime(self.df["review_date"])

        # Extract the day of the week (0 = Monday, 6 = Sunday)
        self.df["day_of_week"] = self.df["review_date"].dt.dayofweek

        # Average hours between reviews posted on the user's most frequent day(s) of the week
        def calculate_avg_hours_on_frequent_days(group):
            frequent_days = group["day_of_week"].mode().tolist()

            if len(group) <= 1:
                return 0

            frequent_reviews = group[group["day_of_week"].isin(frequent_days)]

            if len(frequent_reviews) <= 1:
                return 0

            frequent_reviews = frequent_reviews.sort_values("review_date")
            diffs = frequent_reviews["review_date"].diff().dt.total_seconds() / 3600

            return diffs.dropna().mean()

        # Calculate the average hours for each user
        avg_hours_per_user = self.df.groupby("user_id").apply(calculate_avg_hours_on_frequent_days)

        # Map the average hours to the new column
        self.df["temporal_similarity"] = self.df["user_id"].map(avg_hours_per_user)

        # Drop the temporary column
        self.df = self.df.drop(columns=["day_of_week"])

    def calculate_rating_deviation_from_business_average(self):
        # Average rating per business
        business_avg_rating = self.df.groupby("business_id")["review_stars"].mean()

        # Map the average rating to each row
        self.df["business_avg_rating"] = self.df["business_id"].map(business_avg_rating)

        # Deviation of each review from the business average
        self.df["rating_deviation_from_business_average"] = (
            self.df["review_stars"] - self.df["business_avg_rating"]
        )

        # Drop the temporary column
        self.df = self.df.drop(columns=["business_avg_rating"])

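    # Worked example (hypothetical values, not part of the original file): if a business
    # averages 4.2 stars and a review gives 1 star, the deviation is 1 - 4.2 = -3.2, which
    # later trips the |deviation| > 2 check in compute_fake_score.
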
    def calculate_review_like_ratio(self):
        # Binary column for liked reviews (stars >= 4)
        self.df["is_liked"] = (self.df["review_stars"] >= 4).astype(int)

        # Like ratio per user
        user_like_ratio = self.df.groupby("user_id")["is_liked"].mean()

        # Map the like ratio back to the DataFrame
        self.df["review_like_ratio"] = self.df["user_id"].map(user_like_ratio)

        # Drop the temporary column
        self.df = self.df.drop(columns=["is_liked"])

    def calculate_latest_checkin_hours(self):
        self.df["yelping_since"] = pd.to_datetime(self.df["yelping_since"])

        # Get the latest check-in date from a comma-separated list of date strings
        def get_latest_checkin(checkin_list):
            if not checkin_list or pd.isna(checkin_list):  # Handle empty or NaN
                return None
            if isinstance(checkin_list, str):
                checkin_dates = checkin_list.split(", ")
            else:
                checkin_dates = checkin_list
            return pd.to_datetime(checkin_dates).max()

        # Latest check-in date per row
        self.df["latest_checkin_date"] = self.df["checkin_date"].apply(get_latest_checkin)

        # Hours between the latest check-in and yelping_since
        self.df["latest_checkin_hours"] = (
            (self.df["latest_checkin_date"] - self.df["yelping_since"])
            .dt.total_seconds() / 3600
        )

        # Drop the temporary column and fill rows without check-ins
        self.df = self.df.drop(columns=["latest_checkin_date"])
        self.df["latest_checkin_hours"] = self.df["latest_checkin_hours"].fillna(0)

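    # Example of the parsing above (hypothetical value, not part of the original file):
    #     get_latest_checkin("2018-01-02 19:04:00, 2019-03-11 21:00:00")
    #     # -> Timestamp("2019-03-11 21:00:00"), i.e. the most recent check-in wins.
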
    def compute_pronoun_density(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0
        words = word_tokenize(text.lower())
        pos_tags = nltk.pos_tag(words)
        pronouns = sum(1 for word, pos in pos_tags if pos in ['PRP', 'PRP$'] and word in ['i', 'we'])
        return pronouns / len(words) if words else 0

    def compute_avg_sentence_length(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0
        sentences = sent_tokenize(text)
        return sum(len(word_tokenize(sent)) for sent in sentences) / len(sentences) if sentences else 0

    def compute_excessive_punctuation(self, text):
        text = self.preprocess_text(text)
        return len(re.findall(r'[!?.]{2,}', text))

    def compute_sentiment_polarity(self, text):
        text = self.preprocess_text(text)
        return TextBlob(text).sentiment.polarity if text else 0

    def compute_code_switching_flag(self, text):
        text = self.preprocess_text(text)
        if not text:
            return 0

        tokens = self.tokenizer.tokenize(text.lower())
        if not tokens:
            return 0

        english_words = self.stop_words  # Use self.stop_words from __init__
        token_set = set(tokens)
        english_count = sum(1 for token in tokens if token in english_words)

        non_english_pattern = re.compile(r'[^\x00-\x7F]')
        has_non_ascii = 1 if non_english_pattern.search(text) else 0

        english_ratio = english_count / len(tokens) if tokens else 0

        non_english_tokens = sum(1 for token in token_set if token not in english_words and "##" in token and has_non_ascii)

        # Flag as code-switching if:
        # 1. Mixed English presence (ratio between 0.1 and 0.9)
        # 2. Non-ASCII characters present OR some non-English subword tokens
        if 0.1 < english_ratio < 0.9 and (has_non_ascii or non_english_tokens > 0):
            return 1
        return 0

    def batch_tokenize(self, texts, batch_size=32, max_length=512):
        tokenized_outputs = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Tokenizing with RoBERTa on GPU"):
            batch_texts = texts[i:i + batch_size]
            valid_texts = [self.preprocess_text(t) for t in batch_texts]
            # Tokenize with a fixed max_length to ensure consistent tensor sizes
            inputs = self.tokenizer(valid_texts, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)
            tokenized_outputs.append(inputs['input_ids'].to(self.device))  # Move to GPU
        # Concatenate on GPU with consistent sizes
        return torch.cat(tokenized_outputs, dim=0)

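    # Illustrative usage (assumed calling pattern, not part of the original file):
    #     ids = self.batch_tokenize(self.df["review_text"].tolist())
    #     ids.shape  # -> (number of reviews, 512): one padded row of token ids per review
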
    def compute_grammar_error_score(self, texts, tokenized_ids):
        print("Computing grammar error scores...")
        error_scores = np.zeros(len(texts), dtype=float)

        vocab_set = set(self.tokenizer.get_vocab().keys())
        for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Grammar Errors")):
            if input_ids.sum() == 0:  # Empty input
                continue
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
            unknown_count = sum(1 for token in tokens if token not in vocab_set and token not in self.stop_words)
            total_count = len([t for t in tokens if t not in self.stop_words])
            error_scores[i] = unknown_count / total_count if total_count > 0 else 0

        return error_scores

    def compute_repetitive_words_count(self, texts, tokenized_ids):
        print("Computing repetitive words counts...")
        rep_counts = np.zeros(len(texts), dtype=int)

        for i, input_ids in enumerate(tqdm(tokenized_ids, desc="Processing Repetition")):
            if input_ids.sum() == 0:  # Empty input
                continue
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids.cpu().tolist(), skip_special_tokens=True)
            valid_tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
            if valid_tokens:
                token_counts = {}
                for token in valid_tokens:
                    token_counts[token] = token_counts.get(token, 0) + 1
                rep_counts[i] = sum(1 for count in token_counts.values() if count > 1)

        return rep_counts

    def preprocess_text_for_similarity(self, text):
        if pd.isna(text) or not text.strip():
            return []
        return [w for w in word_tokenize(str(text).lower()) if w not in self.stop_words]

    def batch_encode_words(self, texts, batch_size=32, max_length=512):
        word_lists = [self.preprocess_text_for_similarity(t) for t in tqdm(texts, desc="Tokenizing Texts")]
        vocab = {word: idx + 1 for idx, word in enumerate(set.union(*[set(w) for w in word_lists if w]))}

        encoded_batches = []
        for i in tqdm(range(0, len(word_lists), batch_size), desc="Encoding Words on GPU"):
            batch_words = word_lists[i:i + batch_size]
            encoded = np.zeros((len(batch_words), max_length), dtype=np.int64)
            for j, words in enumerate(batch_words):
                if words:
                    word_ids = [vocab.get(w, 0) for w in words][:max_length]
                    encoded[j, :len(word_ids)] = word_ids
            encoded_tensor = torch.tensor(encoded, dtype=torch.int64).to(self.device)
            encoded_batches.append(encoded_tensor)

        return torch.cat(encoded_batches, dim=0), vocab

    def compute_similarity_to_other_reviews(self, batch_size=32, max_length=512):
        all_texts = self.df["review_text"].tolist()
        all_users = self.df["user_id"].tolist()
        all_review_ids = self.df["review_id"].tolist()

        encoded_words, vocab = self.batch_encode_words(all_texts, batch_size, max_length)

        similarity_scores = {rid: 0.0 for rid in all_review_ids}  # Default scores
        for i, (review_id, user_id) in enumerate(tqdm(zip(all_review_ids, all_users), desc="Computing Similarities on GPU")):
            if pd.isna(review_id) or pd.isna(user_id):
                continue

            current_words = encoded_words[i]
            if current_words.sum() == 0:
                continue

            other_indices = torch.tensor([j for j, u in enumerate(all_users) if u != user_id and pd.notna(u)],
                                         dtype=torch.long).to(self.device)
            if not other_indices.numel():
                continue

            other_words = encoded_words[other_indices]
            current_set = torch.unique(current_words[current_words > 0])
            other_flat = other_words[other_words > 0]

            if other_flat.numel() == 0:
                continue

            other_set = torch.unique(other_flat)
            intersection = torch.sum(torch.isin(current_set, other_set)).float()
            union = torch.unique(torch.cat([current_set, other_set])).numel()
            # Jaccard similarity over unique word ids; guard against an empty union
            similarity = (intersection / union).item() if union > 0 else 0.0

            similarity_scores[review_id] = similarity

        return pd.Series(similarity_scores, index=all_review_ids)

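    # The score above is a Jaccard similarity over unique word ids: |A ∩ B| / |A ∪ B|.
    # A minimal illustration (hypothetical token sets, not part of the original file):
    #     a = {"great", "food", "service"}
    #     b = {"great", "service", "price"}
    #     len(a & b) / len(a | b)  # -> 2 / 4 = 0.5
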
    def calculate_friend_count(self):
        friends = []
        for v in self.df["friends"]:
            if isinstance(v, str):
                friends.append(len(v.split(",")))
            else:
                # Missing friend lists (NaN or numeric placeholders) count as zero friends
                friends.append(0)
        self.df["friends"] = friends

    def count_elite_years(self, elite):
        if pd.isna(elite):
            return 0
        return len(str(elite).split(","))

    def transform_elite_status(self):
        # A user is treated as "elite" only if they have held elite status for more than one year
        self.df["elite"] = self.df["elite"].apply(lambda x: self.count_elite_years(x) > 1)
        self.df["elite"] = self.df["elite"].astype(int)

    def calculate_review_useful_funny_cool(self):
        self.df["review_useful"] = pd.to_numeric(self.df["review_useful"], errors='coerce').fillna(0)
        self.df["review_funny"] = pd.to_numeric(self.df["review_funny"], errors='coerce').fillna(0)
        self.df["review_cool"] = pd.to_numeric(self.df["review_cool"], errors='coerce').fillna(0)
        self.df["review_useful_funny_cool"] = (
            self.df["review_useful"] +
            self.df["review_funny"] +
            self.df["review_cool"]
        )
        self.df["review_useful_funny_cool"] = self.df["review_useful_funny_cool"].fillna(0).astype(int)

    def calculate_user_useful_funny_cool(self):
        self.df["user_useful_funny_cool"] = (
            self.df["user_useful"] +
            self.df["user_funny"] +
            self.df["user_cool"]
        )
        self.df["user_useful_funny_cool"] = self.df["user_useful_funny_cool"].fillna(0).astype(int)

    def compute_fake_score(self, row):
        suspicion_points = 0

        # Linguistic features
        if row["pronoun_density"] < 0.01:  # Low personal engagement
            suspicion_points += 1
        if row["avg_sentence_length"] < 5 or row["avg_sentence_length"] > 30:  # Extreme lengths
            suspicion_points += 1
        if row["grammar_error_score"] > 5:  # Many errors
            suspicion_points += 1
        if row["repetitive_words_count"] > 5:  # High repetition
            suspicion_points += 1
        if row["code_switching_flag"] == 1:  # Language mixing
            suspicion_points += 1
        if row["excessive_punctuation_count"] > 3:  # Overuse of punctuation
            suspicion_points += 1
        if abs(row["sentiment_polarity"]) > 0.8:  # Extreme sentiment
            suspicion_points += 1

        # Review patterns
        if row["similarity_to_other_reviews"] > 0.8:  # High duplication
            suspicion_points += 1
        if row["user_review_burst_count"] > 5:  # Spammy bursts
            suspicion_points += 1
        if row["business_review_burst_count"] > 5:  # Targeted bursts
            suspicion_points += 1
        if abs(row["rating_deviation_from_business_average"]) > 2:  # Large rating deviation
            suspicion_points += 1
        if row["review_like_ratio"] > 0.9 or row["review_like_ratio"] < 0.1:  # Extreme like ratio
            suspicion_points += 1

        # User behavior
        if row["user_account_age"] < 30:  # Very new account (days)
            suspicion_points += 1
        if row["average_time_between_reviews"] < 24:  # Rapid reviews (hours)
            suspicion_points += 1
        if row["user_degree"] < 2:  # Low business interaction
            suspicion_points += 1
        if row["time_since_last_review_user"] < 24:  # Recent burst (hours)
            suspicion_points += 1

        # Threshold: 3 or more points = fake
        return 1 if suspicion_points >= 3 else 0

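    # Worked example of the heuristic above (hypothetical row, not part of the original file):
    # a 10-day-old account (user_account_age < 30, +1), reviews posted roughly every 2 hours
    # (average_time_between_reviews < 24, +1) and a review text with similarity 0.9 to other
    # reviews (> 0.8, +1) already reach the 3-point threshold, so the row is labelled fake (1).
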
    def run_pipeline(self):
        logger.info("FINALIZING HOURS COLUMN ...")
        self.df["hours"] = self.df["hours"].apply(self.get_avg_duration)
        self.df["hours"] = self.df["hours"].fillna(0)
        print(self.df["hours"][:10])
        print(self.df["hours"].isnull().sum())

        logger.info("FINALIZING ATTRIBUTES COLUMN ...")
        self.df.drop("attributes", axis=1, inplace=True)

        logger.info("CREATING time_since_last_review_user COLUMN ...")
        self.calculate_time_since_last_review()
        print(np.unique(self.df["time_since_last_review_user"]))

        logger.info("CREATING time_since_last_review_business COLUMN ...")
        self.calculate_time_since_last_review_business()
        print(np.unique(self.df["time_since_last_review_business"]))

        logger.info("CREATING user_account_age COLUMN ...")
        self.calculate_user_account_age()
        print(np.unique(self.df["user_account_age"]))

        logger.info("CREATING average_time_between_reviews COLUMN ...")
        self.calculate_avg_time_between_reviews()
        print(np.unique(self.df["average_time_between_reviews"]))

        logger.info("CREATING user_degree COLUMN ...")
        self.calculate_user_degree()
        print(np.unique(self.df["user_degree"]))

        logger.info("CREATING business_degree COLUMN ...")
        self.calculate_business_degree()
        print(np.unique(self.df["business_degree"]))

        logger.info("CREATING rating_variance_user COLUMN ...")
        self.calculate_rating_variance_user()
        print(np.unique(self.df["rating_variance_user"]))

        logger.info("CREATING user_review_burst_count COLUMN ...")
        self.calculate_user_review_burst_count()
        print(np.unique(self.df["user_review_burst_count"]))

        logger.info("CREATING business_review_burst_count COLUMN ...")
        self.calculate_business_review_burst_count()
        print(np.unique(self.df["business_review_burst_count"]))

        logger.info("CREATING temporal_similarity COLUMN ...")
        self.calculate_temporal_similarity()
        print(np.unique(self.df["temporal_similarity"]))

        logger.info("CREATING rating_deviation_from_business_average COLUMN ...")
        self.calculate_rating_deviation_from_business_average()
        print(np.unique(self.df["rating_deviation_from_business_average"]))

        logger.info("CREATING review_like_ratio COLUMN ...")
        self.calculate_review_like_ratio()
        print(np.unique(self.df["review_like_ratio"]))

        logger.info("CREATING latest_checkin_hours COLUMN ...")
        self.calculate_latest_checkin_hours()
        print(np.unique(self.df["latest_checkin_hours"]))

logger.info("CREATING pronoun_density COLUMN ...")
|
696 |
+
self.df["pronoun_density"] = self.df["review_text"].apply(self.compute_pronoun_density)
|
697 |
+
print(np.unique(self.df["pronoun_density"] ))
|
698 |
+
|
699 |
+
logger.info("CREATING avg_sentence_length COLUMN ...")
|
700 |
+
self.df["avg_sentence_length"] = self.df["review_text"].apply(self.compute_avg_sentence_length)
|
701 |
+
print(np.unique(self.df["avg_sentence_length"] ))
|
702 |
+
|
703 |
+
logger.info("CREATING excessive_punctuation_count COLUMN ...")
|
704 |
+
self.df["excessive_punctuation_count"] = self.df["review_text"].apply(self.compute_excessive_punctuation)
|
705 |
+
print(np.unique(self.df["excessive_punctuation_count"] ))
|
706 |
+
|
707 |
+
logger.info("CREATING sentiment_polarity COLUMN ...")
|
708 |
+
self.df["sentiment_polarity"] = self.df["review_text"].apply(self.compute_sentiment_polarity)
|
709 |
+
print(np.unique(self.df["sentiment_polarity"] ))
|
710 |
+
|
711 |
+
logger.info("CREATING good_severity and bad_severity COLUMNS ...")
|
712 |
+
severity_scores = self.df["review_text"].apply(self.calculate_sentiment_severity)
|
713 |
+
self.df[["good_severity", "bad_severity"]] = severity_scores
|
714 |
+
print(np.unique(self.df["good_severity"] ))
|
715 |
+
print(np.unique(self.df["bad_severity"] ))
|
716 |
+
|
717 |
+
|
718 |
+
logger.info("CREATING code_switching_flag COLUMN ...")
|
719 |
+
self.df["code_switching_flag"] = self.df["review_text"].apply(self.compute_code_switching_flag)
|
720 |
+
print(np.unique(self.df["code_switching_flag"] ))
|
721 |
+
|
722 |
+
|
723 |
+
all_texts = self.df["review_text"].tolist()
|
724 |
+
tokenized_ids = self.batch_tokenize(all_texts, batch_size=32, max_length=512)
|
725 |
+
|
726 |
+
logger.info("CREATING grammar_error_score COLUMN ...")
|
727 |
+
self.df["grammar_error_score"] = self.compute_grammar_error_score(all_texts, tokenized_ids)
|
728 |
+
print(np.unique(self.df["grammar_error_score"] ))
|
729 |
+
|
730 |
+
|
731 |
+
logger.info("CREATING repetitive_words_count COLUMN ...")
|
732 |
+
self.df["repetitive_words_count"] = self.compute_repetitive_words_count(all_texts, tokenized_ids)
|
733 |
+
print(np.unique(self.df["repetitive_words_count"] ))
|
734 |
+
|
735 |
+
|
736 |
+
|
737 |
+
logger.info("CREATING similarity_to_other_reviews COLUMN ...")
|
738 |
+
similarity_scores = self.compute_similarity_to_other_reviews(batch_size=32, max_length=512)
|
739 |
+
self.df["similarity_to_other_reviews"] = self.df["review_id"].map(similarity_scores)
|
740 |
+
|
741 |
+
print(np.unique(self.df["similarity_to_other_reviews"] ))
|
742 |
+
|
743 |
+
|
744 |
+
|
745 |
+
logger.info("CREATING friends COLUMN ...")
|
746 |
+
self.calculate_friend_count()
|
747 |
+
print(self.df["friends"].value_counts())
|
748 |
+
|
749 |
+
logger.info("CREATING elite COLUMN ...")
|
750 |
+
self.transform_elite_status()
|
751 |
+
print(self.df["elite"].value_counts())
|
752 |
+
|
753 |
+
|
754 |
+
logger.info("CREATING review_useful_funny_cool COLUMN ...")
|
755 |
+
self.calculate_review_useful_funny_cool()
|
756 |
+
print(self.df["review_useful_funny_cool"].value_counts())
|
757 |
+
|
758 |
+
|
759 |
+
logger.info("CREATING user_useful_funny_cool COLUMN ...")
|
760 |
+
self.calculate_user_useful_funny_cool()
|
761 |
+
print(self.df["user_useful_funny_cool"].value_counts())
|
762 |
+
|
763 |
+
|
764 |
+
logger.info("CREATING LABEL COLUMN ...")
|
765 |
+
self.df["fake"] = self.df.apply(self.compute_fake_score, axis=1)
|
766 |
+
print(self.df["fake"].value_counts())
|
767 |
+
|
768 |
+
|
769 |
+
logger.info("SEEING NULL VALUES IN FINAL COLUMNS.....")
|
770 |
+
print(set(self.df.isnull().sum().values))
|
771 |
+
|
772 |
+
|
773 |
+
|
774 |
+
|
775 |
+
return self.df
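
# Hypothetical driver code (illustrative only; the class name and constructor signature are
# assumptions, since the class definition appears earlier in this file, outside this excerpt):
#     pre = Preprocessor(df)            # df: merged review/user/business DataFrame
#     features_df = pre.run_pipeline()  # adds the engineered columns and the "fake" label
#     features_df.to_csv("preprocessed_reviews.csv", index=False)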