|
|
|
from loguru import logger |
|
import pandas as pd |
|
import numpy as np |
|
from dataclasses import dataclass |
|
from typing import Dict, List, Optional, Tuple |
|
import json |
|
from pathlib import Path |
|
import logging |
|
from scipy.stats import entropy |
|
import warnings |
|
from datetime import datetime |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
import re |
|
from textblob import TextBlob |
|
import os |
|
from pathlib import Path |
|
|
|
class DataCleaner:
    """Drop a fixed set of columns from a DataFrame and save the rest as CSV.

    The frame handed to the constructor is modified in place; no copy is
    taken, so the caller's reference sees the dropped columns disappear.

    Attributes:
        df: The DataFrame being cleaned.
        output_path: Directory the CSV is written to (created on demand).
        filename: Name of the CSV file inside ``output_path``.
    """

    # Columns removed by ``dropping_unncessary_columns``. Kept as one
    # tuple so the list can be reviewed and edited in a single place.
    _COLUMNS_TO_DROP = (
        "review_text",
        "review_date",
        "business_name",
        "address",
        "city",
        "state",
        "postal_code",
        "categories",
        "user_name",
        "yelping_since",
        "checkin_date",
        "review_useful",
        "review_funny",
        "review_cool",
        "user_useful",
        "user_funny",
        "user_cool",
        "is_open",
        "compliment_hot",
        "compliment_more",
        "compliment_profile",
        "compliment_cute",
        "compliment_list",
        "compliment_note",
        "compliment_plain",
        "compliment_cool",
        "compliment_funny",
        "compliment_writer",
        "compliment_photos",
    )

    def __init__(
        self,
        df: pd.DataFrame,
        output_path,
        filename: str = "preprocessed_cleaned.csv",
    ) -> None:
        """Store the frame and the output location.

        Args:
            df: DataFrame to clean; it is mutated in place by the pipeline.
            output_path: Directory (str or path-like) for the output CSV.
            filename: File name of the CSV written into ``output_path``.
        """
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self) -> None:
        """Write ``self.df`` to ``output_path / filename`` without the index.

        The output directory, including missing parents, is created first.
        """
        target_dir = Path(self.output_path)
        target_dir.mkdir(parents=True, exist_ok=True)

        output_file = target_dir / self.filename
        self.df.to_csv(output_file, index=False)

        # Log only after the write returned, so the message is never
        # emitted for a file that failed to save.
        logger.info(f"File saved in directory {target_dir} as : {self.filename}")

    def dropping_unncessary_columns(self) -> None:
        """Remove every column listed in ``_COLUMNS_TO_DROP`` from ``self.df``.

        All columns are dropped in a single in-place call, so the frame is
        not left partially stripped when something goes wrong.

        Raises:
            KeyError: If any of the listed columns is absent from the frame.
        """
        self.df.drop(columns=list(self._COLUMNS_TO_DROP), inplace=True)

    def run_pipeline(self) -> None:
        """Run the full cleaning flow: drop columns, report nulls, save CSV."""
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unncessary_columns()

        logger.info("Checking Again for NULL values in Columns")
        # One vectorised pass over the frame instead of two scans per column.
        null_counts = self.df.isnull().sum()
        for col, n_null in null_counts[null_counts > 0].items():
            # Routed through the logger like every other pipeline message.
            logger.warning(f" {col} has {n_null} null values")

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|