flaskapp / src /clean_data.py
OVH
Added all the files
67b1c6c
# clean_yelp_data.py
from loguru import logger
import pandas as pd
import numpy as np
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import json
from pathlib import Path
import logging
from scipy.stats import entropy
import warnings
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
import os
from pathlib import Path
class DataCleaner:
    """Drop unused columns from a DataFrame and write the result to CSV.

    The instance stores the DataFrame it is given without copying it, and the
    column drop is performed in place, so the change is also visible through
    the caller's own reference to that DataFrame.
    """

    # Columns removed by ``dropping_unncessary_columns``.
    COLUMNS_TO_DROP = (
        "review_text",
        "review_date",
        "business_name",
        "address",
        "city",
        "state",
        "postal_code",
        "categories",
        "user_name",
        "yelping_since",
        "checkin_date",
        "review_useful",
        "review_funny",
        "review_cool",
        "user_useful",
        "user_funny",
        "user_cool",
        "is_open",
        "compliment_hot",
        "compliment_more",
        "compliment_profile",
        "compliment_cute",
        "compliment_list",
        "compliment_note",
        "compliment_plain",
        "compliment_cool",
        "compliment_funny",
        "compliment_writer",
        "compliment_photos",
    )

    def __init__(
        self,
        df: "pd.DataFrame",
        output_path: "str | Path",
        filename: str = "preprocessed_cleaned.csv",
    ):
        """Store the DataFrame to clean and the destination of the CSV.

        Args:
            df: DataFrame to clean; kept by reference and modified in place.
            output_path: Directory the cleaned CSV is written to.
            filename: Name of the CSV file created inside ``output_path``.
        """
        self.df = df
        self.output_path = output_path
        self.filename = filename

    def saving_cleaned_preprocess(self) -> None:
        """Write ``self.df`` to ``output_path / filename`` as CSV, without the index.

        The output directory (including missing parents) is created first.
        """
        output_dir = Path(self.output_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / self.filename
        self.df.to_csv(output_file, index=False)
        # Log only after the write succeeded, so a failed save is never
        # reported as a success.
        logger.info(f"File saved in directory {output_dir} as : {self.filename}")

    def dropping_unncessary_columns(self) -> None:
        """Remove every column listed in ``COLUMNS_TO_DROP`` from ``self.df``.

        Raises:
            KeyError: If any listed column is absent. All columns are dropped
                in a single call, so on failure the DataFrame is left
                untouched rather than partially stripped.
        """
        self.df.drop(columns=list(self.COLUMNS_TO_DROP), inplace=True)

    def run_pipeline(self) -> None:
        """Drop the unused columns, report remaining nulls, and save the CSV."""
        logger.info("Dropping Unnecessary Columns")
        self.dropping_unncessary_columns()

        logger.info("Checking Again for NULL values in Columns")
        # Count nulls once for the whole frame instead of twice per column.
        null_counts = self.df.isnull().sum()
        for col, count in null_counts[null_counts > 0].items():
            logger.warning(f"{col} has {count} null values")

        logger.info("Saving Cleaned and Preprocessed Data")
        self.saving_cleaned_preprocess()