Spaces:

Askhedi
/

flaskapp

Sleeping

flaskapp / src /clean_data.py

OVH

Added all the files

67b1c6c 10 days ago

3.21 kB

	# clean_data.py - drops unused columns from the Yelp review data and saves the result as CSV
	from loguru import logger
	import pandas as pd
	import numpy as np
	from dataclasses import dataclass
	from typing import Dict, List, Optional, Tuple
	import json
	from pathlib import Path
	import logging
	from scipy.stats import entropy
	import warnings
	from datetime import datetime
	import matplotlib.pyplot as plt
	import seaborn as sns
	import re
	from textblob import TextBlob
	import os
	from pathlib import Path

	class DataCleaner:
	    """Drop unused columns from a DataFrame and write the result to CSV.

	    The frame passed to the constructor is stored by reference and is
	    modified in place, so the caller's object loses the dropped columns too.
	    """

	    # Columns removed by ``dropping_unncessary_columns``.
	    COLUMNS_TO_DROP = (
	        "review_text",
	        "review_date",
	        "business_name",
	        "address",
	        "city",
	        "state",
	        "postal_code",
	        "categories",
	        "user_name",
	        "yelping_since",
	        "checkin_date",
	        "review_useful",
	        "review_funny",
	        "review_cool",
	        "user_useful",
	        "user_funny",
	        "user_cool",
	        "is_open",
	        "compliment_hot",
	        "compliment_more",
	        "compliment_profile",
	        "compliment_cute",
	        "compliment_list",
	        "compliment_note",
	        "compliment_plain",
	        "compliment_cool",
	        "compliment_funny",
	        "compliment_writer",
	        "compliment_photos",
	    )

	    def __init__(self, df, output_path, filename="preprocessed_cleaned.csv"):
	        """Store the frame to clean and the location of the output file.

	        Args:
	            df: DataFrame to clean; kept by reference, not copied.
	            output_path: Directory that will hold the output file. It is
	                created (including parents) when the file is saved.
	            filename: Name of the CSV file written inside ``output_path``.
	        """
	        self.df = df
	        self.output_path = output_path
	        self.filename = filename

	    def saving_cleaned_preprocess(self):
	        """Write ``self.df`` to ``output_path / filename`` as CSV, without the index."""
	        output_dir = Path(self.output_path)
	        output_dir.mkdir(parents=True, exist_ok=True)

	        output_file = output_dir / self.filename
	        self.df.to_csv(output_file, index=False)
	        # Log only after the write returned, so the message never reports a
	        # save that actually failed.
	        logger.info(f"Files saved in directory {output_file} as : { self.filename}")

	    def dropping_unncessary_columns(self):
	        """Remove every column listed in ``COLUMNS_TO_DROP`` from ``self.df`` in place.

	        All columns are dropped in a single call, so when one of them is
	        missing the ``KeyError`` is raised before the frame is modified.

	        Raises:
	            KeyError: If any listed column is absent from ``self.df``.
	        """
	        self.df.drop(columns=list(self.COLUMNS_TO_DROP), inplace=True)

	    def run_pipeline(self):
	        """Drop the unused columns, log the remaining null counts, and save the CSV."""
	        logger.info("Dropping Unnecessary Columns")
	        self.dropping_unncessary_columns()

	        logger.info("Checking Again for NULL values in Columns")
	        # One pass over the frame gives the null count of every column.
	        null_counts = self.df.isnull().sum()
	        for col, count in null_counts[null_counts > 0].items():
	            logger.info(f" {col} has {count} null values")

	        logger.info("Saving Cleaned and Preprocessed Data")
	        self.saving_cleaned_preprocess()