Spaces:
Runtime error
Runtime error
File size: 3,021 Bytes
92b63f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
import pandas as pd
from .zip_extractor import ZipExtractor
import logging
class DataIngestion:
    """Extract a zip archive and load the first CSV it contains into pandas."""

    @staticmethod
    def _configure_logging() -> None:
        """Attach the file and console handlers to the root logger, once."""
        # basicConfig() ignores its arguments when the root logger already has
        # handlers, but constructing FileHandler(..., 'w+') still opens -- and
        # truncates -- logging.log. Bail out before building the handlers so
        # repeated calls neither wipe the live log nor leak file handles.
        if logging.getLogger().handlers:
            return
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler("logging.log", 'w+'),
                logging.StreamHandler(),
            ],
        )

    def data_ingestion(self, zip_path: str) -> pd.DataFrame:
        """
        Extract a zip file and read the first CSV found into a DataFrame.

        Extraction is delegated to the project-local ZipExtractor; this method
        only relies on its ``extract_files()`` method and ``output_folder``
        attribute. Every failure is logged and then re-raised unchanged.

        Parameters:
            zip_path : str
                Path to the zip file containing the CSV files.

        Returns:
            pd.DataFrame
                A pandas DataFrame containing the data from the CSV file.
                When several CSV files are present, the first one in sorted
                (lexicographic) filename order is loaded.

        Raises:
            FileNotFoundError
                If no ``.csv`` file exists in the output folder (or a path
                could not be found during extraction/reading).
            pd.errors.EmptyDataError
                If the selected CSV file is empty.
            pd.errors.ParserError
                If the selected CSV file cannot be parsed.
            Exception
                Any other error raised during extraction or reading.
        """
        self._configure_logging()

        try:
            # Initialize ZipExtractor with specified zip path
            extractor = ZipExtractor(zip_path=zip_path)
            logging.info(f"Initialized ZipExtractor with path: {zip_path}")

            # Extract files and ensure CSVs are in the specified folder
            extractor.extract_files()
            logging.info(f"CSV Files are extracted from {zip_path}.")

            # Get the output folder where CSV files are extracted
            output_folder = extractor.output_folder
            logging.info(f"Extracted files are located in: {output_folder}")

            # os.listdir() returns entries in arbitrary order; sort so that
            # "the first CSV" is the same file on every run and platform.
            csv_files = sorted(
                file for file in os.listdir(output_folder) if file.endswith('.csv')
            )
            if not csv_files:
                logging.error("No CSV files found in the extracted folder.")
                raise FileNotFoundError("No CSV files found in the extracted folder.")

            # Read the first CSV file found into a DataFrame
            csv_path = os.path.join(output_folder, csv_files[0])
            data = pd.read_csv(csv_path)
            logging.info(f"Successfully loaded data from {csv_files[0]}")
            return data

        except FileNotFoundError as e:
            logging.error(f"Error: {e}")
            raise
        except pd.errors.EmptyDataError:
            logging.error("Error: The CSV file is empty.")
            raise
        except pd.errors.ParserError:
            logging.error("Error: The CSV file contains parsing errors.")
            raise
        except Exception as e:
            logging.error(f"An unexpected error occurred during data ingestion: {e}")
            raise
if __name__ == "__main__":
    # Running this module directly currently does nothing: the guard body is
    # only `pass`. The commented-out snippet below is kept as an example of
    # calling DataIngestion.data_ingestion and logging a failure.
    # NOTE(review): the module uses a relative import (`from .zip_extractor`),
    # so it presumably has to be run as part of its package (e.g. with
    # `python -m <package>.<module>`) for this example to work -- confirm.
    # Example usage
    # data_ingest = DataIngestion()
    # try:
    #     df = data_ingest.data_ingestion("data/raw/customer_churn_dataset-training-master.csv.zip")
    # except Exception as e:
    #     logging.error(f"Data ingestion failed: {e}")
    pass
|