|
import boto3 |
|
import pandas as pd |
|
from io import StringIO |
|
from datetime import datetime |
|
from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION |
|
|
|
|
|
|
|
|
|
# Build the shared S3 client. Explicit credentials are passed only when the
# access key, secret key and region are all configured; if any one of them is
# missing the client is created with no explicit arguments at all.
if not (AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION):
    s3 = boto3.client('s3')
else:
    s3 = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION,
    )
|
|
|
# Where to look: bucket taken from the project configuration, and the key
# prefix under which the usage logs are listed.
bucket_name, prefix = DOCUMENT_REDACTION_BUCKET, 'usage/'

# Date window to consolidate, written as YYYYMMDD strings. Both bounds are
# treated as inclusive by is_within_date_range().
earliest_date, latest_date = '20250409', '20250423'
|
|
|
|
|
def list_files_in_s3(bucket, prefix, client=None):
    """Return every object key in ``bucket`` whose key starts with ``prefix``.

    A single ``list_objects_v2`` call returns at most 1,000 keys, so the
    listing is walked page by page; otherwise any keys beyond the first page
    would be silently dropped.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix used to restrict the listing.
        client: Optional S3 client to list with. When omitted, the
            module-level ``s3`` client is used.

    Returns:
        A list of object keys in listing order; empty when nothing matches.
    """
    s3_client = s3 if client is None else client
    paginator = s3_client.get_paginator('list_objects_v2')

    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # A page with no matching objects has no 'Contents' entry.
        keys.extend(item['Key'] for item in page.get('Contents', []))
    return keys
|
|
|
|
|
def is_within_date_range(date_str, start_date, end_date):
    """Report whether ``date_str`` falls inside ``[start_date, end_date]``.

    Args:
        date_str: Date text in ``%Y%m%d`` form (e.g. ``'20250409'``).
        start_date: Inclusive lower bound as a ``datetime``.
        end_date: Inclusive upper bound as a ``datetime``.

    Returns:
        True when the parsed date lies within the bounds (both inclusive).
        False when it lies outside them, or when ``date_str`` is not a
        valid ``%Y%m%d`` date.
    """
    try:
        date_obj = datetime.strptime(date_str, '%Y%m%d')
    except ValueError:
        # Callers feed in raw key path segments, which are not guaranteed to
        # be dates. An unparsable segment is simply "not in range" rather
        # than a reason to abort the whole run.
        return False
    return start_date <= date_obj <= end_date
|
|
|
|
|
# Turn the configured YYYYMMDD window bounds into datetime objects once, so
# each key's date can be compared against them directly.
start_date, end_date = (
    datetime.strptime(bound, '%Y%m%d') for bound in (earliest_date, latest_date)
)

# Every key found under the configured prefix.
all_files = list_files_in_s3(bucket_name, prefix)

# Keep only keys that have at least three path segments, whose second segment
# is a date inside the window, and whose final segment is exactly 'log.csv'.
log_files = []
for key in all_files:
    segments = key.split('/')
    if len(segments) < 3:
        continue
    in_window = is_within_date_range(segments[1], start_date, end_date)
    if in_window and segments[-1] == 'log.csv':
        log_files.append(key)
|
|
|
|
|
# Download each selected log file and parse it into a DataFrame. Files that
# cannot be parsed as CSV are reported and skipped so that one bad file does
# not stop the consolidation.
df_list = []
for log_file in log_files:
    obj = s3.get_object(Bucket=bucket_name, Key=log_file)

    # Read the streaming body exactly once. A second read() on a consumed
    # stream yields empty bytes, so retrying the read for the fallback
    # encoding would always decode to an empty string and lose the file.
    raw_bytes = obj['Body'].read()
    try:
        csv_content = raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every byte value to a character, so this fallback
        # cannot itself fail to decode.
        csv_content = raw_bytes.decode('latin-1')

    try:
        df = pd.read_csv(StringIO(csv_content))
    except Exception as e:
        # Deliberately best-effort: report the unreadable file and move on.
        print("Could not load in log file:", log_file, "due to:", e)
        continue

    df_list.append(df)
|
|
|
|
|
# Write the combined output, or report that there was nothing to combine.
if not df_list:
    print("No log files found in the given date range.")
else:
    # Stack all per-file frames into one table, discarding their original
    # row indexes, and save it without an index column.
    concatenated_df = pd.concat(df_list, ignore_index=True)
    concatenated_df.to_csv('consolidated_logs.csv', index=False)
    print("Consolidated CSV saved as 'consolidated_logs.csv'")
|
|