Commit eada14e
Parent(s): f6e6d80

Added example .py file for accessing and downloading logs from S3

load_s3_logs.py +66 -0
load_s3_logs.py
ADDED
@@ -0,0 +1,66 @@
import boto3
import pandas as pd
from io import StringIO
from datetime import datetime
from tools.config import DOCUMENT_REDACTION_BUCKET

# S3 setup
s3 = boto3.client('s3')
bucket_name = DOCUMENT_REDACTION_BUCKET
prefix = 'logs'  # Top-level folder where logs are stored - change as needed, e.g. 'usage/' or 'feedback/'
earliest_date = '20250401'  # Earliest date of logs folder retrieved
latest_date = '20250412'  # Latest date of logs folder retrieved

# Function to list all files under a prefix
def list_files_in_s3(bucket, prefix):
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    if 'Contents' in response:
        return [content['Key'] for content in response['Contents']]
    return []

# Function to check whether a date string falls within the date range
def is_within_date_range(date_str, start_date, end_date):
    try:
        date_obj = datetime.strptime(date_str, '%Y%m%d')
    except ValueError:
        # The path segment is not a YYYYMMDD date folder - skip it
        return False
    return start_date <= date_obj <= end_date

# Define the date range from the dates configured above
start_date = datetime.strptime(earliest_date, '%Y%m%d')
end_date = datetime.strptime(latest_date, '%Y%m%d')

# List all files under the prefix
all_files = list_files_in_s3(bucket_name, prefix)

# Keep only keys shaped like '<prefix>/<YYYYMMDD>/.../log.csv' whose
# date folder falls within the date range
log_files = []
for file in all_files:
    parts = file.split('/')
    if len(parts) >= 3:
        date_str = parts[1]
        if is_within_date_range(date_str, start_date, end_date) and parts[-1] == 'log.csv':
            log_files.append(file)

# Download, read and concatenate CSV files into a pandas DataFrame
df_list = []
for log_file in log_files:
    # Download the file
    obj = s3.get_object(Bucket=bucket_name, Key=log_file)
    csv_content = obj['Body'].read().decode('utf-8')

    # Read CSV content into a pandas DataFrame
    try:
        df = pd.read_csv(StringIO(csv_content))
    except Exception as e:
        print("Could not load log file:", log_file, "due to:", e)
        continue

    df_list.append(df)

# Concatenate all DataFrames
if df_list:
    concatenated_df = pd.concat(df_list, ignore_index=True)

    # Save the concatenated DataFrame to a CSV file
    concatenated_df.to_csv('consolidated_logs.csv', index=False)
    print("Consolidated CSV saved as 'consolidated_logs.csv'")
else:
    print("No log files found in the given date range.")
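
Note: list_objects_v2 returns at most 1,000 keys per request, so list_files_in_s3 above will silently truncate results once more log files than that accumulate under the prefix. A minimal paginated sketch of the same listing step (the name list_files_in_s3_paginated is illustrative, not part of this commit):

def list_files_in_s3_paginated(bucket, prefix):
    # Reuses the s3 client defined in the script above
    keys = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages with no matching keys carry no 'Contents' entry
        keys.extend(content['Key'] for content in page.get('Contents', []))
    return keys

The script assumes AWS credentials are available through the usual boto3 chain (environment variables, shared credentials file, or an instance role), and that log keys are laid out as logs/<YYYYMMDD>/.../log.csv so that the second path segment is the date folder.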