seanpedrickcase committed
Commit eada14e · 1 Parent(s): f6e6d80

Added an example .py file for accessing and downloading logs from S3

Files changed (1)
  1. load_s3_logs.py +66 -0
load_s3_logs.py ADDED
@@ -0,0 +1,66 @@
+ import boto3
+ import pandas as pd
+ from io import StringIO
+ from datetime import datetime
+ from tools.config import DOCUMENT_REDACTION_BUCKET
+
+ # S3 setup
+ s3 = boto3.client('s3')
+ bucket_name = DOCUMENT_REDACTION_BUCKET
+ prefix = 'logs'  # or 'usage/', 'feedback/' - change as needed; top-level folder where the logs are stored
+ earliest_date = '20250401'  # Earliest date of logs folder retrieved
+ latest_date = '20250412'  # Latest date of logs folder retrieved
+
+ # Function to list all files under a prefix
+ def list_files_in_s3(bucket, prefix):
+     response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
+     if 'Contents' in response:
+         return [content['Key'] for content in response['Contents']]
+     return []
+
+ # Function to check whether a YYYYMMDD folder name falls within the date range
+ def is_within_date_range(date_str, start_date, end_date):
+     date_obj = datetime.strptime(date_str, '%Y%m%d')
+     return start_date <= date_obj <= end_date
+
+ # Define the date range
+ start_date = datetime.strptime(earliest_date, '%Y%m%d')
+ end_date = datetime.strptime(latest_date, '%Y%m%d')
+
+ # List all files under the chosen prefix
+ all_files = list_files_in_s3(bucket_name, prefix)
+
+ # Keep only 'log.csv' keys whose date folder falls within the date range
+ log_files = []
+ for file in all_files:
+     parts = file.split('/')
+     if len(parts) >= 3:
+         date_str = parts[1]
+         if is_within_date_range(date_str, start_date, end_date) and parts[-1] == 'log.csv':
+             log_files.append(file)
+
+ # Download, read and concatenate the CSV files into a single pandas DataFrame
+ df_list = []
+ for log_file in log_files:
+     # Download the file
+     obj = s3.get_object(Bucket=bucket_name, Key=log_file)
+     csv_content = obj['Body'].read().decode('utf-8')
+
+     # Read the CSV content into a pandas DataFrame
+     try:
+         df = pd.read_csv(StringIO(csv_content))
+     except Exception as e:
+         print("Could not load in log file:", log_file, "due to:", e)
+         continue
+
+     df_list.append(df)
+
+ # Concatenate all DataFrames
+ if df_list:
+     concatenated_df = pd.concat(df_list, ignore_index=True)
+
+     # Save the concatenated DataFrame to a CSV file
+     concatenated_df.to_csv('consolidated_logs.csv', index=False)
+     print("Consolidated CSV saved as 'consolidated_logs.csv'")
+ else:
+     print("No log files found in the given date range.")
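
A note on the key layout this script assumes (an editor's reading of the filter logic, not stated in the commit): object keys are expected to look like '<prefix>/<YYYYMMDD>/<something>/log.csv', since the second path segment is parsed as the date folder and only keys whose final segment is 'log.csv' are kept. Two practical caveats follow from the boto3 calls used. First, list_objects_v2 returns at most 1,000 keys per call, so for larger log folders a paginator-based variant of list_files_in_s3 would be needed; a minimal sketch, reusing the same module-level s3 client as the script, is shown below. Second, is_within_date_range raises a ValueError if a folder name under the prefix is not a YYYYMMDD date, so non-date folders should be kept out of the prefix (or the strptime call wrapped in a try/except).

    def list_files_in_s3(bucket, prefix):
        # Paginate so prefixes holding more than 1,000 objects are listed in full
        paginator = s3.get_paginator('list_objects_v2')
        keys = []
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            keys.extend(obj['Key'] for obj in page.get('Contents', []))
        return keys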