Sean Pedrick-Case committed
Commit baabf97 · unverified · 2 parents: 643a230 69c2af9

Merge pull request #20 from seanpedrick-case/dev

DynamoDB logging format/example and minor text revisions

DocRedactApp_0.6.1.spec → DocRedactApp.spec RENAMED
@@ -43,7 +43,7 @@ exe = EXE(
     a.scripts,
     [],
     exclude_binaries=True,
-    name='DocRedactApp_0.4.0',
+    name='DocRedactApp_0.6.2',
     debug=False,
     bootloader_ignore_signals=False,
     strip=False,
@@ -62,5 +62,5 @@ coll = COLLECT(
     strip=False,
     upx=True,
     upx_exclude=[],
-    name='DocRedactApp_0.6.1',
+    name='DocRedactApp_0.6.2',
 )
README.md CHANGED
@@ -10,6 +10,8 @@ license: agpl-3.0
 ---
 # Document redaction
 
+version: 0.6.2
+
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
 To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
how_to_create_exe_dist.txt CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.4.0 app.py
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -32,7 +32,7 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.4.0.spec
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
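
The hook file in step b) is only referenced, not shown, in this diff. For orientation, a minimal PyInstaller hook for presidio_image_redactor might look like the sketch below; the collect_data_files call is an assumption about what the package needs bundled, not the repo's actual hook contents.

```python
# hook-presidio-image-redactor.py, a hypothetical sketch (not the repo's actual hook)
from PyInstaller.utils.hooks import collect_data_files

# Bundle the package's non-Python data files so they can be found at runtime
datas = collect_data_files("presidio_image_redactor")
```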
load_dynamo_logs.py ADDED
@@ -0,0 +1,56 @@
+import boto3
+import csv
+from decimal import Decimal
+from boto3.dynamodb.conditions import Key
+
+from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
+
+# Replace with your actual table name and region
+TABLE_NAME = USAGE_LOG_DYNAMODB_TABLE_NAME # Choose as appropriate
+REGION = AWS_REGION
+CSV_OUTPUT = OUTPUT_FOLDER + 'dynamodb_logs_export.csv'
+
+# Create DynamoDB resource
+dynamodb = boto3.resource('dynamodb', region_name=REGION)
+table = dynamodb.Table(TABLE_NAME)
+
+# Helper function to convert Decimal to float or int
+def convert_types(item):
+    for key, value in item.items():
+        if isinstance(value, Decimal):
+            # Convert to int if no decimal places, else float
+            item[key] = int(value) if value % 1 == 0 else float(value)
+    return item
+
+# Paginated scan
+def scan_table():
+    items = []
+    response = table.scan()
+    items.extend(response['Items'])
+
+    while 'LastEvaluatedKey' in response:
+        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
+        items.extend(response['Items'])
+
+    return items
+
+# Export to CSV
+def export_to_csv(items, output_path):
+    if not items:
+        print("No items found.")
+        return
+
+    fieldnames = sorted(items[0].keys())
+
+    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+
+        for item in items:
+            writer.writerow(convert_types(item))
+
+    print(f"Exported {len(items)} items to {output_path}")
+
+# Run export
+items = scan_table()
+export_to_csv(items, CSV_OUTPUT)
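
The script imports Key but only performs a paginated full-table scan. If only a known partition is needed, a query avoids reading every item; a minimal sketch, assuming a hypothetical partition key named 'session_hash' (the real table's key schema is not shown in this commit):

```python
# Hypothetical alternative to scan_table(): query one partition instead of
# scanning the whole table. 'session_hash' is an assumed key name, not the
# repo's actual schema.
def query_by_key(partition_value):
    items = []
    response = table.query(KeyConditionExpression=Key('session_hash').eq(partition_value))
    items.extend(response['Items'])

    # Queries paginate the same way scans do
    while 'LastEvaluatedKey' in response:
        response = table.query(
            KeyConditionExpression=Key('session_hash').eq(partition_value),
            ExclusiveStartKey=response['LastEvaluatedKey'],
        )
        items.extend(response['Items'])

    return items
```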
load_s3_logs.py CHANGED
@@ -2,7 +2,7 @@ import boto3
 import pandas as pd
 from io import StringIO
 from datetime import datetime
-from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
+from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, OUTPUT_FOLDER
 
 # Combine together log files that can be then used for e.g. dashboarding and financial tracking.
 
@@ -71,7 +71,7 @@ if df_list:
     concatenated_df = pd.concat(df_list, ignore_index=True)
 
     # Save the concatenated DataFrame to a CSV file
-    concatenated_df.to_csv('consolidated_logs.csv', index=False)
-    print("Consolidated CSV saved as 'consolidated_logs.csv'")
+    concatenated_df.to_csv(OUTPUT_FOLDER + 'consolidated_s3_logs.csv', index=False)
+    print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
 else:
     print("No log files found in the given date range.")
pyproject.toml CHANGED
@@ -3,11 +3,11 @@ requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "doc_redaction" # Your application's name
-version = "0.6.1" # Your application's current version
-description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
-readme = "README.md" # Path to your project's README file
-requires-python = ">=3.10" # The minimum Python version required
+name = "doc_redaction"
+version = "0.6.2"
+description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
+readme = "README.md"
+requires-python = ">=3.10"
 
 dependencies = [
     "pdfminer.six==20240706",
@@ -45,13 +45,12 @@ repository = "https://github.com/seanpedrick-case/doc_redaction"
 [project.optional-dependencies]
 dev = ["pytest"]
 
-# Optional: You can add configuration for tools used in your project under the [tool] section
-# For example, configuration for a linter like Ruff:
+# Configuration for Ruff linter:
 [tool.ruff]
 line-length = 88
 select = ["E", "F", "I"]
 
-# Optional: Configuration for a formatter like Black:
+# Configuration for a Black formatter:
 [tool.black]
 line-length = 88
 target-version = ['py310']
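
With the [tool.ruff] and [tool.black] sections in place, both tools read this configuration from pyproject.toml automatically when invoked from the project root (e.g. `ruff check .` and `black .`), so no separate config files are needed.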
tools/custom_csvlogger.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 import contextlib
 import csv
 import datetime
+from datetime import datetime
 import os
 import re
 import boto3
@@ -177,7 +178,7 @@ class CSVLogger_custom(FlaggingCallback):
         csv_data.append(username)
 
 
-        timestamp = str(datetime.datetime.now())
+        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
         csv_data.append(timestamp)
 
         generated_id = str(uuid.uuid4())
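
For reference, Athena's TIMESTAMP type parses values in the form yyyy-MM-dd HH:mm:ss.SSS (millisecond precision), which is why the new code trims the six-digit microsecond output of %f down to three digits. A minimal sketch of the conversion:

```python
from datetime import datetime

# %f yields microseconds (6 digits); [:-3] truncates to milliseconds,
# e.g. '2025-01-15 13:45:30.123456' -> '2025-01-15 13:45:30.123'
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
print(timestamp)
```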