Sean Pedrick-Case committed
Commit baabf97 · unverified · 2 parents: 643a230 69c2af9

Merge pull request #20 from seanpedrick-case/dev

DynamoDB logging format/example and minor text revisions

DocRedactApp_0.6.1.spec → DocRedactApp.spec RENAMED
@@ -43,7 +43,7 @@ exe = EXE(
     a.scripts,
     [],
     exclude_binaries=True,
-    name='DocRedactApp_0.4.0',
+    name='DocRedactApp_0.6.2',
     debug=False,
     bootloader_ignore_signals=False,
     strip=False,
@@ -62,5 +62,5 @@ coll = COLLECT(
     strip=False,
     upx=True,
     upx_exclude=[],
-    name='DocRedactApp_0.6.1',
+    name='DocRedactApp_0.6.2',
 )
README.md CHANGED
@@ -10,6 +10,8 @@ license: agpl-3.0
 ---
 # Document redaction
 
+version: 0.6.2
+
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
 To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works ok for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
how_to_create_exe_dist.txt CHANGED
@@ -16,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9.Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.4.0 app.py
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -32,7 +32,7 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.4.0.spec
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
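
The hook file in step b) is only referenced, not shown, in this diff. For orientation, a minimal PyInstaller hook for presidio_image_redactor might look like the sketch below; the collect_data_files call is an assumption about what the package needs bundled, not the repo's actual hook contents.

```python
# hook-presidio-image-redactor.py, a hypothetical sketch (not the repo's actual hook)
from PyInstaller.utils.hooks import collect_data_files

# Bundle the package's non-Python data files so they can be found at runtime
datas = collect_data_files("presidio_image_redactor")
```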
load_dynamo_logs.py ADDED
@@ -0,0 +1,56 @@
+import boto3
+import csv
+from decimal import Decimal
+from boto3.dynamodb.conditions import Key
+
+from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
+
+# Replace with your actual table name and region
+TABLE_NAME = USAGE_LOG_DYNAMODB_TABLE_NAME # Choose as appropriate
+REGION = AWS_REGION
+CSV_OUTPUT = OUTPUT_FOLDER + 'dynamodb_logs_export.csv'
+
+# Create DynamoDB resource
+dynamodb = boto3.resource('dynamodb', region_name=REGION)
+table = dynamodb.Table(TABLE_NAME)
+
+# Helper function to convert Decimal to float or int
+def convert_types(item):
+    for key, value in item.items():
+        if isinstance(value, Decimal):
+            # Convert to int if no decimal places, else float
+            item[key] = int(value) if value % 1 == 0 else float(value)
+    return item
+
+# Paginated scan
+def scan_table():
+    items = []
+    response = table.scan()
+    items.extend(response['Items'])
+
+    while 'LastEvaluatedKey' in response:
+        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
+        items.extend(response['Items'])
+
+    return items
+
+# Export to CSV
+def export_to_csv(items, output_path):
+    if not items:
+        print("No items found.")
+        return
+
+    fieldnames = sorted(items[0].keys())
+
+    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+        writer.writeheader()
+
+        for item in items:
+            writer.writerow(convert_types(item))
+
+    print(f"Exported {len(items)} items to {output_path}")
+
+# Run export
+items = scan_table()
+export_to_csv(items, CSV_OUTPUT)
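
The script imports Key but only performs a paginated full-table scan. If only a known partition is needed, a query avoids reading every item; a minimal sketch, assuming a hypothetical partition key named 'session_hash' (the real table's key schema is not shown in this commit):

```python
# Hypothetical alternative to scan_table(): query one partition instead of
# scanning the whole table. 'session_hash' is an assumed key name, not the
# repo's actual schema.
def query_by_key(partition_value):
    items = []
    response = table.query(KeyConditionExpression=Key('session_hash').eq(partition_value))
    items.extend(response['Items'])

    # Queries paginate the same way scans do
    while 'LastEvaluatedKey' in response:
        response = table.query(
            KeyConditionExpression=Key('session_hash').eq(partition_value),
            ExclusiveStartKey=response['LastEvaluatedKey'],
        )
        items.extend(response['Items'])

    return items
```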
load_s3_logs.py CHANGED
@@ -2,7 +2,7 @@ import boto3
 import pandas as pd
 from io import StringIO
 from datetime import datetime
-from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
+from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, OUTPUT_FOLDER
 
 # Combine together log files that can be then used for e.g. dashboarding and financial tracking.
 
@@ -71,7 +71,7 @@ if df_list:
     concatenated_df = pd.concat(df_list, ignore_index=True)
 
     # Save the concatenated DataFrame to a CSV file
-    concatenated_df.to_csv('consolidated_logs.csv', index=False)
-    print("Consolidated CSV saved as 'consolidated_logs.csv'")
+    concatenated_df.to_csv(OUTPUT_FOLDER + 'consolidated_s3_logs.csv', index=False)
+    print("Consolidated CSV saved as 'consolidated_s3_logs.csv'")
 else:
     print("No log files found in the given date range.")
pyproject.toml CHANGED
@@ -3,11 +3,11 @@ requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "doc_redaction" # Your application's name
-version = "0.6.1" # Your application's current version
-description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface" # A short description
-readme = "README.md" # Path to your project's README file
-requires-python = ">=3.10" # The minimum Python version required
+name = "doc_redaction"
+version = "0.6.2"
+description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
+readme = "README.md"
+requires-python = ">=3.10"
 
 dependencies = [
     "pdfminer.six==20240706",
@@ -45,13 +45,12 @@ repository = "https://github.com/seanpedrick-case/doc_redaction"
 [project.optional-dependencies]
 dev = ["pytest"]
 
-# Optional: You can add configuration for tools used in your project under the [tool] section
-# For example, configuration for a linter like Ruff:
+# Configuration for Ruff linter:
 [tool.ruff]
 line-length = 88
 select = ["E", "F", "I"]
 
-# Optional: Configuration for a formatter like Black:
+# Configuration for a Black formatter:
 [tool.black]
 line-length = 88
 target-version = ['py310']
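
With the [tool.ruff] and [tool.black] sections in place, both tools read this configuration from pyproject.toml automatically when invoked from the project root (e.g. `ruff check .` and `black .`), so no separate config files are needed.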
tools/custom_csvlogger.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 import contextlib
 import csv
 import datetime
+from datetime import datetime
 import os
 import re
 import boto3
@@ -177,7 +178,7 @@ class CSVLogger_custom(FlaggingCallback):
         csv_data.append(username)
 
 
-        timestamp = str(datetime.datetime.now())
+        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
         csv_data.append(timestamp)
 
         generated_id = str(uuid.uuid4())
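
For reference, Athena's TIMESTAMP type parses values in the form yyyy-MM-dd HH:mm:ss.SSS (millisecond precision), which is why the new code trims the six-digit microsecond output of %f down to three digits. A minimal sketch of the conversion:

```python
from datetime import datetime

# %f yields microseconds (6 digits); [:-3] truncates to milliseconds,
# e.g. '2025-01-15 13:45:30.123456' -> '2025-01-15 13:45:30.123'
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
print(timestamp)
```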