Commit
·
2b56cca
1
Parent(s):
24196b2
Minor package updates, cleaned up README and pyproject files related to installation. Updated example_config.env
Browse files- README.md +1 -1
- example_config.env +24 -4
- pyproject.toml +40 -7
- requirements.txt +4 -4
- requirements_lightweight.txt +3 -3
README.md
CHANGED
|
@@ -112,7 +112,7 @@ source venv/bin/activate
|
|
| 112 |
This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required spaCy models and other packages directly from their URLs.
|
| 113 |
|
| 114 |
```bash
|
| 115 |
-
pip install .
|
| 116 |
```
|
| 117 |
|
| 118 |
Alternatively, you can install from the `requirements_lightweight.txt` file:
|
|
|
|
| 112 |
This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required spaCy models and other packages directly from their URLs.
|
| 113 |
|
| 114 |
```bash
|
| 115 |
+
pip install .
|
| 116 |
```
|
| 117 |
|
| 118 |
Alternatively, you can install from the `requirements_lightweight.txt` file:
|
example_config.env
CHANGED
|
@@ -1,12 +1,27 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
SHOW_LANGUAGE_SELECTION=True
|
|
|
|
|
|
|
|
|
|
| 4 |
CHOSEN_LOCAL_OCR_MODEL=tesseract
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
DISPLAY_FILE_NAMES_IN_LOGS=False
|
| 8 |
|
| 9 |
-
RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions
|
| 10 |
SAVE_LOGS_TO_DYNAMODB=True
|
| 11 |
S3_COST_CODES_PATH=cost_codes.csv
|
| 12 |
SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
|
|
@@ -24,3 +39,8 @@ GET_COST_CODES=True
|
|
| 24 |
COST_CODES_PATH=config/cost_codes.csv
|
| 25 |
ENFORCE_COST_CODES=True
|
| 26 |
DEFAULT_COST_CODE=example_cost_code
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Rename this file to app_config.env and place it in the config/ folder (i.e. it will be located at app_base_folder/config/app_config.env). The app will then automatically load these variables at startup. See tools/config.py for all the possible config variables you can set, or src/app_settings.qmd for descriptions. Below are some suggested config variables to start with.
|
| 2 |
+
|
| 3 |
+
TESSERACT_FOLDER=tesseract/ # If in a custom folder, not needed if in PATH
|
| 4 |
+
POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/ # If in a custom folder; not needed if in PATH
|
| 5 |
SHOW_LANGUAGE_SELECTION=True
|
| 6 |
+
SHOW_PADDLE_MODEL_OPTIONS=False
|
| 7 |
+
SHOW_VLM_MODEL_OPTIONS=False
|
| 8 |
+
SHOW_LOCAL_OCR_MODEL_OPTIONS=True
|
| 9 |
CHOSEN_LOCAL_OCR_MODEL=tesseract
|
| 10 |
|
| 11 |
+
SAVE_EXAMPLE_HYBRID_IMAGES=True
|
| 12 |
+
SAVE_PAGE_OCR_VISUALISATIONS=True
|
| 13 |
+
OVERWRITE_EXISTING_OCR_RESULTS=False
|
| 14 |
+
CONVERT_LINE_TO_WORD_LEVEL=False
|
| 15 |
+
LOAD_PADDLE_AT_STARTUP=False
|
| 16 |
+
SAVE_VLM_INPUT_IMAGES=True
|
| 17 |
+
SAVE_WORD_SEGMENTER_OUTPUT_IMAGES=True
|
| 18 |
+
PREPROCESS_LOCAL_OCR_IMAGES=False
|
| 19 |
+
SAVE_PREPROCESS_IMAGES=True
|
| 20 |
+
|
| 21 |
+
SESSION_OUTPUT_FOLDER=False # Save outputs into user session folders
|
| 22 |
DISPLAY_FILE_NAMES_IN_LOGS=False
|
| 23 |
|
| 24 |
+
RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions. You can remove all the environment variables in the following section if you don't want to use them.
|
| 25 |
SAVE_LOGS_TO_DYNAMODB=True
|
| 26 |
S3_COST_CODES_PATH=cost_codes.csv
|
| 27 |
SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
|
|
|
|
| 39 |
COST_CODES_PATH=config/cost_codes.csv
|
| 40 |
ENFORCE_COST_CODES=True
|
| 41 |
DEFAULT_COST_CODE=example_cost_code
|
| 42 |
+
|
| 43 |
+
CUSTOM_BOX_COLOUR=(128, 128, 128)
|
| 44 |
+
USE_GUI_BOX_COLOURS_FOR_OUTPUTS=False
|
| 45 |
+
|
| 46 |
+
|
pyproject.toml
CHANGED
|
@@ -2,15 +2,40 @@
|
|
| 2 |
requires = ["setuptools>=61.0", "wheel"]
|
| 3 |
build-backend = "setuptools.build_meta"
|
| 4 |
|
| 5 |
-
[project.urls]
|
| 6 |
-
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
|
| 7 |
-
Repository = "https://github.com/seanpedrick-case/doc_redaction"
|
| 8 |
-
|
| 9 |
[project]
|
| 10 |
name = "doc_redaction"
|
| 11 |
version = "1.5.3"
|
| 12 |
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
|
| 13 |
readme = "README.md"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
requires-python = ">=3.10"
|
| 15 |
dependencies = [
|
| 16 |
"pdfminer.six==20251107",
|
|
@@ -24,10 +49,10 @@ dependencies = [
|
|
| 24 |
"pikepdf==9.11.0",
|
| 25 |
"pandas==2.3.3",
|
| 26 |
"scikit-learn==1.7.2",
|
| 27 |
-
"spacy==3.8.
|
| 28 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
| 29 |
"gradio==5.49.1",
|
| 30 |
-
"boto3==1.40.
|
| 31 |
"pyarrow==21.0.0",
|
| 32 |
"openpyxl==3.1.5",
|
| 33 |
"Faker==37.8.0",
|
|
@@ -38,9 +63,10 @@ dependencies = [
|
|
| 38 |
"python-dotenv==1.0.1",
|
| 39 |
"awslambdaric==3.1.1",
|
| 40 |
"python-docx==1.2.0",
|
| 41 |
-
"polars==1.
|
| 42 |
"defusedxml==0.7.1",
|
| 43 |
"numpy==2.2.6",
|
|
|
|
| 44 |
]
|
| 45 |
|
| 46 |
[project.optional-dependencies]
|
|
@@ -67,6 +93,13 @@ vlm = [
|
|
| 67 |
"accelerate==1.11.0",
|
| 68 |
]
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
# Configuration for Ruff linter:
|
| 71 |
[tool.ruff]
|
| 72 |
line-length = 88
|
|
|
|
| 2 |
requires = ["setuptools>=61.0", "wheel"]
|
| 3 |
build-backend = "setuptools.build_meta"
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
[project]
|
| 6 |
name = "doc_redaction"
|
| 7 |
version = "1.5.3"
|
| 8 |
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
|
| 9 |
readme = "README.md"
|
| 10 |
+
authors = [
|
| 11 |
+
{ name = "Sean Pedrick-Case", email = "[email protected]" },
|
| 12 |
+
]
|
| 13 |
+
maintainers = [
|
| 14 |
+
{ name = "Sean Pedrick-Case", email = "[email protected]" },
|
| 15 |
+
]
|
| 16 |
+
license = { text = "AGPL-3.0-only" } # This licence type required to use PyMuPDF
|
| 17 |
+
keywords = [
|
| 18 |
+
"redaction",
|
| 19 |
+
"pdf",
|
| 20 |
+
"nlp",
|
| 21 |
+
"documents",
|
| 22 |
+
"document-processing",
|
| 23 |
+
"gradio",
|
| 24 |
+
"pii",
|
| 25 |
+
"pii-detection"
|
| 26 |
+
]
|
| 27 |
+
classifiers = [
|
| 28 |
+
"Development Status :: 5 - Production/Stable",
|
| 29 |
+
"Intended Audience :: Developers",
|
| 30 |
+
"Intended Audience :: Legal Industry",
|
| 31 |
+
"Topic :: Text Processing :: General",
|
| 32 |
+
"Topic :: Security :: Cryptography",
|
| 33 |
+
"Programming Language :: Python :: 3",
|
| 34 |
+
"Programming Language :: Python :: 3.10",
|
| 35 |
+
"Programming Language :: Python :: 3.11",
|
| 36 |
+
"Programming Language :: Python :: 3.12",
|
| 37 |
+
"Programming Language :: Python :: 3.13",
|
| 38 |
+
]
|
| 39 |
requires-python = ">=3.10"
|
| 40 |
dependencies = [
|
| 41 |
"pdfminer.six==20251107",
|
|
|
|
| 49 |
"pikepdf==9.11.0",
|
| 50 |
"pandas==2.3.3",
|
| 51 |
"scikit-learn==1.7.2",
|
| 52 |
+
"spacy==3.8.8",
|
| 53 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
| 54 |
"gradio==5.49.1",
|
| 55 |
+
"boto3==1.40.72",
|
| 56 |
"pyarrow==21.0.0",
|
| 57 |
"openpyxl==3.1.5",
|
| 58 |
"Faker==37.8.0",
|
|
|
|
| 63 |
"python-dotenv==1.0.1",
|
| 64 |
"awslambdaric==3.1.1",
|
| 65 |
"python-docx==1.2.0",
|
| 66 |
+
"polars==1.35.2",
|
| 67 |
"defusedxml==0.7.1",
|
| 68 |
"numpy==2.2.6",
|
| 69 |
+
"spaces==0.42.1",
|
| 70 |
]
|
| 71 |
|
| 72 |
[project.optional-dependencies]
|
|
|
|
| 93 |
"accelerate==1.11.0",
|
| 94 |
]
|
| 95 |
|
| 96 |
+
[project.urls]
|
| 97 |
+
Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
|
| 98 |
+
Repository = "https://github.com/seanpedrick-case/doc_redaction"
|
| 99 |
+
|
| 100 |
+
[project.scripts]
|
| 101 |
+
cli_redact = "cli_redact:main"
|
| 102 |
+
|
| 103 |
# Configuration for Ruff linter:
|
| 104 |
[tool.ruff]
|
| 105 |
line-length = 88
|
requirements.txt
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
# --- Core and data packages ---
|
| 2 |
numpy==2.2.6
|
| 3 |
pandas==2.3.3
|
| 4 |
-
bleach
|
| 5 |
-
polars==1.
|
| 6 |
pyarrow==21.0.0
|
| 7 |
openpyxl==3.1.5
|
| 8 |
-
boto3==1.40.
|
| 9 |
python-dotenv==1.0.1
|
| 10 |
defusedxml==0.7.1
|
| 11 |
Faker==37.8.0
|
|
@@ -35,7 +35,7 @@ awslambdaric==3.1.1
|
|
| 35 |
|
| 36 |
# --- Machine learning / NLP ---
|
| 37 |
scikit-learn==1.7.2
|
| 38 |
-
spacy==3.8.
|
| 39 |
spaczz==0.6.1
|
| 40 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
| 41 |
transformers==4.57.1
|
|
|
|
| 1 |
# --- Core and data packages ---
|
| 2 |
numpy==2.2.6
|
| 3 |
pandas==2.3.3
|
| 4 |
+
bleach==6.3.0
|
| 5 |
+
polars==1.35.2
|
| 6 |
pyarrow==21.0.0
|
| 7 |
openpyxl==3.1.5
|
| 8 |
+
boto3==1.40.72
|
| 9 |
python-dotenv==1.0.1
|
| 10 |
defusedxml==0.7.1
|
| 11 |
Faker==37.8.0
|
|
|
|
| 35 |
|
| 36 |
# --- Machine learning / NLP ---
|
| 37 |
scikit-learn==1.7.2
|
| 38 |
+
spacy==3.8.8
|
| 39 |
spaczz==0.6.1
|
| 40 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
| 41 |
transformers==4.57.1
|
requirements_lightweight.txt
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
# --- Core and data packages ---
|
| 2 |
numpy==2.2.6
|
| 3 |
pandas==2.3.3
|
| 4 |
-
polars==1.
|
| 5 |
bleach==6.3.0
|
| 6 |
pyarrow==21.0.0
|
| 7 |
openpyxl==3.1.5
|
| 8 |
-
boto3==1.40.
|
| 9 |
python-dotenv==1.0.1
|
| 10 |
defusedxml==0.7.1
|
| 11 |
Faker==37.8.0
|
|
@@ -14,7 +14,7 @@ rapidfuzz==3.14.1
|
|
| 14 |
|
| 15 |
# --- Machine learning / NLP ---
|
| 16 |
scikit-learn==1.7.2
|
| 17 |
-
spacy==3.8.
|
| 18 |
spaczz==0.6.1
|
| 19 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
| 20 |
|
|
|
|
| 1 |
# --- Core and data packages ---
|
| 2 |
numpy==2.2.6
|
| 3 |
pandas==2.3.3
|
| 4 |
+
polars==1.35.2
|
| 5 |
bleach==6.3.0
|
| 6 |
pyarrow==21.0.0
|
| 7 |
openpyxl==3.1.5
|
| 8 |
+
boto3==1.40.72
|
| 9 |
python-dotenv==1.0.1
|
| 10 |
defusedxml==0.7.1
|
| 11 |
Faker==37.8.0
|
|
|
|
| 14 |
|
| 15 |
# --- Machine learning / NLP ---
|
| 16 |
scikit-learn==1.7.2
|
| 17 |
+
spacy==3.8.8
|
| 18 |
spaczz==0.6.1
|
| 19 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
| 20 |
|