seanpedrickcase committed on
Commit
2b56cca
·
1 Parent(s): 24196b2

Minor package updates, cleaned up README and pyproject files related to installation. Updated example_config.env

Browse files
README.md CHANGED
@@ -112,7 +112,7 @@ source venv/bin/activate
112
  This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required Spacy models and other packages directly from their URLs.
113
 
114
  ```bash
115
- pip install .[paddle,vlm]
116
  ```
117
 
118
  Alternatively, you can install from the `requirements_lightweight.txt` file:
 
112
  This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required Spacy models and other packages directly from their URLs.
113
 
114
  ```bash
115
+ pip install .
116
  ```
117
 
118
  Alternatively, you can install from the `requirements_lightweight.txt` file:
example_config.env CHANGED
@@ -1,12 +1,27 @@
1
- TESSERACT_FOLDER=tesseract/
2
- POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/
 
 
3
  SHOW_LANGUAGE_SELECTION=True
 
 
 
4
  CHOSEN_LOCAL_OCR_MODEL=tesseract
5
 
6
- SESSION_OUTPUT_FOLDER=False
 
 
 
 
 
 
 
 
 
 
7
  DISPLAY_FILE_NAMES_IN_LOGS=False
8
 
9
- RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions
10
  SAVE_LOGS_TO_DYNAMODB=True
11
  S3_COST_CODES_PATH=cost_codes.csv
12
  SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
@@ -24,3 +39,8 @@ GET_COST_CODES=True
24
  COST_CODES_PATH=config/cost_codes.csv
25
  ENFORCE_COST_CODES=True
26
  DEFAULT_COST_CODE=example_cost_code
 
 
 
 
 
 
1
+ # Rename this file to app_config.env and place it in the folder config/ (i.e. it will be located at app_base_folder/config/app_config.env). The app will then automatically load these variables at startup. See tools/config.py for all the possible config variables you can set, or src/app_settings.qmd for descriptions. Below are some suggested config variables to start with.
2
+
3
+ TESSERACT_FOLDER=tesseract/ # If in a custom folder, not needed if in PATH
4
+ POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/ # If in a custom folder, not needed if in PATH
5
  SHOW_LANGUAGE_SELECTION=True
6
+ SHOW_PADDLE_MODEL_OPTIONS=False
7
+ SHOW_VLM_MODEL_OPTIONS=False
8
+ SHOW_LOCAL_OCR_MODEL_OPTIONS=True
9
  CHOSEN_LOCAL_OCR_MODEL=tesseract
10
 
11
+ SAVE_EXAMPLE_HYBRID_IMAGES=True
12
+ SAVE_PAGE_OCR_VISUALISATIONS=True
13
+ OVERWRITE_EXISTING_OCR_RESULTS=False
14
+ CONVERT_LINE_TO_WORD_LEVEL=False
15
+ LOAD_PADDLE_AT_STARTUP=False
16
+ SAVE_VLM_INPUT_IMAGES=True
17
+ SAVE_WORD_SEGMENTER_OUTPUT_IMAGES=True
18
+ PREPROCESS_LOCAL_OCR_IMAGES=False
19
+ SAVE_PREPROCESS_IMAGES=True
20
+
21
+ SESSION_OUTPUT_FOLDER=False # Save outputs into user session folders
22
  DISPLAY_FILE_NAMES_IN_LOGS=False
23
 
24
+ RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions. You can remove all the environment variables in the following section if you don't want to use them
25
  SAVE_LOGS_TO_DYNAMODB=True
26
  S3_COST_CODES_PATH=cost_codes.csv
27
  SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
 
39
  COST_CODES_PATH=config/cost_codes.csv
40
  ENFORCE_COST_CODES=True
41
  DEFAULT_COST_CODE=example_cost_code
42
+
43
+ CUSTOM_BOX_COLOUR=(128, 128, 128)
44
+ USE_GUI_BOX_COLOURS_FOR_OUTPUTS=False
45
+
46
+
pyproject.toml CHANGED
@@ -2,15 +2,40 @@
2
  requires = ["setuptools>=61.0", "wheel"]
3
  build-backend = "setuptools.build_meta"
4
 
5
- [project.urls]
6
- Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
7
- Repository = "https://github.com/seanpedrick-case/doc_redaction"
8
-
9
  [project]
10
  name = "doc_redaction"
11
  version = "1.5.3"
12
  description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
13
  readme = "README.md"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  requires-python = ">=3.10"
15
  dependencies = [
16
  "pdfminer.six==20251107",
@@ -24,10 +49,10 @@ dependencies = [
24
  "pikepdf==9.11.0",
25
  "pandas==2.3.3",
26
  "scikit-learn==1.7.2",
27
- "spacy==3.8.7",
28
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
29
  "gradio==5.49.1",
30
- "boto3==1.40.69",
31
  "pyarrow==21.0.0",
32
  "openpyxl==3.1.5",
33
  "Faker==37.8.0",
@@ -38,9 +63,10 @@ dependencies = [
38
  "python-dotenv==1.0.1",
39
  "awslambdaric==3.1.1",
40
  "python-docx==1.2.0",
41
- "polars==1.33.1",
42
  "defusedxml==0.7.1",
43
  "numpy==2.2.6",
 
44
  ]
45
 
46
  [project.optional-dependencies]
@@ -67,6 +93,13 @@ vlm = [
67
  "accelerate==1.11.0",
68
  ]
69
 
 
 
 
 
 
 
 
70
  # Configuration for Ruff linter:
71
  [tool.ruff]
72
  line-length = 88
 
2
  requires = ["setuptools>=61.0", "wheel"]
3
  build-backend = "setuptools.build_meta"
4
 
 
 
 
 
5
  [project]
6
  name = "doc_redaction"
7
  version = "1.5.3"
8
  description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
+ authors = [
11
+ { name = "Sean Pedrick-Case", email = "[email protected]" },
12
+ ]
13
+ maintainers = [
14
+ { name = "Sean Pedrick-Case", email = "[email protected]" },
15
+ ]
16
+ license = { text = "AGPL-3.0-only" } # This licence type is required to use PyMuPDF
17
+ keywords = [
18
+ "redaction",
19
+ "pdf",
20
+ "nlp",
21
+ "documents",
22
+ "document-processing",
23
+ "gradio",
24
+ "pii",
25
+ "pii-detection"
26
+ ]
27
+ classifiers = [
28
+ "Development Status :: 5 - Production/Stable",
29
+ "Intended Audience :: Developers",
30
+ "Intended Audience :: Legal Industry",
31
+ "Topic :: Text Processing :: General",
32
+ "Topic :: Security :: Cryptography",
33
+ "Programming Language :: Python :: 3",
34
+ "Programming Language :: Python :: 3.10",
35
+ "Programming Language :: Python :: 3.11",
36
+ "Programming Language :: Python :: 3.12",
37
+ "Programming Language :: Python :: 3.13",
38
+ ]
39
  requires-python = ">=3.10"
40
  dependencies = [
41
  "pdfminer.six==20251107",
 
49
  "pikepdf==9.11.0",
50
  "pandas==2.3.3",
51
  "scikit-learn==1.7.2",
52
+ "spacy==3.8.8",
53
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
54
  "gradio==5.49.1",
55
+ "boto3==1.40.72",
56
  "pyarrow==21.0.0",
57
  "openpyxl==3.1.5",
58
  "Faker==37.8.0",
 
63
  "python-dotenv==1.0.1",
64
  "awslambdaric==3.1.1",
65
  "python-docx==1.2.0",
66
+ "polars==1.35.2",
67
  "defusedxml==0.7.1",
68
  "numpy==2.2.6",
69
+ "spaces==0.42.1",
70
  ]
71
 
72
  [project.optional-dependencies]
 
93
  "accelerate==1.11.0",
94
  ]
95
 
96
+ [project.urls]
97
+ Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
98
+ Repository = "https://github.com/seanpedrick-case/doc_redaction"
99
+
100
+ [project.scripts]
101
+ cli_redact = "cli_redact:main"
102
+
103
  # Configuration for Ruff linter:
104
  [tool.ruff]
105
  line-length = 88
requirements.txt CHANGED
@@ -1,11 +1,11 @@
1
  # --- Core and data packages ---
2
  numpy==2.2.6
3
  pandas==2.3.3
4
- bleach=6.3.0
5
- polars==1.33.1
6
  pyarrow==21.0.0
7
  openpyxl==3.1.5
8
- boto3==1.40.69
9
  python-dotenv==1.0.1
10
  defusedxml==0.7.1
11
  Faker==37.8.0
@@ -35,7 +35,7 @@ awslambdaric==3.1.1
35
 
36
  # --- Machine learning / NLP ---
37
  scikit-learn==1.7.2
38
- spacy==3.8.7
39
  spaczz==0.6.1
40
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
41
  transformers==4.57.1
 
1
  # --- Core and data packages ---
2
  numpy==2.2.6
3
  pandas==2.3.3
4
+ bleach==6.3.0
5
+ polars==1.35.2
6
  pyarrow==21.0.0
7
  openpyxl==3.1.5
8
+ boto3==1.40.72
9
  python-dotenv==1.0.1
10
  defusedxml==0.7.1
11
  Faker==37.8.0
 
35
 
36
  # --- Machine learning / NLP ---
37
  scikit-learn==1.7.2
38
+ spacy==3.8.8
39
  spaczz==0.6.1
40
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
41
  transformers==4.57.1
requirements_lightweight.txt CHANGED
@@ -1,11 +1,11 @@
1
  # --- Core and data packages ---
2
  numpy==2.2.6
3
  pandas==2.3.3
4
- polars==1.33.1
5
  bleach==6.3.0
6
  pyarrow==21.0.0
7
  openpyxl==3.1.5
8
- boto3==1.40.69
9
  python-dotenv==1.0.1
10
  defusedxml==0.7.1
11
  Faker==37.8.0
@@ -14,7 +14,7 @@ rapidfuzz==3.14.1
14
 
15
  # --- Machine learning / NLP ---
16
  scikit-learn==1.7.2
17
- spacy==3.8.7
18
  spaczz==0.6.1
19
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
20
 
 
1
  # --- Core and data packages ---
2
  numpy==2.2.6
3
  pandas==2.3.3
4
+ polars==1.35.2
5
  bleach==6.3.0
6
  pyarrow==21.0.0
7
  openpyxl==3.1.5
8
+ boto3==1.40.72
9
  python-dotenv==1.0.1
10
  defusedxml==0.7.1
11
  Faker==37.8.0
 
14
 
15
  # --- Machine learning / NLP ---
16
  scikit-learn==1.7.2
17
+ spacy==3.8.8
18
  spaczz==0.6.1
19
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
20