Merge pull request #54 from seanpedrick-case/dev
Browse filesAdded search text and easy redact feature, multi-language support, added support for PaddleOCR. Various minor fixes and package updates.
- Dockerfile +20 -6
- README.md +266 -9
- app.py +0 -0
- cdk/cdk_config.py +3 -3
- cdk/cdk_stack.py +15 -0
- cdk/post_cdk_build_quickstart.py +2 -2
- cdk/requirements.txt +3 -3
- example_config.env +26 -0
- index.qmd +1 -1
- load_dynamo_logs.py +45 -8
- pyproject.toml +20 -18
- requirements.txt +19 -17
- src/app_settings.qmd +69 -20
- src/user_guide.qmd +79 -6
- tools/aws_textract.py +9 -191
- tools/cli_redact.py +149 -69
- tools/config.py +29 -5
- tools/custom_image_analyser_engine.py +0 -0
- tools/data_anonymise.py +209 -65
- tools/example_cli_calls.txt +24 -0
- tools/file_conversion.py +195 -70
- tools/file_redaction.py +481 -391
- tools/find_duplicate_pages.py +696 -330
- tools/helper_functions.py +67 -21
- tools/load_spacy_model_custom_recognisers.py +350 -83
- tools/redaction_review.py +426 -86
Dockerfile
CHANGED
@@ -17,7 +17,7 @@ WORKDIR /src
|
|
17 |
|
18 |
COPY requirements.txt .
|
19 |
|
20 |
-
RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
|
21 |
|
22 |
# Add lambda entrypoint and script
|
23 |
COPY lambda_entrypoint.py .
|
@@ -54,7 +54,8 @@ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
|
|
54 |
ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
|
55 |
USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
|
56 |
CONFIG_FOLDER=$APP_HOME/app/config/ \
|
57 |
-
XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
|
|
|
58 |
|
59 |
# Create the base application directory and set its ownership
|
60 |
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
|
@@ -81,13 +82,22 @@ RUN mkdir -p \
|
|
81 |
${APP_HOME}/app/logs \
|
82 |
${APP_HOME}/app/usage \
|
83 |
${APP_HOME}/app/feedback \
|
84 |
-
${APP_HOME}/app/config
|
85 |
|
86 |
-
# Now handle the /tmp and /var/tmp directories and their subdirectories
|
87 |
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
|
88 |
&& chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
|
89 |
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
|
90 |
-
&& chmod 700 ${XDG_CACHE_HOME}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
# Copy installed packages from builder stage
|
93 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
@@ -115,6 +125,9 @@ VOLUME ["/home/user/app/logs"]
|
|
115 |
VOLUME ["/home/user/app/usage"]
|
116 |
VOLUME ["/home/user/app/feedback"]
|
117 |
VOLUME ["/home/user/app/config"]
|
|
|
|
|
|
|
118 |
VOLUME ["/tmp"]
|
119 |
VOLUME ["/var/tmp"]
|
120 |
|
@@ -127,7 +140,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
|
|
127 |
GRADIO_NUM_PORTS=1 \
|
128 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
129 |
GRADIO_SERVER_PORT=7860 \
|
130 |
-
GRADIO_ANALYTICS_ENABLED=False
|
|
|
131 |
|
132 |
ENTRYPOINT ["/entrypoint.sh"]
|
133 |
|
|
|
17 |
|
18 |
COPY requirements.txt .
|
19 |
|
20 |
+
RUN pip install --verbose --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
|
21 |
|
22 |
# Add lambda entrypoint and script
|
23 |
COPY lambda_entrypoint.py .
|
|
|
54 |
ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
|
55 |
USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
|
56 |
CONFIG_FOLDER=$APP_HOME/app/config/ \
|
57 |
+
XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
|
58 |
+
TESSERACT_DATA_FOLDER=/usr/share/tessdata
|
59 |
|
60 |
# Create the base application directory and set its ownership
|
61 |
RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
|
|
|
82 |
${APP_HOME}/app/logs \
|
83 |
${APP_HOME}/app/usage \
|
84 |
${APP_HOME}/app/feedback \
|
85 |
+
${APP_HOME}/app/config
|
86 |
|
87 |
+
# Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
|
88 |
RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
|
89 |
&& chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
|
90 |
&& chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
|
91 |
+
&& chmod 700 ${XDG_CACHE_HOME} \
|
92 |
+
&& mkdir -p ${APP_HOME}/.paddlex \
|
93 |
+
&& chown user:user ${APP_HOME}/.paddlex \
|
94 |
+
&& chmod 755 ${APP_HOME}/.paddlex \
|
95 |
+
&& mkdir -p ${APP_HOME}/.local/share/spacy/data \
|
96 |
+
&& chown user:user ${APP_HOME}/.local/share/spacy/data \
|
97 |
+
&& chmod 755 ${APP_HOME}/.local/share/spacy/data \
|
98 |
+
&& mkdir -p /usr/share/tessdata \
|
99 |
+
&& chown user:user /usr/share/tessdata \
|
100 |
+
&& chmod 755 /usr/share/tessdata
|
101 |
|
102 |
# Copy installed packages from builder stage
|
103 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
|
|
125 |
VOLUME ["/home/user/app/usage"]
|
126 |
VOLUME ["/home/user/app/feedback"]
|
127 |
VOLUME ["/home/user/app/config"]
|
128 |
+
VOLUME ["/home/user/.paddlex"]
|
129 |
+
VOLUME ["/home/user/.local/share/spacy/data"]
|
130 |
+
VOLUME ["/usr/share/tessdata"]
|
131 |
VOLUME ["/tmp"]
|
132 |
VOLUME ["/var/tmp"]
|
133 |
|
|
|
140 |
GRADIO_NUM_PORTS=1 \
|
141 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
142 |
GRADIO_SERVER_PORT=7860 \
|
143 |
+
GRADIO_ANALYTICS_ENABLED=False
|
144 |
+
|
145 |
|
146 |
ENTRYPOINT ["/entrypoint.sh"]
|
147 |
|
README.md
CHANGED
@@ -10,9 +10,9 @@ license: agpl-3.0
|
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
-
version: 0.
|
14 |
|
15 |
-
Redact personally identifiable information (PII) from documents (pdf, images),
|
16 |
|
17 |
To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
|
18 |
|
@@ -20,7 +20,191 @@ After redaction, review suggested redactions on the 'Review redactions' tab. The
|
|
20 |
|
21 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
|
22 |
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
## Table of contents
|
26 |
|
@@ -35,7 +219,7 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
|
|
35 |
- [Redacting only specific pages](#redacting-only-specific-pages)
|
36 |
- [Handwriting and signature redaction](#handwriting-and-signature-redaction)
|
37 |
- [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
|
38 |
-
- [Redacting tabular data files (CSV/XLSX) or copy and pasted text](#redacting-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
|
39 |
|
40 |
See the [advanced user guide here](#advanced-user-guide):
|
41 |
- [Merging redaction review files](#merging-redaction-review-files)
|
@@ -225,9 +409,11 @@ On the 'Review redactions' tab you have a visual interface that allows you to in
|
|
225 |
|
226 |
### Uploading documents for review
|
227 |
|
228 |
-
The top area has a file upload area where you can upload
|
229 |
|
230 |
-
Optionally, you can also upload one of the '..._ocr_output.csv'
|
|
|
|
|
231 |
|
232 |

|
233 |
|
@@ -315,6 +501,77 @@ Once you have filtered the table, or selected a row from the table, you have a f
|
|
315 |
|
316 |
If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
|
317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
### Navigating through the document using the 'Search all extracted text'
|
319 |
|
320 |
The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
|
@@ -327,11 +584,11 @@ You can search through the extracted text by using the search bar just above the
|
|
327 |
|
328 |

|
329 |
|
330 |
-
## Redacting tabular data files (XLSX/CSV) or copy and pasted text
|
331 |
|
332 |
-
###
|
333 |
|
334 |
-
The app can be used to redact tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
|
335 |
|
336 |
To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
|
337 |
|
|
|
10 |
---
|
11 |
# Document redaction
|
12 |
|
13 |
+
version: 1.0.0
|
14 |
|
15 |
+
Redact personally identifiable information (PII) from documents (pdf, images), Word files (.docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
|
16 |
|
17 |
To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
|
18 |
|
|
|
20 |
|
21 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
|
22 |
|
23 |
+
---
|
24 |
+
|
25 |
+
## 🚀 Quick Start - Installation and first run
|
26 |
+
|
27 |
+
Follow these instructions to get the document redaction application running on your local machine.
|
28 |
+
|
29 |
+
### 1. Prerequisites: System Dependencies
|
30 |
+
|
31 |
+
This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
|
32 |
+
|
33 |
+
---
|
34 |
+
|
35 |
+
|
36 |
+
#### **On Windows**
|
37 |
+
|
38 |
+
Installation on Windows requires downloading installers and adding the programs to your system's PATH.
|
39 |
+
|
40 |
+
1. **Install Tesseract OCR:**
|
41 |
+
* Download the installer from the official Tesseract at [UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
|
42 |
+
* Run the installer.
|
43 |
+
* **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
|
44 |
+
|
45 |
+
|
46 |
+
2. **Install Poppler:**
|
47 |
+
* Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-24.02.0-win.zip`).
|
48 |
+
* Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
|
49 |
+
* You must add the `bin` folder from your Poppler installation to your system's PATH environment variable.
|
50 |
+
* Search for "Edit the system environment variables" in the Windows Start Menu and open it.
|
51 |
+
* Click the "Environment Variables..." button.
|
52 |
+
* In the "System variables" section, find and select the `Path` variable, then click "Edit...".
|
53 |
+
* Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
|
54 |
+
* Click OK on all windows to save the changes.
|
55 |
+
|
56 |
+
To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If they both return version information, you have successfully installed the prerequisites.
|
57 |
+
|
58 |
+
---
|
59 |
+
|
60 |
+
#### **On Linux (Debian/Ubuntu)**
|
61 |
+
|
62 |
+
Open your terminal and run the following command to install Tesseract and Poppler:
|
63 |
+
|
64 |
+
```bash
|
65 |
+
sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
|
66 |
+
```
|
67 |
+
|
68 |
+
#### **On Linux (Fedora/CentOS/RHEL)**
|
69 |
+
|
70 |
+
Open your terminal and use the `dnf` or `yum` package manager:
|
71 |
+
|
72 |
+
```bash
|
73 |
+
sudo dnf install -y tesseract poppler-utils
|
74 |
+
```
|
75 |
+
---
|
76 |
+
|
77 |
+
|
78 |
+
### 2. Installation: Code and Python Packages
|
79 |
+
|
80 |
+
Once the system prerequisites are installed, you can set up the Python environment.
|
81 |
+
|
82 |
+
#### Step 1: Clone the Repository
|
83 |
+
|
84 |
+
Open your terminal or Git Bash and clone this repository:
|
85 |
+
```bash
|
86 |
+
git clone https://github.com/seanpedrick-case/doc_redaction.git
|
87 |
+
cd doc_redaction
|
88 |
+
```
|
89 |
+
|
90 |
+
#### Step 2: Create and Activate a Virtual Environment (Recommended)
|
91 |
+
|
92 |
+
It is highly recommended to use a virtual environment to isolate project dependencies and avoid conflicts with other Python projects.
|
93 |
+
|
94 |
+
```bash
|
95 |
+
# Create the virtual environment
|
96 |
+
python -m venv venv
|
97 |
+
|
98 |
+
# Activate it
|
99 |
+
# On Windows:
|
100 |
+
.\venv\Scripts\activate
|
101 |
+
|
102 |
+
# On macOS/Linux:
|
103 |
+
source venv/bin/activate
|
104 |
+
```
|
105 |
+
|
106 |
+
#### Step 3: Install Python Dependencies
|
107 |
+
|
108 |
+
This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required Spacy models and other packages directly from their URLs.
|
109 |
+
|
110 |
+
```bash
|
111 |
+
pip install .
|
112 |
+
```
|
113 |
+
|
114 |
+
Alternatively, you can use the `requirements.txt` file:
|
115 |
+
```bash
|
116 |
+
pip install -r requirements.txt
|
117 |
+
```
|
118 |
+
|
119 |
+
### 3. Run the Application
|
120 |
+
|
121 |
+
With all dependencies installed, you can now start the Gradio application.
|
122 |
+
|
123 |
+
```bash
|
124 |
+
python app.py
|
125 |
+
```
|
126 |
+
|
127 |
+
After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
|
128 |
+
|
129 |
+
Open this URL in your web browser to use the document redaction tool.
|
130 |
+
|
131 |
+
---
|
132 |
+
|
133 |
+
|
134 |
+
### 4. ⚙️ Configuration (Optional)
|
135 |
+
|
136 |
+
You can customise the application's behavior by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behavior, or pointing to local Tesseract/Poppler installations. A full overview of all the potential settings you can modify in the app_config.env file can be seen in tools/config.py, with explanation on the documentation website for [the github repo](https://seanpedrick-case.github.io/doc_redaction/)
|
137 |
+
|
138 |
+
To get started:
|
139 |
+
1. Locate the `example_config.env` file in the root of the project.
|
140 |
+
2. Create a new file named `app_config.env` inside the `config/` directory (i.e., `config/app_config.env`).
|
141 |
+
3. Copy the contents from `example_config.env` into your new `config/app_config.env` file.
|
142 |
+
4. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
|
143 |
+
|
144 |
+
If you do not create this file, the application will run with default settings.
|
145 |
+
|
146 |
+
#### Configuration Breakdown
|
147 |
+
|
148 |
+
Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
|
149 |
+
|
150 |
+
---
|
151 |
+
|
152 |
+
#### **Local & General Settings (No AWS Required)**
|
153 |
+
|
154 |
+
These settings are useful for all users, regardless of whether you are using AWS.
|
155 |
+
|
156 |
+
* `TESSERACT_FOLDER` / `POPPLER_FOLDER`
|
157 |
+
* Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
|
158 |
+
* Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
|
159 |
+
* **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/` `TESSERACT_FOLDER=tesseract/`
|
160 |
+
|
161 |
+
* `SHOW_LANGUAGE_SELECTION=True`
|
162 |
+
* Set to `True` to display a language selection dropdown in the UI for OCR processing.
|
163 |
+
|
164 |
+
* `CHOSEN_LOCAL_OCR_MODEL=tesseract`
|
165 |
+
* Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. "Tesseract" is the default, and is recommended. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction.
|
166 |
+
|
167 |
+
* `SESSION_OUTPUT_FOLDER=False`
|
168 |
+
* If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
|
169 |
+
|
170 |
+
* `DISPLAY_FILE_NAMES_IN_LOGS=False`
|
171 |
+
* For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
|
172 |
+
|
173 |
+
---
|
174 |
+
|
175 |
+
#### **AWS-Specific Settings**
|
176 |
+
|
177 |
+
These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
|
178 |
+
|
179 |
+
* `RUN_AWS_FUNCTIONS=1`
|
180 |
+
* **This is the master switch.** You must set this to `1` to enable any AWS functionality. If it is `0`, all other AWS settings will be ignored.
|
181 |
+
|
182 |
+
* **UI Options:**
|
183 |
+
* `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
|
184 |
+
* `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
|
185 |
+
|
186 |
+
* **Core AWS Configuration:**
|
187 |
+
* `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
|
188 |
+
* `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
|
189 |
+
|
190 |
+
* **AWS Logging:**
|
191 |
+
* `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
|
192 |
+
* `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
|
193 |
+
|
194 |
+
* **Advanced AWS Textract Features:**
|
195 |
+
* `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
|
196 |
+
* `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
|
197 |
+
* `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
|
198 |
+
|
199 |
+
* **Cost Tracking (for internal accounting):**
|
200 |
+
* `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
|
201 |
+
* `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
|
202 |
+
* `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
|
203 |
+
* `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
|
204 |
+
|
205 |
+
Now you have the app installed, what follows is a guide on how to use it for basic and advanced redaction.
|
206 |
+
|
207 |
+
# User Guide
|
208 |
|
209 |
## Table of contents
|
210 |
|
|
|
219 |
- [Redacting only specific pages](#redacting-only-specific-pages)
|
220 |
- [Handwriting and signature redaction](#handwriting-and-signature-redaction)
|
221 |
- [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
|
222 |
+
- [Redacting Word, tabular data files (CSV/XLSX) or copy and pasted text](#redacting-word-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
|
223 |
|
224 |
See the [advanced user guide here](#advanced-user-guide):
|
225 |
- [Merging redaction review files](#merging-redaction-review-files)
|
|
|
409 |
|
410 |
### Uploading documents for review
|
411 |
|
412 |
+
The top area has a file upload area where you can upload files for review. In the left box, upload the original PDF file. Click '1. Upload original PDF'. In the right box, you can upload the '..._review_file.csv' that is produced by the redaction process.
|
413 |
|
414 |
+
Optionally, you can upload a '..._ocr_result_with_words' file here, that will allow you to search through the text and easily [add new redactions based on word search](#searching-and-adding-custom-redactions). You can also upload one of the '..._ocr_output.csv' files here that comes out of a redaction task, so that you can navigate the extracted text from the document. Click the button '2. Upload Review or OCR csv files' to load in these files.
|
415 |
+
|
416 |
+
Now you can review and modify the suggested redactions using the interface described below.
|
417 |
|
418 |

|
419 |
|
|
|
501 |
|
502 |
If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
|
503 |
|
504 |
+
### Searching and Adding Custom Redactions
|
505 |
+
|
506 |
+
After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII (Personally Identifiable Information) detection might have missed. The **"Search text to make new redactions"** tab gives you the power to find and redact any text within your document manually.
|
507 |
+
|
508 |
+
#### How to Use the Search and Redact Feature
|
509 |
+
|
510 |
+
The workflow is designed to be simple: **Search → Select → Redact**.
|
511 |
+
|
512 |
+
---
|
513 |
+
|
514 |
+
#### **Step 1: Search for Text**
|
515 |
+
|
516 |
+
1. Navigate to the **"Search text to make new redactions"** tab.
|
517 |
+
2. The main table will initially be populated with all the text extracted from the document, broken down by word.
|
518 |
+
3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
|
519 |
+
4. Click the **"Search"** button or press Enter.
|
520 |
+
5. The table below will update to show only the rows containing text that matches your search query.
|
521 |
+
|
522 |
+
> **Tip:** You can also filter the results by page number using the **"Page"** dropdown. To clear all filters and see the full text again, click the **"Reset table to original state"** button.
|
523 |
+
|
524 |
+
---
|
525 |
+
|
526 |
+
#### **Step 2: Select and Review a Match**
|
527 |
+
|
528 |
+
When you click on any row in the search results table:
|
529 |
+
|
530 |
+
* The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
|
531 |
+
* The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
|
532 |
+
|
533 |
+
---
|
534 |
+
|
535 |
+
#### **Step 3: Choose Your Redaction Method**
|
536 |
+
|
537 |
+
You have several powerful options for redacting the text you've found:
|
538 |
+
|
539 |
+
* **Redact a Single, Specific Instance:**
|
540 |
+
* Click on the exact row in the table you want to redact.
|
541 |
+
* Click the **`Redact specific text row`** button.
|
542 |
+
* Only that single instance will be redacted.
|
543 |
+
|
544 |
+
* **Redact All Instances of a Word/Phrase:**
|
545 |
+
* Let's say you want to redact the project name "Project Alpha" everywhere it appears.
|
546 |
+
* Find and select one instance of "Project Alpha" in the table.
|
547 |
+
* Click the **`Redact all words with same text as selected row`** button.
|
548 |
+
* The application will find and redact every single occurrence of "Project Alpha" throughout the entire document.
|
549 |
+
|
550 |
+
* **Redact All Current Search Results:**
|
551 |
+
* Perform a search (e.g., for a specific person's name).
|
552 |
+
* If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
|
553 |
+
* This will apply a redaction to all currently visible items in the table in one go.
|
554 |
+
|
555 |
+
---
|
556 |
+
|
557 |
+
#### **Customising Your New Redactions**
|
558 |
+
|
559 |
+
Before you click one of the redact buttons, you can customise the appearance and label of the new redactions under the **"Search options"** accordion:
|
560 |
+
|
561 |
+
* **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
|
562 |
+
* **Colour for labels:** Set a custom colour for the redaction box by providing an RGB value. The format must be three numbers (0-255) in parentheses, for example:
|
563 |
+
* ` (255, 0, 0) ` for Red
|
564 |
+
* ` (0, 0, 0) ` for Black
|
565 |
+
* ` (255, 255, 0) ` for Yellow
|
566 |
+
|
567 |
+
#### **Undoing a Mistake**
|
568 |
+
|
569 |
+
If you make a mistake, you can reverse the last redaction action you performed on this tab.
|
570 |
+
|
571 |
+
* Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all of a certain text, or all search results).
|
572 |
+
|
573 |
+
> **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
|
574 |
+
|
575 |
### Navigating through the document using the 'Search all extracted text'
|
576 |
|
577 |
The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
|
|
|
584 |
|
585 |

|
586 |
|
587 |
+
## Redacting Word, tabular data files (XLSX/CSV) or copy and pasted text
|
588 |
|
589 |
+
### Word or tabular data files (XLSX/CSV)
|
590 |
|
591 |
+
The app can be used to redact Word (.docx), or tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
|
592 |
|
593 |
To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
|
594 |
|
app.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
cdk/cdk_config.py
CHANGED
@@ -213,9 +213,9 @@ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
|
|
213 |
|
214 |
### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
|
215 |
SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'True')
|
216 |
-
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-
|
217 |
-
FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback".lower())
|
218 |
-
USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage".lower())
|
219 |
|
220 |
###
|
221 |
# REDACTION OPTIONS
|
|
|
213 |
|
214 |
### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
|
215 |
SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'True')
|
216 |
+
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-logs".lower())
|
217 |
+
FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback-logs".lower())
|
218 |
+
USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage-logs".lower())
|
219 |
|
220 |
###
|
221 |
# REDACTION OPTIONS
|
cdk/cdk_stack.py
CHANGED
@@ -990,6 +990,21 @@ class CdkStack(Stack):
|
|
990 |
"sourceVolume": epheremal_storage_volume_name,
|
991 |
"containerPath": "/tmp/gradio_tmp",
|
992 |
"readOnly": False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
993 |
}
|
994 |
],
|
995 |
"readonlyRootFilesystem": read_only_file_system,
|
|
|
990 |
"sourceVolume": epheremal_storage_volume_name,
|
991 |
"containerPath": "/tmp/gradio_tmp",
|
992 |
"readOnly": False
|
993 |
+
},
|
994 |
+
{
|
995 |
+
"sourceVolume": epheremal_storage_volume_name,
|
996 |
+
"containerPath": "/home/user/.paddlex",
|
997 |
+
"readOnly": False
|
998 |
+
},
|
999 |
+
{
|
1000 |
+
"sourceVolume": epheremal_storage_volume_name,
|
1001 |
+
"containerPath": "/home/user/.local/share/spacy/data",
|
1002 |
+
"readOnly": False
|
1003 |
+
},
|
1004 |
+
{
|
1005 |
+
"sourceVolume": epheremal_storage_volume_name,
|
1006 |
+
"containerPath": "/usr/share/tessdata",
|
1007 |
+
"readOnly": False
|
1008 |
}
|
1009 |
],
|
1010 |
"readonlyRootFilesystem": read_only_file_system,
|
cdk/post_cdk_build_quickstart.py
CHANGED
@@ -13,10 +13,10 @@ start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
|
|
13 |
# Upload config.env file to S3 bucket
|
14 |
upload_file_to_s3(local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME)
|
15 |
|
16 |
-
total_seconds =
|
17 |
update_interval = 1 # Update every second
|
18 |
|
19 |
-
print("Waiting
|
20 |
|
21 |
# tqdm iterates over a range, and you perform a small sleep in each iteration
|
22 |
for i in tqdm(range(total_seconds), desc="Building container"):
|
|
|
13 |
# Upload config.env file to S3 bucket
|
14 |
upload_file_to_s3(local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME)
|
15 |
|
16 |
+
total_seconds = 660 # 11 minutes
|
17 |
update_interval = 1 # Update every second
|
18 |
|
19 |
+
print("Waiting 11 minutes for the CodeBuild container to build.")
|
20 |
|
21 |
# tqdm iterates over a range, and you perform a small sleep in each iteration
|
22 |
for i in tqdm(range(total_seconds), desc="Building container"):
|
cdk/requirements.txt
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
aws-cdk-lib==2.
|
2 |
-
boto3==1.
|
3 |
-
pandas==2.3.
|
4 |
nodejs==0.1.1
|
5 |
python-dotenv==1.0.1
|
|
|
1 |
+
aws-cdk-lib==2.212.0
|
2 |
+
boto3==1.40.10
|
3 |
+
pandas==2.3.1
|
4 |
nodejs==0.1.1
|
5 |
python-dotenv==1.0.1
|
example_config.env
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TESSERACT_FOLDER=tesseract/
|
2 |
+
POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/
|
3 |
+
SHOW_LANGUAGE_SELECTION=True
|
4 |
+
CHOSEN_LOCAL_OCR_MODEL=tesseract
|
5 |
+
|
6 |
+
SESSION_OUTPUT_FOLDER=False
|
7 |
+
DISPLAY_FILE_NAMES_IN_LOGS=False
|
8 |
+
|
9 |
+
RUN_AWS_FUNCTIONS=1 # Set to 0 if you don't want to run AWS functions
|
10 |
+
SAVE_LOGS_TO_DYNAMODB=True
|
11 |
+
S3_COST_CODES_PATH=cost_codes.csv
|
12 |
+
SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
|
13 |
+
SHOW_AWS_PII_DETECTION_OPTIONS=True
|
14 |
+
AWS_REGION=example-region
|
15 |
+
DOCUMENT_REDACTION_BUCKET=example-bucket
|
16 |
+
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True
|
17 |
+
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output
|
18 |
+
LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True
|
19 |
+
ACCESS_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-access-log
|
20 |
+
USAGE_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-usage
|
21 |
+
FEEDBACK_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-feedback
|
22 |
+
SHOW_COSTS=True
|
23 |
+
GET_COST_CODES=True
|
24 |
+
COST_CODES_PATH=config/cost_codes.csv
|
25 |
+
ENFORCE_COST_CODES=True
|
26 |
+
DEFAULT_COST_CODE=example_cost_code
|
index.qmd
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
title: "Home"
|
3 |
---
|
4 |
|
5 |
-
version: 0.
|
6 |
|
7 |
Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
|
8 |
|
|
|
2 |
title: "Home"
|
3 |
---
|
4 |
|
5 |
+
version: 1.0.0
|
6 |
|
7 |
Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
|
8 |
|
load_dynamo_logs.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import boto3
|
2 |
import csv
|
3 |
from decimal import Decimal
|
|
|
4 |
from boto3.dynamodb.conditions import Key
|
5 |
|
6 |
from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
|
@@ -16,11 +17,26 @@ table = dynamodb.Table(TABLE_NAME)
|
|
16 |
|
17 |
# Helper function to convert Decimal to float or int
|
18 |
def convert_types(item):
|
|
|
19 |
for key, value in item.items():
|
|
|
20 |
if isinstance(value, Decimal):
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Paginated scan
|
26 |
def scan_table():
|
@@ -35,22 +51,43 @@ def scan_table():
|
|
35 |
return items
|
36 |
|
37 |
# Export to CSV
|
38 |
-
|
|
|
39 |
if not items:
|
40 |
print("No items found.")
|
41 |
return
|
42 |
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
writer.writeheader()
|
48 |
|
49 |
for item in items:
|
|
|
|
|
50 |
writer.writerow(convert_types(item))
|
51 |
|
52 |
print(f"Exported {len(items)} items to {output_path}")
|
53 |
|
54 |
# Run export
|
55 |
items = scan_table()
|
56 |
-
export_to_csv(items, CSV_OUTPUT)
|
|
|
1 |
import boto3
|
2 |
import csv
|
3 |
from decimal import Decimal
|
4 |
+
import datetime
|
5 |
from boto3.dynamodb.conditions import Key
|
6 |
|
7 |
from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
|
|
|
17 |
|
18 |
# Helper function to convert Decimal to float or int
|
19 |
def convert_types(item):
|
20 |
+
new_item = {}
|
21 |
for key, value in item.items():
|
22 |
+
# Handle Decimals first
|
23 |
if isinstance(value, Decimal):
|
24 |
+
new_item[key] = int(value) if value % 1 == 0 else float(value)
|
25 |
+
# Handle Strings that might be dates
|
26 |
+
elif isinstance(value, str):
|
27 |
+
try:
|
28 |
+
# Attempt to parse a common ISO 8601 format.
|
29 |
+
# The .replace() handles the 'Z' for Zulu/UTC time.
|
30 |
+
dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
|
31 |
+
# Now that we have a datetime object, format it as desired
|
32 |
+
new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
|
33 |
+
except (ValueError, TypeError):
|
34 |
+
# If it fails to parse, it's just a regular string
|
35 |
+
new_item[key] = value
|
36 |
+
# Handle all other types
|
37 |
+
else:
|
38 |
+
new_item[key] = value
|
39 |
+
return new_item
|
40 |
|
41 |
# Paginated scan
|
42 |
def scan_table():
|
|
|
51 |
return items
|
52 |
|
53 |
# Export to CSV
|
54 |
+
# Export to CSV
|
55 |
+
def export_to_csv(items, output_path, fields_to_drop: list = None):
|
56 |
if not items:
|
57 |
print("No items found.")
|
58 |
return
|
59 |
|
60 |
+
# Use a set for efficient lookup
|
61 |
+
drop_set = set(fields_to_drop or [])
|
62 |
+
|
63 |
+
# Get a comprehensive list of all possible headers from all items
|
64 |
+
all_keys = set()
|
65 |
+
for item in items:
|
66 |
+
all_keys.update(item.keys())
|
67 |
+
|
68 |
+
# Determine the final fieldnames by subtracting the ones to drop
|
69 |
+
fieldnames = sorted(list(all_keys - drop_set))
|
70 |
+
|
71 |
+
print("Final CSV columns will be:", fieldnames)
|
72 |
|
73 |
+
with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
|
74 |
+
# The key fix is here: extrasaction='ignore'
|
75 |
+
# restval='' is also good practice to handle rows that are missing a key
|
76 |
+
writer = csv.DictWriter(
|
77 |
+
csvfile,
|
78 |
+
fieldnames=fieldnames,
|
79 |
+
extrasaction='ignore',
|
80 |
+
restval=''
|
81 |
+
)
|
82 |
writer.writeheader()
|
83 |
|
84 |
for item in items:
|
85 |
+
# The convert_types function can now return the full dict,
|
86 |
+
# and the writer will simply ignore the extra fields.
|
87 |
writer.writerow(convert_types(item))
|
88 |
|
89 |
print(f"Exported {len(items)} items to {output_path}")
|
90 |
|
91 |
# Run export
|
92 |
items = scan_table()
|
93 |
+
export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
|
pyproject.toml
CHANGED
@@ -4,38 +4,40 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
-
version = "0.
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
11 |
|
12 |
dependencies = [
|
13 |
-
"pdfminer.six==
|
14 |
"pdf2image==1.17.0",
|
15 |
-
"pymupdf==1.26.
|
16 |
-
"opencv-python==4.
|
17 |
-
"presidio_analyzer==2.2.
|
18 |
-
"presidio_anonymizer==2.2.
|
19 |
-
"presidio-image-redactor==0.0.
|
20 |
-
"pikepdf==9.
|
21 |
-
"pandas==2.3.
|
22 |
-
"scikit-learn==1.
|
23 |
"spacy==3.8.7",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
-
"gradio==5.
|
27 |
-
"boto3==1.
|
28 |
-
"pyarrow==
|
29 |
"openpyxl==3.1.5",
|
30 |
-
"Faker==
|
31 |
-
"python-levenshtein==0.
|
32 |
"spaczz==0.6.1",
|
33 |
# Direct URL dependency for gradio_image_annotator wheel
|
34 |
"gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
|
35 |
-
"rapidfuzz==3.
|
36 |
"python-dotenv==1.0.1",
|
37 |
-
"
|
38 |
-
"
|
|
|
|
|
39 |
]
|
40 |
|
41 |
[project.urls]
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
+
version = "1.0.0"
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
11 |
|
12 |
dependencies = [
|
13 |
+
"pdfminer.six==20250506",
|
14 |
"pdf2image==1.17.0",
|
15 |
+
"pymupdf==1.26.3",
|
16 |
+
"opencv-python==4.12.0.88",
|
17 |
+
"presidio_analyzer==2.2.359",
|
18 |
+
"presidio_anonymizer==2.2.359",
|
19 |
+
"presidio-image-redactor==0.0.57",
|
20 |
+
"pikepdf==9.10.2",
|
21 |
+
"pandas==2.3.1",
|
22 |
+
"scikit-learn==1.7.1",
|
23 |
"spacy==3.8.7",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
+
"gradio==5.43.1",
|
27 |
+
"boto3==1.40.10",
|
28 |
+
"pyarrow==21.0.0",
|
29 |
"openpyxl==3.1.5",
|
30 |
+
"Faker==37.5.3",
|
31 |
+
"python-levenshtein==0.27.1",
|
32 |
"spaczz==0.6.1",
|
33 |
# Direct URL dependency for gradio_image_annotator wheel
|
34 |
"gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
|
35 |
+
"rapidfuzz==3.13.0",
|
36 |
"python-dotenv==1.0.1",
|
37 |
+
"awslambdaric==3.1.1",
|
38 |
+
"python-docx==1.2.0",
|
39 |
+
"paddlepaddle==3.1.0",
|
40 |
+
"paddleocr==3.1.1"
|
41 |
]
|
42 |
|
43 |
[project.urls]
|
requirements.txt
CHANGED
@@ -1,28 +1,30 @@
|
|
1 |
-
pdfminer.six==
|
2 |
pdf2image==1.17.0
|
3 |
-
pymupdf==1.26.
|
4 |
-
opencv-python==4.
|
5 |
-
presidio_analyzer==2.2.
|
6 |
-
presidio_anonymizer==2.2.
|
7 |
-
presidio-image-redactor==0.0.
|
8 |
-
pikepdf==9.
|
9 |
-
pandas==2.3.
|
10 |
-
scikit-learn==1.
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
-
gradio==5.
|
14 |
-
boto3==1.
|
15 |
-
pyarrow==
|
16 |
openpyxl==3.1.5
|
17 |
-
Faker==
|
18 |
-
python-levenshtein==0.
|
19 |
spaczz==0.6.1
|
20 |
# The following version
|
21 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
-
rapidfuzz==3.
|
23 |
python-dotenv==1.0.1
|
24 |
-
|
25 |
-
|
|
|
|
|
26 |
|
27 |
|
28 |
|
|
|
1 |
+
pdfminer.six==20250506
|
2 |
pdf2image==1.17.0
|
3 |
+
pymupdf==1.26.3
|
4 |
+
opencv-python==4.12.0.88
|
5 |
+
presidio_analyzer==2.2.359
|
6 |
+
presidio_anonymizer==2.2.359
|
7 |
+
presidio-image-redactor==0.0.57
|
8 |
+
pikepdf==9.10.2
|
9 |
+
pandas==2.3.1
|
10 |
+
scikit-learn==1.7.1
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
+
gradio==5.43.1
|
14 |
+
boto3==1.40.10
|
15 |
+
pyarrow==21.0.0
|
16 |
openpyxl==3.1.5
|
17 |
+
Faker==37.5.3
|
18 |
+
python-levenshtein==0.27.1
|
19 |
spaczz==0.6.1
|
20 |
# The following version
|
21 |
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
+
rapidfuzz==3.13.0
|
23 |
python-dotenv==1.0.1
|
24 |
+
awslambdaric==3.1.1
|
25 |
+
python-docx==1.2.0
|
26 |
+
paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
|
27 |
+
paddleocr==3.1.1
|
28 |
|
29 |
|
30 |
|
src/app_settings.qmd
CHANGED
@@ -115,6 +115,16 @@ Configuration for input and output file handling.
|
|
115 |
* **Default Value:** `'input/'`
|
116 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
## Logging Options
|
119 |
|
120 |
Settings for configuring application logging, including log formats and storage locations.
|
@@ -161,7 +171,7 @@ Settings for configuring application logging, including log formats and storage
|
|
161 |
|
162 |
* **`CSV_USAGE_LOG_HEADERS`**
|
163 |
* **Description:** Defines custom headers for CSV usage logs.
|
164 |
-
* **Default Value:** A predefined list of header names. Refer to `
|
165 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
166 |
|
167 |
* **`SAVE_LOGS_TO_DYNAMODB`**
|
@@ -214,12 +224,17 @@ Settings for configuring application logging, including log formats and storage
|
|
214 |
Configurations related to the text redaction process, including PII detection models and external tool paths.
|
215 |
|
216 |
* **`TESSERACT_FOLDER`**
|
217 |
-
* **Description:** Path to the local Tesseract OCR installation folder. Only required if Tesseract is not in
|
218 |
* **Default Value:** `""` (empty string)
|
219 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
220 |
|
|
|
|
|
|
|
|
|
|
|
221 |
* **`POPPLER_FOLDER`**
|
222 |
-
* **Description:** Path to the local Poppler installation's `bin` folder.
|
223 |
* **Default Value:** `""` (empty string)
|
224 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
225 |
|
@@ -283,24 +298,34 @@ Configurations related to the text redaction process, including PII detection mo
|
|
283 |
* **Default Value:** Value of `AWS_PII_OPTION` if `SHOW_AWS_PII_DETECTION_OPTIONS` is True, else value of `LOCAL_PII_OPTION`.
|
284 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the PII detection option display names.
|
285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
* **`CHOSEN_COMPREHEND_ENTITIES`**
|
287 |
* **Description:** A list of AWS Comprehend PII entity types to be redacted when using AWS Comprehend.
|
288 |
-
* **Default Value:** A predefined list of entity types. Refer to `
|
289 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
|
290 |
|
291 |
* **`FULL_COMPREHEND_ENTITY_LIST`**
|
292 |
* **Description:** The complete list of PII entity types supported by AWS Comprehend that can be selected for redaction.
|
293 |
-
* **Default Value:** A predefined list of entity types. Refer to `
|
294 |
* **Configuration:** This is typically an informational variable reflecting the capabilities of AWS Comprehend and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_COMPREHEND_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
|
295 |
|
296 |
* **`CHOSEN_REDACT_ENTITIES`**
|
297 |
* **Description:** A list of local PII entity types to be redacted when using the local PII detection model.
|
298 |
-
* **Default Value:** A predefined list of entity types. Refer to `
|
299 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
|
300 |
|
301 |
* **`FULL_ENTITY_LIST`**
|
302 |
* **Description:** The complete list of PII entity types supported by the local PII detection model that can be selected for redaction.
|
303 |
-
* **Default Value:** A predefined list of entity types. Refer to `
|
304 |
* **Configuration:** This is typically an informational variable reflecting the capabilities of the local model and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_REDACT_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
|
305 |
|
306 |
* **`PAGE_BREAK_VALUE`**
|
@@ -309,20 +334,15 @@ Configurations related to the text redaction process, including PII detection mo
|
|
309 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
310 |
|
311 |
* **`MAX_TIME_VALUE`**
|
312 |
-
* **Description:** Specifies
|
313 |
* **Default Value:** `'999999'`
|
314 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
315 |
|
316 |
* **`CUSTOM_BOX_COLOUR`**
|
317 |
-
* **Description:** Allows specifying a custom color for the redaction boxes drawn on documents
|
318 |
* **Default Value:** `""` (empty string)
|
319 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
320 |
|
321 |
-
* **`REDACTION_LANGUAGE`**
|
322 |
-
* **Description:** Specifies the language for redaction processing. Currently, only "en" (English) is supported.
|
323 |
-
* **Default Value:** `"en"`
|
324 |
-
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
325 |
-
|
326 |
* **`RETURN_PDF_END_OF_REDACTION`**
|
327 |
* **Description:** If set to `'True'`, the application will return a PDF document at the end of the redaction task.
|
328 |
* **Default Value:** `"True"`
|
@@ -333,13 +353,42 @@ Configurations related to the text redaction process, including PII detection mo
|
|
333 |
* **Default Value:** `"False"`
|
334 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
335 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
## App Run Options
|
337 |
|
338 |
General runtime configurations for the application.
|
339 |
|
340 |
* **`TLDEXTRACT_CACHE`**
|
341 |
-
* **Description:** Path to the cache
|
342 |
-
* **Default Value:** `'tld
|
343 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
344 |
|
345 |
* **`COGNITO_AUTH`**
|
@@ -436,7 +485,7 @@ Settings related to tracking and applying cost codes for application usage.
|
|
436 |
Configurations for features related to processing whole documents via APIs, particularly AWS Textract for large documents.
|
437 |
|
438 |
* **`SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS`**
|
439 |
-
* **Description:** Controls whether UI options for whole document Textract calls are displayed.
|
440 |
* **Default Value:** `'False'`
|
441 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
442 |
|
@@ -461,12 +510,12 @@ Configurations for features related to processing whole documents via APIs, part
|
|
461 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
462 |
|
463 |
* **`TEXTRACT_JOBS_S3_LOC`**
|
464 |
-
* **Description:** The S3 subfolder (within
|
465 |
* **Default Value:** `'output'`
|
466 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
467 |
|
468 |
* **`TEXTRACT_JOBS_S3_INPUT_LOC`**
|
469 |
-
* **Description:** The S3 subfolder (within
|
470 |
* **Default Value:** `'input'`
|
471 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
472 |
|
@@ -478,4 +527,4 @@ Configurations for features related to processing whole documents via APIs, part
|
|
478 |
* **`DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS`**
|
479 |
* **Description:** Specifies the number of past days for which to display whole document Textract jobs in the UI.
|
480 |
* **Default Value:** `'7'`
|
481 |
-
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
|
|
115 |
* **Default Value:** `'input/'`
|
116 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
117 |
|
118 |
+
* **`GRADIO_TEMP_DIR`**
|
119 |
+
* **Description:** Defines the path for Gradio's temporary file storage.
|
120 |
+
* **Default Value:** `'tmp/gradio_tmp/'`
|
121 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
122 |
+
|
123 |
+
* **`MPLCONFIGDIR`**
|
124 |
+
* **Description:** Specifies the cache directory for the Matplotlib library, which is used for plotting and image handling.
|
125 |
+
* **Default Value:** `'tmp/matplotlib_cache/'`
|
126 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
127 |
+
|
128 |
## Logging Options
|
129 |
|
130 |
Settings for configuring application logging, including log formats and storage locations.
|
|
|
171 |
|
172 |
* **`CSV_USAGE_LOG_HEADERS`**
|
173 |
* **Description:** Defines custom headers for CSV usage logs.
|
174 |
+
* **Default Value:** A predefined list of header names. Refer to `config.py` for the complete list.
|
175 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
176 |
|
177 |
* **`SAVE_LOGS_TO_DYNAMODB`**
|
|
|
224 |
Configurations related to the text redaction process, including PII detection models and external tool paths.
|
225 |
|
226 |
* **`TESSERACT_FOLDER`**
|
227 |
+
* **Description:** Path to the local Tesseract OCR installation folder. Only required if Tesseract is not in the system's PATH, or when running a packaged executable (e.g., via PyInstaller).
|
228 |
* **Default Value:** `""` (empty string)
|
229 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
230 |
|
231 |
+
* **`TESSERACT_DATA_FOLDER`**
|
232 |
+
* **Description:** Path to the Tesseract trained data files (e.g., `tessdata`).
|
233 |
+
* **Default Value:** `"/usr/share/tessdata"`
|
234 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
235 |
+
|
236 |
* **`POPPLER_FOLDER`**
|
237 |
+
* **Description:** Path to the local Poppler installation's `bin` folder. Poppler is used for PDF processing. Only required if Poppler is not in the system's PATH.
|
238 |
* **Default Value:** `""` (empty string)
|
239 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
240 |
|
|
|
298 |
* **Default Value:** Value of `AWS_PII_OPTION` if `SHOW_AWS_PII_DETECTION_OPTIONS` is True, else value of `LOCAL_PII_OPTION`.
|
299 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the PII detection option display names.
|
300 |
|
301 |
+
* **`CHOSEN_LOCAL_OCR_MODEL`**
|
302 |
+
* **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid"`. "paddle" is effective for line extraction but not word-level redaction. "hybrid" uses Tesseract first, then PaddleOCR for low-confidence words.
|
303 |
+
* **Default Value:** `"tesseract"`
|
304 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
305 |
+
|
306 |
+
* **`PREPROCESS_LOCAL_OCR_IMAGES`**
|
307 |
+
* **Description:** If set to `"True"`, images will be preprocessed (e.g., deskewed, contrast adjusted) before being sent to the local OCR engine. This can sometimes yield worse results on clean scans.
|
308 |
+
* **Default Value:** `"False"`
|
309 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
310 |
+
|
311 |
* **`CHOSEN_COMPREHEND_ENTITIES`**
|
312 |
* **Description:** A list of AWS Comprehend PII entity types to be redacted when using AWS Comprehend.
|
313 |
+
* **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
|
314 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
|
315 |
|
316 |
* **`FULL_COMPREHEND_ENTITY_LIST`**
|
317 |
* **Description:** The complete list of PII entity types supported by AWS Comprehend that can be selected for redaction.
|
318 |
+
* **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
|
319 |
* **Configuration:** This is typically an informational variable reflecting the capabilities of AWS Comprehend and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_COMPREHEND_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
|
320 |
|
321 |
* **`CHOSEN_REDACT_ENTITIES`**
|
322 |
* **Description:** A list of local PII entity types to be redacted when using the local PII detection model.
|
323 |
+
* **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
|
324 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.
|
325 |
|
326 |
* **`FULL_ENTITY_LIST`**
|
327 |
* **Description:** The complete list of PII entity types supported by the local PII detection model that can be selected for redaction.
|
328 |
+
* **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
|
329 |
* **Configuration:** This is typically an informational variable reflecting the capabilities of the local model and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_REDACT_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.
|
330 |
|
331 |
* **`PAGE_BREAK_VALUE`**
|
|
|
334 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
335 |
|
336 |
* **`MAX_TIME_VALUE`**
|
337 |
+
* **Description:** Specifies a maximum time value for long-running processes.
|
338 |
* **Default Value:** `'999999'`
|
339 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
340 |
|
341 |
* **`CUSTOM_BOX_COLOUR`**
|
342 |
+
* **Description:** Allows specifying a custom color for the redaction boxes drawn on documents. Only `"grey"` is currently supported as a custom value. If empty, a default color is used.
|
343 |
* **Default Value:** `""` (empty string)
|
344 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
345 |
|
|
|
|
|
|
|
|
|
|
|
346 |
* **`RETURN_PDF_END_OF_REDACTION`**
|
347 |
* **Description:** If set to `'True'`, the application will return a PDF document at the end of the redaction task.
|
348 |
* **Default Value:** `"True"`
|
|
|
353 |
* **Default Value:** `"False"`
|
354 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
355 |
|
356 |
+
## Language Options
|
357 |
+
|
358 |
+
Settings for multi-language support in OCR and PII detection.
|
359 |
+
|
360 |
+
* **`SHOW_LANGUAGE_SELECTION`**
|
361 |
+
* **Description:** If set to `"True"`, a dropdown menu for language selection will be visible in the user interface.
|
362 |
+
* **Default Value:** `"False"`
|
363 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
364 |
+
|
365 |
+
* **`DEFAULT_LANGUAGE_FULL_NAME`**
|
366 |
+
* **Description:** The default language's full name (e.g., "english") to be displayed in the UI.
|
367 |
+
* **Default Value:** `"english"`
|
368 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
369 |
+
|
370 |
+
* **`DEFAULT_LANGUAGE`**
|
371 |
+
* **Description:** The default language's short code (e.g., "en") used by the backend engines. Ensure the corresponding Tesseract/PaddleOCR language packs are installed.
|
372 |
+
* **Default Value:** `"en"`
|
373 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
374 |
+
|
375 |
+
* **`MAPPED_LANGUAGE_CHOICES`**
|
376 |
+
* **Description:** A string list of full language names (e.g., 'english', 'french') presented to the user in the language dropdown.
|
377 |
+
* **Default Value:** A predefined list. See `config.py`.
|
378 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
379 |
+
|
380 |
+
* **`LANGUAGE_CHOICES`**
|
381 |
+
* **Description:** A string list of short language codes (e.g., 'en', 'fr') that correspond to `MAPPED_LANGUAGE_CHOICES`. This is what the backend uses.
|
382 |
+
* **Default Value:** A predefined list. See `config.py`.
|
383 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
384 |
+
|
385 |
## App Run Options
|
386 |
|
387 |
General runtime configurations for the application.
|
388 |
|
389 |
* **`TLDEXTRACT_CACHE`**
|
390 |
+
* **Description:** Path to the cache directory used by the `tldextract` library, which helps in accurately extracting top-level domains (TLDs) from URLs.
|
391 |
+
* **Default Value:** `'tmp/tld/'`
|
392 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
393 |
|
394 |
* **`COGNITO_AUTH`**
|
|
|
485 |
Configurations for features related to processing whole documents via APIs, particularly AWS Textract for large documents.
|
486 |
|
487 |
* **`SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS`**
|
488 |
+
* **Description:** Controls whether UI options for whole document Textract calls are displayed.
|
489 |
* **Default Value:** `'False'`
|
490 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
491 |
|
|
|
510 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
511 |
|
512 |
* **`TEXTRACT_JOBS_S3_LOC`**
|
513 |
+
* **Description:** The S3 subfolder (within the main redaction bucket) where Textract job data (output) is stored.
|
514 |
* **Default Value:** `'output'`
|
515 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
516 |
|
517 |
* **`TEXTRACT_JOBS_S3_INPUT_LOC`**
|
518 |
+
* **Description:** The S3 subfolder (within the main redaction bucket) where Textract job input is stored.
|
519 |
* **Default Value:** `'input'`
|
520 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
521 |
|
|
|
527 |
* **`DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS`**
|
528 |
* **Description:** Specifies the number of past days for which to display whole document Textract jobs in the UI.
|
529 |
* **Default Value:** `'7'`
|
530 |
+
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
src/user_guide.qmd
CHANGED
@@ -20,7 +20,7 @@ format:
|
|
20 |
- [Redacting only specific pages](#redacting-only-specific-pages)
|
21 |
- [Handwriting and signature redaction](#handwriting-and-signature-redaction)
|
22 |
- [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
|
23 |
-
- [Redacting tabular data files (CSV/XLSX) or copy and pasted text](#redacting-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
|
24 |
|
25 |
See the [advanced user guide here](#advanced-user-guide):
|
26 |
- [Merging redaction review files](#merging-redaction-review-files)
|
@@ -210,9 +210,11 @@ On the 'Review redactions' tab you have a visual interface that allows you to in
|
|
210 |
|
211 |
### Uploading documents for review
|
212 |
|
213 |
-
The top area has a file upload area where you can upload
|
214 |
|
215 |
-
Optionally, you can also upload one of the '..._ocr_output.csv'
|
|
|
|
|
216 |
|
217 |

|
218 |
|
@@ -300,6 +302,77 @@ Once you have filtered the table, or selected a row from the table, you have a f
|
|
300 |
|
301 |
If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
|
302 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
### Navigating through the document using the 'Search all extracted text'
|
304 |
|
305 |
The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
|
@@ -312,11 +385,11 @@ You can search through the extracted text by using the search bar just above the
|
|
312 |
|
313 |

|
314 |
|
315 |
-
## Redacting tabular data files (XLSX/CSV) or copy and pasted text
|
316 |
|
317 |
-
###
|
318 |
|
319 |
-
The app can be used to redact tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
|
320 |
|
321 |
To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
|
322 |
|
|
|
20 |
- [Redacting only specific pages](#redacting-only-specific-pages)
|
21 |
- [Handwriting and signature redaction](#handwriting-and-signature-redaction)
|
22 |
- [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
|
23 |
+
- [Redacting Word, tabular data files (CSV/XLSX) or copy and pasted text](#redacting-word-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
|
24 |
|
25 |
See the [advanced user guide here](#advanced-user-guide):
|
26 |
- [Merging redaction review files](#merging-redaction-review-files)
|
|
|
210 |
|
211 |
### Uploading documents for review
|
212 |
|
213 |
+
The top area has a file upload area where you can upload files for review. In the left box, upload the original PDF file. Click '1. Upload original PDF'. In the right box, you can upload the '..._review_file.csv' that is produced by the redaction process.
|
214 |
|
215 |
+
Optionally, you can upload a '..._ocr_result_with_words' file here, which will allow you to search through the text and easily [add new redactions based on word search](#searching-and-adding-custom-redactions). You can also upload one of the '..._ocr_output.csv' files here that come out of a redaction task, so that you can navigate the extracted text from the document. Click the button '2. Upload Review or OCR csv files' to load in these files.
|
216 |
+
|
217 |
+
Now you can review and modify the suggested redactions using the interface described below.
|
218 |
|
219 |

|
220 |
|
|
|
302 |
|
303 |
If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
|
304 |
|
305 |
+
### Searching and Adding Custom Redactions
|
306 |
+
|
307 |
+
After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII (Personally Identifiable Information) detection might have missed. The **"Search text to make new redactions"** tab gives you the power to find and redact any text within your document manually.
|
308 |
+
|
309 |
+
#### How to Use the Search and Redact Feature
|
310 |
+
|
311 |
+
The workflow is designed to be simple: **Search → Select → Redact**.
|
312 |
+
|
313 |
+
---
|
314 |
+
|
315 |
+
#### **Step 1: Search for Text**
|
316 |
+
|
317 |
+
1. Navigate to the **"Search text to make new redactions"** tab.
|
318 |
+
2. The main table will initially be populated with all the text extracted from the document, broken down by word.
|
319 |
+
3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
|
320 |
+
4. Click the **"Search"** button or press Enter.
|
321 |
+
5. The table below will update to show only the rows containing text that matches your search query.
|
322 |
+
|
323 |
+
> **Tip:** You can also filter the results by page number using the **"Page"** dropdown. To clear all filters and see the full text again, click the **"Reset table to original state"** button.
|
324 |
+
|
325 |
+
---
|
326 |
+
|
327 |
+
#### **Step 2: Select and Review a Match**
|
328 |
+
|
329 |
+
When you click on any row in the search results table:
|
330 |
+
|
331 |
+
* The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
|
332 |
+
* The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
|
333 |
+
|
334 |
+
---
|
335 |
+
|
336 |
+
#### **Step 3: Choose Your Redaction Method**
|
337 |
+
|
338 |
+
You have several powerful options for redacting the text you've found:
|
339 |
+
|
340 |
+
* **Redact a Single, Specific Instance:**
|
341 |
+
* Click on the exact row in the table you want to redact.
|
342 |
+
* Click the **`Redact specific text row`** button.
|
343 |
+
* Only that single instance will be redacted.
|
344 |
+
|
345 |
+
* **Redact All Instances of a Word/Phrase:**
|
346 |
+
* Let's say you want to redact the project name "Project Alpha" everywhere it appears.
|
347 |
+
* Find and select one instance of "Project Alpha" in the table.
|
348 |
+
* Click the **`Redact all words with same text as selected row`** button.
|
349 |
+
* The application will find and redact every single occurrence of "Project Alpha" throughout the entire document.
|
350 |
+
|
351 |
+
* **Redact All Current Search Results:**
|
352 |
+
* Perform a search (e.g., for a specific person's name).
|
353 |
+
* If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
|
354 |
+
* This will apply a redaction to all currently visible items in the table in one go.
|
355 |
+
|
356 |
+
---
|
357 |
+
|
358 |
+
#### **Customising Your New Redactions**
|
359 |
+
|
360 |
+
Before you click one of the redact buttons, you can customise the appearance and label of the new redactions under the **"Search options"** accordion:
|
361 |
+
|
362 |
+
* **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
|
363 |
+
* **Colour for labels:** Set a custom colour for the redaction box by providing an RGB value. The format must be three numbers (0-255) in parentheses, for example:
|
364 |
+
* ` (255, 0, 0) ` for Red
|
365 |
+
* ` (0, 0, 0) ` for Black
|
366 |
+
* ` (255, 255, 0) ` for Yellow
|
367 |
+
|
368 |
+
#### **Undoing a Mistake**
|
369 |
+
|
370 |
+
If you make a mistake, you can reverse the last redaction action you performed on this tab.
|
371 |
+
|
372 |
+
* Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all of a certain text, or all search results).
|
373 |
+
|
374 |
+
> **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
|
375 |
+
|
376 |
### Navigating through the document using the 'Search all extracted text'
|
377 |
|
378 |
The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
|
|
|
385 |
|
386 |

|
387 |
|
388 |
+
## Redacting Word, tabular data files (XLSX/CSV) or copy and pasted text
|
389 |
|
390 |
+
### Word or tabular data files (XLSX/CSV)
|
391 |
|
392 |
+
The app can be used to redact Word (.docx) or tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
|
393 |
|
394 |
To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
|
395 |
|
tools/aws_textract.py
CHANGED
@@ -3,7 +3,6 @@ from typing import List
|
|
3 |
import io
|
4 |
import os
|
5 |
import json
|
6 |
-
from collections import defaultdict
|
7 |
import pikepdf
|
8 |
import time
|
9 |
import pandas as pd
|
@@ -13,17 +12,12 @@ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
|
|
13 |
def extract_textract_metadata(response:object):
|
14 |
"""Extracts metadata from an AWS Textract response."""
|
15 |
|
16 |
-
#print("Document metadata:", response['DocumentMetadata'])
|
17 |
-
|
18 |
request_id = response['ResponseMetadata']['RequestId']
|
19 |
pages = response['DocumentMetadata']['Pages']
|
20 |
-
#number_of_pages = response['DocumentMetadata']['NumberOfPages']
|
21 |
|
22 |
return str({
|
23 |
'RequestId': request_id,
|
24 |
'Pages': pages
|
25 |
-
#,
|
26 |
-
#'NumberOfPages': number_of_pages
|
27 |
})
|
28 |
|
29 |
def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
|
@@ -54,7 +48,6 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
|
|
54 |
time.sleep(3)
|
55 |
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
56 |
else:
|
57 |
-
#print("Analysing document without signature detection")
|
58 |
# Call detect_document_text to extract plain text
|
59 |
try:
|
60 |
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
@@ -74,12 +67,8 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
|
|
74 |
'data': response
|
75 |
}
|
76 |
|
77 |
-
#print("response:", response)
|
78 |
-
|
79 |
request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
|
80 |
|
81 |
-
#print("request_metadata:", request_metadata)
|
82 |
-
|
83 |
# Return a list containing the wrapped response and the metadata
|
84 |
return wrapped_response, request_metadata # Return as a list to match the desired structure
|
85 |
|
@@ -103,179 +92,8 @@ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
|
|
103 |
# Now you can use the `pdf_bytes` to convert it to an image or further process
|
104 |
buffer.close()
|
105 |
|
106 |
-
#images = convert_from_bytes(pdf_bytes)
|
107 |
-
#image = images[0]
|
108 |
-
|
109 |
return pdf_bytes
|
110 |
|
111 |
-
# def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
|
112 |
-
# '''
|
113 |
-
# Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
|
114 |
-
# '''
|
115 |
-
# all_ocr_results = []
|
116 |
-
# signature_or_handwriting_recogniser_results = []
|
117 |
-
# signature_recogniser_results = []
|
118 |
-
# handwriting_recogniser_results = []
|
119 |
-
# signatures = []
|
120 |
-
# handwriting = []
|
121 |
-
# ocr_results_with_words = {}
|
122 |
-
# text_block={}
|
123 |
-
|
124 |
-
# i = 1
|
125 |
-
|
126 |
-
# # Assuming json_data is structured as a dictionary with a "pages" key
|
127 |
-
# #if "pages" in json_data:
|
128 |
-
# # Find the specific page data
|
129 |
-
# page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
|
130 |
-
|
131 |
-
# #print("page_json_data:", page_json_data)
|
132 |
-
|
133 |
-
# if "Blocks" in page_json_data:
|
134 |
-
# # Access the data for the specific page
|
135 |
-
# text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
|
136 |
-
# # This is a new page
|
137 |
-
# elif "page_no" in page_json_data:
|
138 |
-
# text_blocks = page_json_data["data"]["Blocks"]
|
139 |
-
# else: text_blocks = []
|
140 |
-
|
141 |
-
# is_signature = False
|
142 |
-
# is_handwriting = False
|
143 |
-
|
144 |
-
# for text_block in text_blocks:
|
145 |
-
|
146 |
-
# if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
|
147 |
-
|
148 |
-
# # Extract text and bounding box for the line
|
149 |
-
# line_bbox = text_block["Geometry"]["BoundingBox"]
|
150 |
-
# line_left = int(line_bbox["Left"] * page_width)
|
151 |
-
# line_top = int(line_bbox["Top"] * page_height)
|
152 |
-
# line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
|
153 |
-
# line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)
|
154 |
-
|
155 |
-
# width_abs = int(line_bbox["Width"] * page_width)
|
156 |
-
# height_abs = int(line_bbox["Height"] * page_height)
|
157 |
-
|
158 |
-
# if text_block['BlockType'] == 'LINE':
|
159 |
-
|
160 |
-
# # Extract text and bounding box for the line
|
161 |
-
# line_text = text_block.get('Text', '')
|
162 |
-
# words = []
|
163 |
-
# current_line_handwriting_results = [] # Track handwriting results for this line
|
164 |
-
|
165 |
-
# if 'Relationships' in text_block:
|
166 |
-
# for relationship in text_block['Relationships']:
|
167 |
-
# if relationship['Type'] == 'CHILD':
|
168 |
-
# for child_id in relationship['Ids']:
|
169 |
-
# child_block = next((block for block in text_blocks if block['Id'] == child_id), None)
|
170 |
-
# if child_block and child_block['BlockType'] == 'WORD':
|
171 |
-
# word_text = child_block.get('Text', '')
|
172 |
-
# word_bbox = child_block["Geometry"]["BoundingBox"]
|
173 |
-
# confidence = child_block.get('Confidence','')
|
174 |
-
# word_left = int(word_bbox["Left"] * page_width)
|
175 |
-
# word_top = int(word_bbox["Top"] * page_height)
|
176 |
-
# word_right = int((word_bbox["Left"] + word_bbox["Width"]) * page_width)
|
177 |
-
# word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)
|
178 |
-
|
179 |
-
# # Extract BoundingBox details
|
180 |
-
# word_width = word_bbox["Width"]
|
181 |
-
# word_height = word_bbox["Height"]
|
182 |
-
|
183 |
-
# # Convert proportional coordinates to absolute coordinates
|
184 |
-
# word_width_abs = int(word_width * page_width)
|
185 |
-
# word_height_abs = int(word_height * page_height)
|
186 |
-
|
187 |
-
# words.append({
|
188 |
-
# 'text': word_text,
|
189 |
-
# 'bounding_box': (word_left, word_top, word_right, word_bottom)
|
190 |
-
# })
|
191 |
-
# # Check for handwriting
|
192 |
-
# text_type = child_block.get("TextType", '')
|
193 |
-
|
194 |
-
# if text_type == "HANDWRITING":
|
195 |
-
# is_handwriting = True
|
196 |
-
# entity_name = "HANDWRITING"
|
197 |
-
# word_end = len(word_text)
|
198 |
-
|
199 |
-
# recogniser_result = CustomImageRecognizerResult(
|
200 |
-
# entity_type=entity_name,
|
201 |
-
# text=word_text,
|
202 |
-
# score=confidence,
|
203 |
-
# start=0,
|
204 |
-
# end=word_end,
|
205 |
-
# left=word_left,
|
206 |
-
# top=word_top,
|
207 |
-
# width=word_width_abs,
|
208 |
-
# height=word_height_abs
|
209 |
-
# )
|
210 |
-
|
211 |
-
# # Add to handwriting collections immediately
|
212 |
-
# handwriting.append(recogniser_result)
|
213 |
-
# handwriting_recogniser_results.append(recogniser_result)
|
214 |
-
# signature_or_handwriting_recogniser_results.append(recogniser_result)
|
215 |
-
# current_line_handwriting_results.append(recogniser_result)
|
216 |
-
|
217 |
-
# # If handwriting or signature, add to bounding box
|
218 |
-
|
219 |
-
# elif (text_block['BlockType'] == 'SIGNATURE'):
|
220 |
-
# line_text = "SIGNATURE"
|
221 |
-
# is_signature = True
|
222 |
-
# entity_name = "SIGNATURE"
|
223 |
-
# confidence = text_block.get('Confidence', 0)
|
224 |
-
# word_end = len(line_text)
|
225 |
-
|
226 |
-
# recogniser_result = CustomImageRecognizerResult(
|
227 |
-
# entity_type=entity_name,
|
228 |
-
# text=line_text,
|
229 |
-
# score=confidence,
|
230 |
-
# start=0,
|
231 |
-
# end=word_end,
|
232 |
-
# left=line_left,
|
233 |
-
# top=line_top,
|
234 |
-
# width=width_abs,
|
235 |
-
# height=height_abs
|
236 |
-
# )
|
237 |
-
|
238 |
-
# # Add to signature collections immediately
|
239 |
-
# signatures.append(recogniser_result)
|
240 |
-
# signature_recogniser_results.append(recogniser_result)
|
241 |
-
# signature_or_handwriting_recogniser_results.append(recogniser_result)
|
242 |
-
|
243 |
-
# words = [{
|
244 |
-
# 'text': line_text,
|
245 |
-
# 'bounding_box': (line_left, line_top, line_right, line_bottom)
|
246 |
-
# }]
|
247 |
-
|
248 |
-
# ocr_results_with_words["text_line_" + str(i)] = {
|
249 |
-
# "line": i,
|
250 |
-
# 'text': line_text,
|
251 |
-
# 'bounding_box': (line_left, line_top, line_right, line_bottom),
|
252 |
-
# 'words': words
|
253 |
-
# }
|
254 |
-
|
255 |
-
# # Create OCRResult with absolute coordinates
|
256 |
-
# ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs)
|
257 |
-
# all_ocr_results.append(ocr_result)
|
258 |
-
|
259 |
-
# is_signature_or_handwriting = is_signature | is_handwriting
|
260 |
-
|
261 |
-
# # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
|
262 |
-
# if is_signature_or_handwriting:
|
263 |
-
# if recogniser_result not in signature_or_handwriting_recogniser_results:
|
264 |
-
# signature_or_handwriting_recogniser_results.append(recogniser_result)
|
265 |
-
|
266 |
-
# if is_signature:
|
267 |
-
# if recogniser_result not in signature_recogniser_results:
|
268 |
-
# signature_recogniser_results.append(recogniser_result)
|
269 |
-
|
270 |
-
# if is_handwriting:
|
271 |
-
# if recogniser_result not in handwriting_recogniser_results:
|
272 |
-
# handwriting_recogniser_results.append(recogniser_result)
|
273 |
-
|
274 |
-
# i += 1
|
275 |
-
|
276 |
-
# return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words
|
277 |
-
|
278 |
-
|
279 |
def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
|
280 |
'''
|
281 |
Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
|
@@ -289,15 +107,13 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
|
|
289 |
ocr_results_with_words = {}
|
290 |
text_block={}
|
291 |
|
292 |
-
|
293 |
|
294 |
# Assuming json_data is structured as a dictionary with a "pages" key
|
295 |
-
|
296 |
# Find the specific page data
|
297 |
page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
|
298 |
|
299 |
-
#print("page_json_data:", page_json_data)
|
300 |
-
|
301 |
if "Blocks" in page_json_data:
|
302 |
# Access the data for the specific page
|
303 |
text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
|
@@ -424,8 +240,8 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
|
|
424 |
|
425 |
if line_text:
|
426 |
|
427 |
-
ocr_results_with_words["text_line_" + str(
|
428 |
-
"line":
|
429 |
'text': line_text,
|
430 |
'bounding_box': (line_left, line_top, line_right, line_bottom),
|
431 |
'words': words,
|
@@ -433,9 +249,12 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
|
|
433 |
}
|
434 |
|
435 |
# Create OCRResult with absolute coordinates
|
436 |
-
ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs)
|
437 |
all_ocr_results.append(ocr_result)
|
438 |
|
|
|
|
|
|
|
439 |
is_signature_or_handwriting = is_signature | is_handwriting
|
440 |
|
441 |
# If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
|
@@ -451,7 +270,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
|
|
451 |
if recogniser_result not in handwriting_recogniser_results:
|
452 |
handwriting_recogniser_results.append(recogniser_result)
|
453 |
|
454 |
-
|
455 |
|
456 |
# Add page key to the line level results
|
457 |
all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}
|
@@ -459,7 +278,6 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
|
|
459 |
|
460 |
return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page
|
461 |
|
462 |
-
|
463 |
def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
464 |
"""
|
465 |
Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
|
|
|
3 |
import io
|
4 |
import os
|
5 |
import json
|
|
|
6 |
import pikepdf
|
7 |
import time
|
8 |
import pandas as pd
|
|
|
12 |
def extract_textract_metadata(response:object):
|
13 |
"""Extracts metadata from an AWS Textract response."""
|
14 |
|
|
|
|
|
15 |
request_id = response['ResponseMetadata']['RequestId']
|
16 |
pages = response['DocumentMetadata']['Pages']
|
|
|
17 |
|
18 |
return str({
|
19 |
'RequestId': request_id,
|
20 |
'Pages': pages
|
|
|
|
|
21 |
})
|
22 |
|
23 |
def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
|
|
|
48 |
time.sleep(3)
|
49 |
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
50 |
else:
|
|
|
51 |
# Call detect_document_text to extract plain text
|
52 |
try:
|
53 |
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
|
|
67 |
'data': response
|
68 |
}
|
69 |
|
|
|
|
|
70 |
request_metadata = extract_textract_metadata(response) # Metadata comes out as a string
|
71 |
|
|
|
|
|
72 |
# Return a list containing the wrapped response and the metadata
|
73 |
return wrapped_response, request_metadata # Return as a list to match the desired structure
|
74 |
|
|
|
92 |
# Now you can use the `pdf_bytes` to convert it to an image or further process
|
93 |
buffer.close()
|
94 |
|
|
|
|
|
|
|
95 |
return pdf_bytes
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
|
98 |
'''
|
99 |
Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
|
|
|
107 |
ocr_results_with_words = {}
|
108 |
text_block={}
|
109 |
|
110 |
+
text_line_number = 1
|
111 |
|
112 |
# Assuming json_data is structured as a dictionary with a "pages" key
|
113 |
+
|
114 |
# Find the specific page data
|
115 |
page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
|
116 |
|
|
|
|
|
117 |
if "Blocks" in page_json_data:
|
118 |
# Access the data for the specific page
|
119 |
text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
|
|
|
240 |
|
241 |
if line_text:
|
242 |
|
243 |
+
ocr_results_with_words["text_line_" + str(text_line_number)] = {
|
244 |
+
"line": text_line_number,
|
245 |
'text': line_text,
|
246 |
'bounding_box': (line_left, line_top, line_right, line_bottom),
|
247 |
'words': words,
|
|
|
249 |
}
|
250 |
|
251 |
# Create OCRResult with absolute coordinates
|
252 |
+
ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs, conf=confidence, line=text_line_number)
|
253 |
all_ocr_results.append(ocr_result)
|
254 |
|
255 |
+
# Increase line number
|
256 |
+
text_line_number += 1
|
257 |
+
|
258 |
is_signature_or_handwriting = is_signature | is_handwriting
|
259 |
|
260 |
# If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
|
|
|
270 |
if recogniser_result not in handwriting_recogniser_results:
|
271 |
handwriting_recogniser_results.append(recogniser_result)
|
272 |
|
273 |
+
|
274 |
|
275 |
# Add page key to the line level results
|
276 |
all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}
|
|
|
278 |
|
279 |
return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page
|
280 |
|
|
|
281 |
def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
282 |
"""
|
283 |
Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
|
tools/cli_redact.py
CHANGED
@@ -1,84 +1,164 @@
|
|
1 |
import argparse
|
2 |
import os
|
3 |
-
|
4 |
-
from tools.
|
|
|
5 |
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
|
6 |
from tools.file_redaction import choose_and_run_redactor
|
7 |
-
|
8 |
-
from datetime import datetime
|
9 |
-
|
10 |
-
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
|
11 |
-
'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
|
12 |
-
'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
|
13 |
-
'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
|
14 |
-
'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
|
15 |
-
'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
16 |
-
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
|
17 |
-
"STREETNAME", "UKPOSTCODE"]
|
18 |
-
|
19 |
-
def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
|
20 |
-
log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
|
21 |
-
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"]):
|
22 |
-
|
23 |
-
if output_file_list is None:
|
24 |
-
output_file_list = []
|
25 |
-
if log_files_list is None:
|
26 |
-
log_files_list = []
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
# Optional arguments with defaults matching the GUI app
|
34 |
-
parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
|
35 |
-
default='Quick image analysis', help='OCR method to use')
|
36 |
-
parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
|
37 |
-
default='Local', help='PII detection method')
|
38 |
-
parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
|
39 |
-
parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
|
40 |
-
parser.add_argument('--allow_list', help='Path to allow list CSV file')
|
41 |
-
parser.add_argument('--output_dir', default='output/', help='Output directory')
|
42 |
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
#
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
#
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
#
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
55 |
|
56 |
-
|
57 |
-
file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)
|
58 |
|
59 |
-
#
|
|
|
|
|
|
|
60 |
|
61 |
-
#
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
if __name__ == "__main__":
|
84 |
-
main()
|
|
|
1 |
import argparse
|
2 |
import os
|
3 |
+
import pandas as pd
|
4 |
+
from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
|
5 |
+
from tools.helper_functions import ensure_output_folder_exists
|
6 |
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
|
7 |
from tools.file_redaction import choose_and_run_redactor
|
8 |
+
from tools.anonymisation import anonymise_files_with_open_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
# --- Constants and Configuration ---
|
11 |
+
INPUT_FOLDER = 'input/'
|
12 |
+
OUTPUT_FOLDER = 'output/'
|
13 |
+
DEFAULT_LANGUAGE = 'en'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
# Define entities for redaction
|
16 |
+
chosen_comprehend_entities = [
|
17 |
+
'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
|
18 |
+
'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
|
19 |
+
'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
|
20 |
+
'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
|
21 |
+
'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
|
22 |
+
'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER'
|
23 |
+
]
|
24 |
+
chosen_redact_entities = [
|
25 |
+
"TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"
|
26 |
+
]
|
27 |
+
|
28 |
+
# --- Main CLI Function ---
|
29 |
+
def main():
|
30 |
+
"""
|
31 |
+
A unified command-line interface to prepare, redact, and anonymise various document types.
|
32 |
+
"""
|
33 |
+
parser = argparse.ArgumentParser(
|
34 |
+
description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
|
35 |
+
formatter_class=argparse.RawTextHelpFormatter
|
36 |
+
)
|
37 |
|
38 |
+
# --- General Arguments (apply to all file types) ---
|
39 |
+
general_group = parser.add_argument_group('General Options')
|
40 |
+
general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
|
41 |
+
general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
|
42 |
+
general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
|
43 |
+
general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
|
44 |
+
general_group.add_argument('--pii_detector',
|
45 |
+
choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
|
46 |
+
default=LOCAL_PII_OPTION,
|
47 |
+
help='Core PII detection method (Local or AWS).')
|
48 |
+
general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
|
49 |
+
general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')
|
50 |
|
51 |
+
# --- PDF/Image Redaction Arguments ---
|
52 |
+
pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
|
53 |
+
pdf_group.add_argument('--ocr_method',
|
54 |
+
choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
|
55 |
+
default=TESSERACT_TEXT_EXTRACT_OPTION,
|
56 |
+
help='OCR method for text extraction from images.')
|
57 |
+
pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
|
58 |
+
pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
|
59 |
+
pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
|
60 |
+
pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')
|
61 |
|
62 |
+
# --- Word/Tabular Anonymisation Arguments ---
|
63 |
+
tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
|
64 |
+
tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
|
65 |
+
tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
|
66 |
+
tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
|
67 |
+
tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
|
68 |
+
tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')
|
69 |
|
70 |
+
args = parser.parse_args()
|
|
|
71 |
|
72 |
+
# --- Initial Setup ---
|
73 |
+
ensure_output_folder_exists(args.output_dir)
|
74 |
+
_, file_extension = os.path.splitext(args.input_file)
|
75 |
+
file_extension = file_extension.lower()
|
76 |
|
77 |
+
# Load allow/deny lists
|
78 |
+
allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
|
79 |
+
deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
|
80 |
+
|
81 |
+
|
82 |
+
# --- Route to the Correct Workflow Based on File Type ---
|
83 |
+
|
84 |
+
# Workflow 1: PDF/Image Redaction
|
85 |
+
if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
|
86 |
+
print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
|
87 |
+
try:
|
88 |
+
# Step 1: Prepare the document
|
89 |
+
print("\nStep 1: Preparing document...")
|
90 |
+
(
|
91 |
+
prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
|
92 |
+
image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
|
93 |
+
) = prepare_image_or_pdf(
|
94 |
+
file_paths=[args.input_file], text_extract_method=args.ocr_method,
|
95 |
+
all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
|
96 |
+
first_loop_state=True, prepare_for_review=args.prepare_for_review,
|
97 |
+
output_folder=args.output_dir, prepare_images=args.prepare_images
|
98 |
+
)
|
99 |
+
print(f"Preparation complete. {prep_summary}")
|
100 |
+
|
101 |
+
# Step 2: Redact the prepared document
|
102 |
+
print("\nStep 2: Running redaction...")
|
103 |
+
(
|
104 |
+
output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
|
105 |
+
) = choose_and_run_redactor(
|
106 |
+
file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
|
107 |
+
pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
|
108 |
+
chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
|
109 |
+
in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
|
110 |
+
pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
|
111 |
+
document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
|
112 |
+
aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
|
113 |
+
language=args.language, output_folder=args.output_dir
|
114 |
+
)
|
115 |
+
|
116 |
+
print("\n--- Redaction Process Complete ---")
|
117 |
+
print(f"Summary: {output_summary}")
|
118 |
+
print(f"\nOutput files saved to: {args.output_dir}")
|
119 |
+
print("Generated Files:", sorted(output_files))
|
120 |
+
if log_files: print("Log Files:", sorted(log_files))
|
121 |
+
|
122 |
+
except Exception as e:
|
123 |
+
print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
|
124 |
+
|
125 |
+
# Workflow 2: Word/Tabular Data Anonymisation
|
126 |
+
elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
|
127 |
+
print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
|
128 |
+
try:
|
129 |
+
# Run the anonymisation function directly
|
130 |
+
output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
|
131 |
+
file_paths=[args.input_file],
|
132 |
+
in_text="", # Not used for file-based operations
|
133 |
+
anon_strat=args.anon_strat,
|
134 |
+
chosen_cols=args.columns,
|
135 |
+
chosen_redact_entities=chosen_redact_entities,
|
136 |
+
in_allow_list=allow_list,
|
137 |
+
in_excel_sheets=args.excel_sheets,
|
138 |
+
first_loop_state=True,
|
139 |
+
output_folder=args.output_dir,
|
140 |
+
in_deny_list=deny_list,
|
141 |
+
max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
|
142 |
+
pii_identification_method=args.pii_detector,
|
143 |
+
chosen_redact_comprehend_entities=chosen_comprehend_entities,
|
144 |
+
aws_access_key_textbox=args.aws_access_key,
|
145 |
+
aws_secret_key_textbox=args.aws_secret_key,
|
146 |
+
language=args.language
|
147 |
+
)
|
148 |
+
|
149 |
+
print("\n--- Anonymisation Process Complete ---")
|
150 |
+
print(f"Summary: {output_summary}")
|
151 |
+
print(f"\nOutput files saved to: {args.output_dir}")
|
152 |
+
print("Generated Files:", sorted(output_files))
|
153 |
+
if log_files: print("Log Files:", sorted(log_files))
|
154 |
|
155 |
+
except Exception as e:
|
156 |
+
print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
|
157 |
+
|
158 |
+
else:
|
159 |
+
print(f"Error: Unsupported file type '{file_extension}'.")
|
160 |
+
print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
|
161 |
+
print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
|
162 |
|
163 |
if __name__ == "__main__":
|
164 |
+
main()
|
tools/config.py
CHANGED
@@ -154,10 +154,9 @@ if USE_LOG_SUBFOLDERS == "True":
|
|
154 |
ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
|
155 |
USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
|
161 |
|
162 |
# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
|
163 |
DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
|
@@ -197,6 +196,7 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FIL
|
|
197 |
|
198 |
# Create Tesseract and Poppler folders if you have installed them locally
|
199 |
TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
|
|
|
200 |
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows,install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
|
201 |
|
202 |
if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
|
@@ -266,6 +266,11 @@ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
|
|
266 |
if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
|
267 |
TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
|
268 |
|
|
|
|
|
|
|
|
|
|
|
269 |
# Entities for redaction
|
270 |
CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
|
271 |
|
@@ -284,7 +289,26 @@ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
|
|
284 |
|
285 |
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
|
286 |
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
|
289 |
RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
|
290 |
|
|
|
154 |
ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
|
155 |
USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
|
156 |
|
157 |
+
S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
|
158 |
+
S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
|
159 |
+
S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
|
|
|
160 |
|
161 |
# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
|
162 |
DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
|
|
|
196 |
|
197 |
# Create Tesseract and Poppler folders if you have installed them locally
|
198 |
TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
|
199 |
+
TESSERACT_DATA_FOLDER = get_or_create_env_var('TESSERACT_DATA_FOLDER', "/usr/share/tessdata")
|
200 |
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows,install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
|
201 |
|
202 |
if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
|
|
|
266 |
if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
|
267 |
TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
|
268 |
|
269 |
+
### Local OCR model - Tesseract vs PaddleOCR
|
270 |
+
CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
|
271 |
+
|
272 |
+
PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "False") # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so it is default False
|
273 |
+
|
274 |
# Entities for redaction
|
275 |
CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
|
276 |
|
|
|
289 |
|
290 |
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
|
291 |
|
292 |
+
### Language selection options
|
293 |
+
|
294 |
+
SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
|
295 |
+
|
296 |
+
DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
|
297 |
+
DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en") # For tesseract, ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
|
298 |
+
# For paddle, ensure the paddle language data (e.g., fra.traineddata) is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
|
299 |
+
# For AWS Comprehend, only English and Spanish are supported https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html ['en', 'es']
|
300 |
+
# AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
|
301 |
+
|
302 |
+
textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
|
303 |
+
aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es']")
|
304 |
+
|
305 |
+
# The choices that the user sees
|
306 |
+
MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
|
307 |
+
LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
|
308 |
+
|
309 |
+
|
310 |
+
|
311 |
+
### File output options
|
312 |
|
313 |
RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
|
314 |
|
tools/custom_image_analyser_engine.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
tools/data_anonymise.py
CHANGED
@@ -6,16 +6,19 @@ import time
|
|
6 |
import boto3
|
7 |
import botocore
|
8 |
import pandas as pd
|
|
|
|
|
9 |
from openpyxl import Workbook
|
10 |
from faker import Faker
|
11 |
from gradio import Progress
|
12 |
-
from typing import List, Dict, Any
|
|
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
15 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
16 |
-
from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
|
17 |
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
|
18 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
|
19 |
# Use custom version of analyze_dict to be able to track progress
|
20 |
from tools.presidio_analyzer_custom import analyze_dict
|
21 |
|
@@ -46,7 +49,7 @@ def initial_clean(text:str) -> str:
|
|
46 |
return text
|
47 |
|
48 |
def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
|
49 |
-
output =
|
50 |
|
51 |
if hasattr(result, 'value'):
|
52 |
text = result.value[data_row]
|
@@ -86,7 +89,7 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
|
|
86 |
Returns:
|
87 |
str: A string containing the detailed decision process output.
|
88 |
"""
|
89 |
-
decision_process_output =
|
90 |
keys_to_keep = ['entity_type', 'start', 'end']
|
91 |
|
92 |
# Run through each column to analyse for PII
|
@@ -115,22 +118,16 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
|
|
115 |
# ## Pick out common names and replace them with the same person value
|
116 |
df_dict = df.to_dict(orient="list")
|
117 |
|
118 |
-
analyzer = AnalyzerEngine()
|
119 |
-
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=
|
120 |
|
121 |
-
analyzer_results = batch_analyzer.analyze_dict(df_dict, language=
|
122 |
analyzer_results = list(analyzer_results)
|
123 |
|
124 |
-
# + tags=[]
|
125 |
text = analyzer_results[3].value
|
126 |
|
127 |
-
# + tags=[]
|
128 |
recognizer_result = str(analyzer_results[3].recognizer_results)
|
129 |
|
130 |
-
# + tags=[]
|
131 |
-
recognizer_result
|
132 |
-
|
133 |
-
# + tags=[]
|
134 |
data_str = recognizer_result # abbreviated for brevity
|
135 |
|
136 |
# Adjusting the parse_dict function to handle trailing ']'
|
@@ -153,7 +150,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
|
|
153 |
|
154 |
# Re-running the improved processing code
|
155 |
|
156 |
-
result =
|
157 |
|
158 |
for lst_str in list_strs:
|
159 |
# Splitting each list string into individual dictionary strings
|
@@ -164,73 +161,156 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
|
|
164 |
dicts = [parse_dict(d) for d in dict_strs]
|
165 |
result.append(dicts)
|
166 |
|
167 |
-
|
168 |
-
|
169 |
-
# + tags=[]
|
170 |
-
names = []
|
171 |
|
172 |
for idx, paragraph in enumerate(text):
|
173 |
-
paragraph_texts =
|
174 |
for dictionary in result[idx]:
|
175 |
if dictionary['type'] == 'PERSON':
|
176 |
paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
|
177 |
names.append(paragraph_texts)
|
178 |
|
179 |
-
# + tags=[]
|
180 |
# Flatten the list of lists and extract unique names
|
181 |
unique_names = list(set(name for sublist in names for name in sublist))
|
182 |
|
183 |
-
# + tags=[]
|
184 |
fake_names = pd.Series(unique_names).apply(fake_first_name)
|
185 |
|
186 |
-
# + tags=[]
|
187 |
mapping_df = pd.DataFrame(data={"Unique names":unique_names,
|
188 |
"Fake names": fake_names})
|
189 |
|
190 |
-
# + tags=[]
|
191 |
-
# Convert mapping dataframe to dictionary
|
192 |
# Convert mapping dataframe to dictionary, adding word boundaries for full-word match
|
193 |
name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
|
194 |
|
195 |
-
# + tags=[]
|
196 |
name_map
|
197 |
|
198 |
-
# + tags=[]
|
199 |
scrubbed_df_consistent_names = df.replace(name_map, regex = True)
|
200 |
|
201 |
-
# + tags=[]
|
202 |
scrubbed_df_consistent_names
|
203 |
|
204 |
return scrubbed_df_consistent_names
|
205 |
|
206 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
in_text: str,
|
208 |
anon_strat: str,
|
209 |
chosen_cols: List[str],
|
210 |
-
language: str,
|
211 |
chosen_redact_entities: List[str],
|
212 |
in_allow_list: List[str] = None,
|
213 |
latest_file_completed: int = 0,
|
214 |
-
out_message: list =
|
215 |
-
out_file_paths: list =
|
216 |
-
log_files_output_paths: list =
|
217 |
-
in_excel_sheets: list =
|
218 |
first_loop_state: bool = False,
|
219 |
output_folder: str = OUTPUT_FOLDER,
|
220 |
-
in_deny_list:list[str]=
|
221 |
max_fuzzy_spelling_mistakes_num:int=0,
|
222 |
pii_identification_method:str="Local",
|
223 |
-
chosen_redact_comprehend_entities:List[str]=
|
224 |
comprehend_query_number:int=0,
|
225 |
aws_access_key_textbox:str='',
|
226 |
aws_secret_key_textbox:str='',
|
227 |
actual_time_taken_number:float=0,
|
|
|
228 |
progress: Progress = Progress(track_tqdm=True)):
|
229 |
"""
|
230 |
This function anonymises data files based on the provided parameters.
|
231 |
|
232 |
Parameters:
|
233 |
-
- file_paths (List[str]): A list of file paths to anonymise.
|
234 |
- in_text (str): The text to anonymise if file_paths is 'open_text'.
|
235 |
- anon_strat (str): The anonymisation strategy to use.
|
236 |
- chosen_cols (List[str]): A list of column names to anonymise.
|
@@ -252,17 +332,26 @@ def anonymise_data_files(file_paths: List[str],
|
|
252 |
- aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
|
253 |
- aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
|
254 |
- actual_time_taken_number (float, optional): Time taken to do the redaction.
|
|
|
255 |
- progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
|
256 |
"""
|
257 |
|
258 |
tic = time.perf_counter()
|
259 |
comprehend_client = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
|
261 |
# If this is the first time around, set variables to 0/blank
|
262 |
if first_loop_state==True:
|
263 |
latest_file_completed = 0
|
264 |
-
out_message =
|
265 |
-
out_file_paths =
|
266 |
|
267 |
# Load file
|
268 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
@@ -272,23 +361,23 @@ def anonymise_data_files(file_paths: List[str],
|
|
272 |
#print("log_files_output_paths:",log_files_output_paths)
|
273 |
|
274 |
if isinstance(log_files_output_paths, str):
|
275 |
-
log_files_output_paths =
|
276 |
|
277 |
if not out_file_paths:
|
278 |
-
out_file_paths =
|
279 |
|
280 |
if isinstance(in_allow_list, list):
|
281 |
if in_allow_list:
|
282 |
in_allow_list_flat = in_allow_list
|
283 |
else:
|
284 |
-
in_allow_list_flat =
|
285 |
elif isinstance(in_allow_list, pd.DataFrame):
|
286 |
if not in_allow_list.empty:
|
287 |
in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
|
288 |
else:
|
289 |
-
in_allow_list_flat =
|
290 |
else:
|
291 |
-
in_allow_list_flat =
|
292 |
|
293 |
anon_df = pd.DataFrame()
|
294 |
|
@@ -342,15 +431,37 @@ def anonymise_data_files(file_paths: List[str],
|
|
342 |
sheet_name = ""
|
343 |
file_type = ""
|
344 |
|
345 |
-
out_file_paths, out_message, key_string, log_files_output_paths =
|
346 |
else:
|
347 |
# If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
|
348 |
file_type = detect_file_type(anon_file)
|
349 |
print("File type is:", file_type)
|
350 |
|
351 |
out_file_part = get_file_name_without_type(anon_file.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
352 |
|
353 |
-
|
354 |
print("Running through all xlsx sheets")
|
355 |
#anon_xlsx = pd.ExcelFile(anon_file)
|
356 |
if not in_excel_sheets:
|
@@ -371,14 +482,14 @@ def anonymise_data_files(file_paths: List[str],
|
|
371 |
|
372 |
anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
|
373 |
|
374 |
-
out_file_paths, out_message, key_string, log_files_output_paths =
|
375 |
|
376 |
else:
|
377 |
sheet_name = ""
|
378 |
anon_df = read_file(anon_file)
|
379 |
out_file_part = get_file_name_without_type(anon_file.name)
|
380 |
|
381 |
-
out_file_paths, out_message, key_string, log_files_output_paths =
|
382 |
|
383 |
# Increase latest file completed count unless we are at the last file
|
384 |
if latest_file_completed != len(file_paths):
|
@@ -392,6 +503,9 @@ def anonymise_data_files(file_paths: List[str],
|
|
392 |
|
393 |
actual_time_taken_number += out_time_float
|
394 |
|
|
|
|
|
|
|
395 |
out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
|
396 |
|
397 |
out_message_out = '\n'.join(out_message)
|
@@ -406,7 +520,7 @@ def anonymise_data_files(file_paths: List[str],
|
|
406 |
|
407 |
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number
|
408 |
|
409 |
-
def
|
410 |
anon_file: str,
|
411 |
anon_df: pd.DataFrame,
|
412 |
chosen_cols: List[str],
|
@@ -415,18 +529,20 @@ def anon_wrapper_func(
|
|
415 |
out_message: str,
|
416 |
excel_sheet_name: str,
|
417 |
anon_strat: str,
|
418 |
-
language: str,
|
419 |
chosen_redact_entities: List[str],
|
420 |
in_allow_list: List[str],
|
421 |
file_type: str,
|
422 |
anon_xlsx_export_file_name: str,
|
423 |
log_files_output_paths: List[str],
|
424 |
-
in_deny_list: List[str]=
|
425 |
max_fuzzy_spelling_mistakes_num:int=0,
|
426 |
pii_identification_method:str="Local",
|
427 |
-
|
|
|
428 |
comprehend_query_number:int=0,
|
429 |
comprehend_client:botocore.client.BaseClient="",
|
|
|
430 |
output_folder: str = OUTPUT_FOLDER
|
431 |
):
|
432 |
"""
|
@@ -469,7 +585,7 @@ def anon_wrapper_func(
|
|
469 |
Returns:
|
470 |
A list containing the common strings.
|
471 |
"""
|
472 |
-
common_strings =
|
473 |
for string in list1:
|
474 |
if string in list2:
|
475 |
common_strings.append(string)
|
@@ -485,7 +601,9 @@ def anon_wrapper_func(
|
|
485 |
|
486 |
if any_cols_found == False:
|
487 |
out_message = "No chosen columns found in dataframe: " + out_file_part
|
|
|
488 |
print(out_message)
|
|
|
489 |
else:
|
490 |
chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
|
491 |
|
@@ -495,9 +613,12 @@ def anon_wrapper_func(
|
|
495 |
anon_df_part = anon_df[chosen_cols_in_anon_df]
|
496 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
497 |
|
498 |
-
# Anonymise the selected columns
|
499 |
-
anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)
|
500 |
|
|
|
|
|
|
|
|
|
|
|
501 |
# Rejoin the dataframe together
|
502 |
anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
|
503 |
anon_df_out = anon_df_out[all_cols_original_order]
|
@@ -531,7 +652,7 @@ def anon_wrapper_func(
|
|
531 |
|
532 |
else:
|
533 |
anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
|
534 |
-
anon_df_out.to_csv(anon_export_file_name, index = None)
|
535 |
|
536 |
decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
|
537 |
with open(decision_process_log_output_file, "w") as f:
|
@@ -553,14 +674,15 @@ def anonymise_script(df:pd.DataFrame,
|
|
553 |
anon_strat:str,
|
554 |
language:str,
|
555 |
chosen_redact_entities:List[str],
|
556 |
-
in_allow_list:List[str]=
|
557 |
-
in_deny_list:List[str]=
|
558 |
max_fuzzy_spelling_mistakes_num:int=0,
|
559 |
pii_identification_method:str="Local",
|
560 |
-
chosen_redact_comprehend_entities:List[str]=
|
561 |
comprehend_query_number:int=0,
|
562 |
-
comprehend_client:botocore.client.BaseClient="",
|
563 |
custom_entities:List[str]=custom_entities,
|
|
|
564 |
progress:Progress=Progress(track_tqdm=False)):
|
565 |
'''
|
566 |
Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
|
@@ -580,21 +702,43 @@ def anonymise_script(df:pd.DataFrame,
|
|
580 |
if in_allow_list:
|
581 |
in_allow_list_flat = in_allow_list
|
582 |
else:
|
583 |
-
in_allow_list_flat =
|
584 |
elif isinstance(in_allow_list, pd.DataFrame):
|
585 |
if not in_allow_list.empty:
|
586 |
in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
|
587 |
else:
|
588 |
-
in_allow_list_flat =
|
589 |
else:
|
590 |
-
in_allow_list_flat =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
591 |
|
592 |
if isinstance(in_deny_list, pd.DataFrame):
|
593 |
if not in_deny_list.empty:
|
594 |
in_deny_list = in_deny_list.iloc[:, 0].tolist()
|
595 |
else:
|
596 |
# Handle the case where the DataFrame is empty
|
597 |
-
in_deny_list =
|
598 |
|
599 |
# Sort the strings in order from the longest string to the shortest
|
600 |
in_deny_list = sorted(in_deny_list, key=len, reverse=True)
|
@@ -612,7 +756,7 @@ def anonymise_script(df:pd.DataFrame,
|
|
612 |
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
|
613 |
anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
|
614 |
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
|
615 |
-
analyzer_results =
|
616 |
|
617 |
if pii_identification_method == "Local":
|
618 |
|
|
|
6 |
import boto3
|
7 |
import botocore
|
8 |
import pandas as pd
|
9 |
+
import docx
|
10 |
+
import gradio as gr
|
11 |
from openpyxl import Workbook
|
12 |
from faker import Faker
|
13 |
from gradio import Progress
|
14 |
+
from typing import List, Dict, Any, Optional
|
15 |
+
from botocore.client import BaseClient
|
16 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
17 |
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
|
18 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
19 |
+
from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
|
20 |
from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
|
21 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
|
22 |
# Use custom version of analyze_dict to be able to track progress
|
23 |
from tools.presidio_analyzer_custom import analyze_dict
|
24 |
|
|
|
49 |
return text
|
50 |
|
51 |
def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
|
52 |
+
output = list()
|
53 |
|
54 |
if hasattr(result, 'value'):
|
55 |
text = result.value[data_row]
|
|
|
89 |
Returns:
|
90 |
str: A string containing the detailed decision process output.
|
91 |
"""
|
92 |
+
decision_process_output = list()
|
93 |
keys_to_keep = ['entity_type', 'start', 'end']
|
94 |
|
95 |
# Run through each column to analyse for PII
|
|
|
118 |
# ## Pick out common names and replace them with the same person value
|
119 |
df_dict = df.to_dict(orient="list")
|
120 |
|
121 |
+
#analyzer = AnalyzerEngine()
|
122 |
+
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
|
123 |
|
124 |
+
analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
|
125 |
analyzer_results = list(analyzer_results)
|
126 |
|
|
|
127 |
text = analyzer_results[3].value
|
128 |
|
|
|
129 |
recognizer_result = str(analyzer_results[3].recognizer_results)
|
130 |
|
|
|
|
|
|
|
|
|
131 |
data_str = recognizer_result # abbreviated for brevity
|
132 |
|
133 |
# Adjusting the parse_dict function to handle trailing ']'
|
|
|
150 |
|
151 |
# Re-running the improved processing code
|
152 |
|
153 |
+
result = list()
|
154 |
|
155 |
for lst_str in list_strs:
|
156 |
# Splitting each list string into individual dictionary strings
|
|
|
161 |
dicts = [parse_dict(d) for d in dict_strs]
|
162 |
result.append(dicts)
|
163 |
|
164 |
+
names = list()
|
|
|
|
|
|
|
165 |
|
166 |
for idx, paragraph in enumerate(text):
|
167 |
+
paragraph_texts = list()
|
168 |
for dictionary in result[idx]:
|
169 |
if dictionary['type'] == 'PERSON':
|
170 |
paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
|
171 |
names.append(paragraph_texts)
|
172 |
|
|
|
173 |
# Flatten the list of lists and extract unique names
|
174 |
unique_names = list(set(name for sublist in names for name in sublist))
|
175 |
|
|
|
176 |
fake_names = pd.Series(unique_names).apply(fake_first_name)
|
177 |
|
|
|
178 |
mapping_df = pd.DataFrame(data={"Unique names":unique_names,
|
179 |
"Fake names": fake_names})
|
180 |
|
|
|
|
|
181 |
# Convert mapping dataframe to dictionary, adding word boundaries for full-word match
|
182 |
name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}
|
183 |
|
|
|
184 |
name_map
|
185 |
|
|
|
186 |
scrubbed_df_consistent_names = df.replace(name_map, regex = True)
|
187 |
|
|
|
188 |
scrubbed_df_consistent_names
|
189 |
|
190 |
return scrubbed_df_consistent_names
|
191 |
|
192 |
+
def handle_docx_anonymisation(
|
193 |
+
file_path: str,
|
194 |
+
output_folder: str,
|
195 |
+
anon_strat: str,
|
196 |
+
chosen_redact_entities: List[str],
|
197 |
+
in_allow_list: List[str],
|
198 |
+
in_deny_list: List[str],
|
199 |
+
max_fuzzy_spelling_mistakes_num: int,
|
200 |
+
pii_identification_method: str,
|
201 |
+
chosen_redact_comprehend_entities: List[str],
|
202 |
+
comprehend_query_number: int,
|
203 |
+
comprehend_client: BaseClient,
|
204 |
+
language: Optional[str] = DEFAULT_LANGUAGE,
|
205 |
+
nlp_analyser: AnalyzerEngine = nlp_analyser
|
206 |
+
):
|
207 |
+
"""
|
208 |
+
Anonymises a .docx file by extracting text, processing it, and re-inserting it.
|
209 |
+
|
210 |
+
Returns:
|
211 |
+
A tuple containing the output file path and the log file path.
|
212 |
+
"""
|
213 |
+
|
214 |
+
# 1. Load the document and extract text elements
|
215 |
+
doc = docx.Document(file_path)
|
216 |
+
text_elements = list() # This will store the actual docx objects (paragraphs, cells)
|
217 |
+
original_texts = list() # This will store the text from those objects
|
218 |
+
|
219 |
+
# Extract from paragraphs
|
220 |
+
for para in doc.paragraphs:
|
221 |
+
if para.text.strip(): # Only process non-empty paragraphs
|
222 |
+
text_elements.append(para)
|
223 |
+
original_texts.append(para.text)
|
224 |
+
|
225 |
+
# Extract from tables
|
226 |
+
for table in doc.tables:
|
227 |
+
for row in table.rows:
|
228 |
+
for cell in row.cells:
|
229 |
+
if cell.text.strip(): # Only process non-empty cells
|
230 |
+
text_elements.append(cell)
|
231 |
+
original_texts.append(cell.text)
|
232 |
+
|
233 |
+
# If there's no text to process, return early
|
234 |
+
if not original_texts:
|
235 |
+
print(f"No text found in {file_path}. Skipping.")
|
236 |
+
return None, None
|
237 |
+
|
238 |
+
# 2. Convert to a DataFrame for the existing anonymisation script
|
239 |
+
df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
|
240 |
+
|
241 |
+
# 3. Call the core anonymisation script
|
242 |
+
anonymised_df, _, decision_log = anonymise_script(
|
243 |
+
df=df_to_anonymise,
|
244 |
+
anon_strat=anon_strat,
|
245 |
+
language=language,
|
246 |
+
chosen_redact_entities=chosen_redact_entities,
|
247 |
+
in_allow_list=in_allow_list,
|
248 |
+
in_deny_list=in_deny_list,
|
249 |
+
max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
|
250 |
+
pii_identification_method=pii_identification_method,
|
251 |
+
chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
|
252 |
+
comprehend_query_number=comprehend_query_number,
|
253 |
+
comprehend_client=comprehend_client,
|
254 |
+
nlp_analyser=nlp_analyser
|
255 |
+
)
|
256 |
+
|
257 |
+
anonymised_texts = anonymised_df['text_to_redact'].tolist()
|
258 |
+
|
259 |
+
# 4. Re-insert the anonymised text back into the document objects
|
260 |
+
for element, new_text in zip(text_elements, anonymised_texts):
|
261 |
+
if isinstance(element, docx.text.paragraph.Paragraph):
|
262 |
+
# Clear existing content (runs) and add the new text in a single new run
|
263 |
+
element.clear()
|
264 |
+
element.add_run(new_text)
|
265 |
+
elif isinstance(element, docx.table._Cell):
|
266 |
+
# For cells, setting .text works similarly
|
267 |
+
element.text = new_text
|
268 |
+
|
269 |
+
# 5. Save the redacted document and the log file
|
270 |
+
base_name = os.path.basename(file_path)
|
271 |
+
file_name_without_ext = os.path.splitext(base_name)[0]
|
272 |
+
|
273 |
+
output_docx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.docx")
|
274 |
+
log_file_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted_log.txt")
|
275 |
+
|
276 |
+
output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
|
277 |
+
|
278 |
+
anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig")
|
279 |
+
doc.save(output_docx_path)
|
280 |
+
|
281 |
+
with open(log_file_path, "w", encoding="utf-8-sig") as f:
|
282 |
+
f.write(decision_log)
|
283 |
+
|
284 |
+
return output_docx_path, log_file_path, output_xlsx_path
|
285 |
+
|
286 |
+
def anonymise_files_with_open_text(file_paths: List[str],
|
287 |
in_text: str,
|
288 |
anon_strat: str,
|
289 |
chosen_cols: List[str],
|
|
|
290 |
chosen_redact_entities: List[str],
|
291 |
in_allow_list: List[str] = None,
|
292 |
latest_file_completed: int = 0,
|
293 |
+
out_message: list = list(),
|
294 |
+
out_file_paths: list = list(),
|
295 |
+
log_files_output_paths: list = list(),
|
296 |
+
in_excel_sheets: list = list(),
|
297 |
first_loop_state: bool = False,
|
298 |
output_folder: str = OUTPUT_FOLDER,
|
299 |
+
in_deny_list:list[str]=list(),
|
300 |
max_fuzzy_spelling_mistakes_num:int=0,
|
301 |
pii_identification_method:str="Local",
|
302 |
+
chosen_redact_comprehend_entities:List[str]=list(),
|
303 |
comprehend_query_number:int=0,
|
304 |
aws_access_key_textbox:str='',
|
305 |
aws_secret_key_textbox:str='',
|
306 |
actual_time_taken_number:float=0,
|
307 |
+
language: Optional[str] = None,
|
308 |
progress: Progress = Progress(track_tqdm=True)):
|
309 |
"""
|
310 |
This function anonymises data files based on the provided parameters.
|
311 |
|
312 |
Parameters:
|
313 |
+
- file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
|
314 |
- in_text (str): The text to anonymise if file_paths is 'open_text'.
|
315 |
- anon_strat (str): The anonymisation strategy to use.
|
316 |
- chosen_cols (List[str]): A list of column names to anonymise.
|
|
|
332 |
- aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
|
333 |
- aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
|
334 |
- actual_time_taken_number (float, optional): Time taken to do the redaction.
|
335 |
+
- language (str, optional): The language of the text to anonymise.
|
336 |
- progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
|
337 |
"""
|
338 |
|
339 |
tic = time.perf_counter()
|
340 |
comprehend_client = ""
|
341 |
+
|
342 |
+
# Use provided language or default
|
343 |
+
language = language or DEFAULT_LANGUAGE
|
344 |
+
|
345 |
+
if pii_identification_method == "AWS Comprehend":
|
346 |
+
if language not in aws_comprehend_language_choices:
|
347 |
+
out_message = f"Please note that this language is not supported by AWS Comprehend: {language}"
|
348 |
+
raise Warning(out_message)
|
349 |
|
350 |
# If this is the first time around, set variables to 0/blank
|
351 |
if first_loop_state==True:
|
352 |
latest_file_completed = 0
|
353 |
+
out_message = list()
|
354 |
+
out_file_paths = list()
|
355 |
|
356 |
# Load file
|
357 |
# If out message or out_file_paths are blank, change to a list so it can be appended to
|
|
|
361 |
#print("log_files_output_paths:",log_files_output_paths)
|
362 |
|
363 |
if isinstance(log_files_output_paths, str):
|
364 |
+
log_files_output_paths = list()
|
365 |
|
366 |
if not out_file_paths:
|
367 |
+
out_file_paths = list()
|
368 |
|
369 |
if isinstance(in_allow_list, list):
|
370 |
if in_allow_list:
|
371 |
in_allow_list_flat = in_allow_list
|
372 |
else:
|
373 |
+
in_allow_list_flat = list()
|
374 |
elif isinstance(in_allow_list, pd.DataFrame):
|
375 |
if not in_allow_list.empty:
|
376 |
in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
|
377 |
else:
|
378 |
+
in_allow_list_flat = list()
|
379 |
else:
|
380 |
+
in_allow_list_flat = list()
|
381 |
|
382 |
anon_df = pd.DataFrame()
|
383 |
|
|
|
431 |
sheet_name = ""
|
432 |
file_type = ""
|
433 |
|
434 |
+
out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER)
|
435 |
else:
|
436 |
# If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
|
437 |
file_type = detect_file_type(anon_file)
|
438 |
print("File type is:", file_type)
|
439 |
|
440 |
out_file_part = get_file_name_without_type(anon_file.name)
|
441 |
+
|
442 |
+
if file_type == 'docx':
|
443 |
+
output_path, log_path, output_xlsx_path = handle_docx_anonymisation(
|
444 |
+
file_path=anon_file.name, # .name if it's a temp file object
|
445 |
+
output_folder=output_folder,
|
446 |
+
anon_strat=anon_strat,
|
447 |
+
chosen_redact_entities=chosen_redact_entities,
|
448 |
+
in_allow_list=in_allow_list_flat,
|
449 |
+
in_deny_list=in_deny_list,
|
450 |
+
max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
|
451 |
+
pii_identification_method=pii_identification_method,
|
452 |
+
chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
|
453 |
+
comprehend_query_number=comprehend_query_number,
|
454 |
+
comprehend_client=comprehend_client,
|
455 |
+
language=language
|
456 |
+
)
|
457 |
+
if output_path:
|
458 |
+
out_file_paths.append(output_path)
|
459 |
+
if output_xlsx_path:
|
460 |
+
out_file_paths.append(output_xlsx_path)
|
461 |
+
if log_path:
|
462 |
+
log_files_output_paths.append(log_path)
|
463 |
|
464 |
+
elif file_type == 'xlsx':
|
465 |
print("Running through all xlsx sheets")
|
466 |
#anon_xlsx = pd.ExcelFile(anon_file)
|
467 |
if not in_excel_sheets:
|
|
|
482 |
|
483 |
anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)
|
484 |
|
485 |
+
out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
|
486 |
|
487 |
else:
|
488 |
sheet_name = ""
|
489 |
anon_df = read_file(anon_file)
|
490 |
out_file_part = get_file_name_without_type(anon_file.name)
|
491 |
|
492 |
+
out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
|
493 |
|
494 |
# Increase latest file completed count unless we are at the last file
|
495 |
if latest_file_completed != len(file_paths):
|
|
|
503 |
|
504 |
actual_time_taken_number += out_time_float
|
505 |
|
506 |
+
if isinstance(out_message, str):
|
507 |
+
out_message = [out_message]
|
508 |
+
|
509 |
out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")
|
510 |
|
511 |
out_message_out = '\n'.join(out_message)
|
|
|
520 |
|
521 |
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number
|
522 |
|
523 |
+
def tabular_anonymise_wrapper_func(
|
524 |
anon_file: str,
|
525 |
anon_df: pd.DataFrame,
|
526 |
chosen_cols: List[str],
|
|
|
529 |
out_message: str,
|
530 |
excel_sheet_name: str,
|
531 |
anon_strat: str,
|
532 |
+
language: str,
|
533 |
chosen_redact_entities: List[str],
|
534 |
in_allow_list: List[str],
|
535 |
file_type: str,
|
536 |
anon_xlsx_export_file_name: str,
|
537 |
log_files_output_paths: List[str],
|
538 |
+
in_deny_list: List[str]=list(),
|
539 |
max_fuzzy_spelling_mistakes_num:int=0,
|
540 |
pii_identification_method:str="Local",
|
541 |
+
comprehend_language: Optional[str] = None,
|
542 |
+
chosen_redact_comprehend_entities:List[str]=list(),
|
543 |
comprehend_query_number:int=0,
|
544 |
comprehend_client:botocore.client.BaseClient="",
|
545 |
+
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
546 |
output_folder: str = OUTPUT_FOLDER
|
547 |
):
|
548 |
"""
|
|
|
585 |
Returns:
|
586 |
A list containing the common strings.
|
587 |
"""
|
588 |
+
common_strings = list()
|
589 |
for string in list1:
|
590 |
if string in list2:
|
591 |
common_strings.append(string)
|
|
|
601 |
|
602 |
if any_cols_found == False:
|
603 |
out_message = "No chosen columns found in dataframe: " + out_file_part
|
604 |
+
key_string = ""
|
605 |
print(out_message)
|
606 |
+
return out_file_paths, out_message, key_string, log_files_output_paths
|
607 |
else:
|
608 |
chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
|
609 |
|
|
|
613 |
anon_df_part = anon_df[chosen_cols_in_anon_df]
|
614 |
anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)
|
615 |
|
|
|
|
|
616 |
|
617 |
+
# Anonymise the selected columns
|
618 |
+
anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser)
|
619 |
+
|
620 |
+
anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
|
621 |
+
|
622 |
# Rejoin the dataframe together
|
623 |
anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
|
624 |
anon_df_out = anon_df_out[all_cols_original_order]
|
|
|
652 |
|
653 |
else:
|
654 |
anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
|
655 |
+
anon_df_out.to_csv(anon_export_file_name, index = None, encoding="utf-8-sig")
|
656 |
|
657 |
decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
|
658 |
with open(decision_process_log_output_file, "w") as f:
|
|
|
674 |
anon_strat:str,
|
675 |
language:str,
|
676 |
chosen_redact_entities:List[str],
|
677 |
+
in_allow_list:List[str]=list(),
|
678 |
+
in_deny_list:List[str]=list(),
|
679 |
max_fuzzy_spelling_mistakes_num:int=0,
|
680 |
pii_identification_method:str="Local",
|
681 |
+
chosen_redact_comprehend_entities:List[str]=list(),
|
682 |
comprehend_query_number:int=0,
|
683 |
+
comprehend_client:botocore.client.BaseClient="",
|
684 |
custom_entities:List[str]=custom_entities,
|
685 |
+
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
686 |
progress:Progress=Progress(track_tqdm=False)):
|
687 |
'''
|
688 |
Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
|
|
|
702 |
if in_allow_list:
|
703 |
in_allow_list_flat = in_allow_list
|
704 |
else:
|
705 |
+
in_allow_list_flat = list()
|
706 |
elif isinstance(in_allow_list, pd.DataFrame):
|
707 |
if not in_allow_list.empty:
|
708 |
in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
|
709 |
else:
|
710 |
+
in_allow_list_flat = list()
|
711 |
else:
|
712 |
+
in_allow_list_flat = list()
|
713 |
+
|
714 |
+
### Language check - check if selected language packs exist
|
715 |
+
try:
|
716 |
+
if language != "en":
|
717 |
+
progress(0.1, desc=f"Loading SpaCy model for {language}")
|
718 |
+
|
719 |
+
load_spacy_model(language)
|
720 |
+
|
721 |
+
except Exception as e:
|
722 |
+
print(f"Error downloading language packs for {language}: {e}")
|
723 |
+
raise Exception(f"Error downloading language packs for {language}: {e}")
|
724 |
+
|
725 |
+
# Try updating the supported languages for the spacy analyser
|
726 |
+
try:
|
727 |
+
nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
|
728 |
+
# Check list of nlp_analyser recognisers and languages
|
729 |
+
if language != "en":
|
730 |
+
gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
|
731 |
+
|
732 |
+
except Exception as e:
|
733 |
+
print(f"Error creating nlp_analyser for {language}: {e}")
|
734 |
+
raise Exception(f"Error creating nlp_analyser for {language}: {e}")
|
735 |
|
736 |
if isinstance(in_deny_list, pd.DataFrame):
|
737 |
if not in_deny_list.empty:
|
738 |
in_deny_list = in_deny_list.iloc[:, 0].tolist()
|
739 |
else:
|
740 |
# Handle the case where the DataFrame is empty
|
741 |
+
in_deny_list = list() # or some default value
|
742 |
|
743 |
# Sort the strings in order from the longest string to the shortest
|
744 |
in_deny_list = sorted(in_deny_list, key=len, reverse=True)
|
|
|
756 |
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
|
757 |
anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
|
758 |
batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
|
759 |
+
analyzer_results = list()
|
760 |
|
761 |
if pii_identification_method == "Local":
|
762 |
|
tools/example_cli_calls.txt
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python cli_redact.py --help
|
2 |
+
|
3 |
+
python cli_redact.py \
|
4 |
+
--input_file "documents/confidential-report.pdf" \
|
5 |
+
--output_dir "output/redacted_reports/" \
|
6 |
+
--ocr_method "Local OCR model - PDFs without selectable text" \
|
7 |
+
--pii_detector "Local" \
|
8 |
+
--page_min 2 \
|
9 |
+
--page_max 10 \
|
10 |
+
--allow_list "config/project_allowlist.csv"
|
11 |
+
|
12 |
+
python your_cli_script.py \
|
13 |
+
--input_file "data/customer_data.xlsx" \
|
14 |
+
--output_dir "output/anonymised_data/" \
|
15 |
+
--anon_strat "redact" \
|
16 |
+
--columns "Customer Name" "Email" \
|
17 |
+
--excel_sheets "Q3-Data"
|
18 |
+
|
19 |
+
python your_cli_script.py \
|
20 |
+
--input_file "legal_docs/legal_agreement.docx" \
|
21 |
+
--output_dir "output/anonymised_docs/" \
|
22 |
+
--anon_strat "encrypt" \
|
23 |
+
--deny_list "config/codenames.csv" \
|
24 |
+
--language "en"
|
tools/file_conversion.py
CHANGED
@@ -4,6 +4,7 @@ import os
|
|
4 |
import re
|
5 |
import time
|
6 |
import json
|
|
|
7 |
import numpy as np
|
8 |
import pymupdf
|
9 |
from pymupdf import Document, Page, Rect
|
@@ -71,7 +72,7 @@ def check_image_size_and_reduce(out_path:str, image:Image):
|
|
71 |
Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
|
72 |
'''
|
73 |
|
74 |
-
all_img_details =
|
75 |
page_num = 0
|
76 |
|
77 |
# Check file size and resize if necessary
|
@@ -133,6 +134,8 @@ def process_single_page_for_image_conversion(pdf_path:str, page_num:int, image_d
|
|
133 |
elif pdf_path.lower().endswith(".jpg") or pdf_path.lower().endswith(".png") or pdf_path.lower().endswith(".jpeg"):
|
134 |
image = Image.open(pdf_path)
|
135 |
image.save(out_path, format="PNG")
|
|
|
|
|
136 |
|
137 |
width, height = image.size
|
138 |
|
@@ -143,6 +146,7 @@ def process_single_page_for_image_conversion(pdf_path:str, page_num:int, image_d
|
|
143 |
return page_num, out_path, width, height
|
144 |
|
145 |
except Exception as e:
|
|
|
146 |
print(f"Error processing page {page_num + 1}: {e}")
|
147 |
return page_num, out_path_placeholder, pd.NA, pd.NA
|
148 |
else:
|
@@ -159,14 +163,14 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
|
|
159 |
else:
|
160 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
161 |
|
162 |
-
print(f"Number of pages in PDF: {page_count}")
|
163 |
|
164 |
# Set page max to length of pdf if not specified
|
165 |
if page_max == 0: page_max = page_count
|
166 |
|
167 |
-
results =
|
168 |
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
169 |
-
futures =
|
170 |
for page_num in range(page_min, page_max):
|
171 |
futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
|
172 |
|
@@ -218,10 +222,10 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
|
|
218 |
|
219 |
else:
|
220 |
print(f"{file_path} is not an image or PDF file.")
|
221 |
-
img_path =
|
222 |
-
image_sizes_width =
|
223 |
-
image_sizes_height =
|
224 |
-
all_img_details =
|
225 |
|
226 |
return img_path, image_sizes_width, image_sizes_height, all_img_details
|
227 |
|
@@ -230,7 +234,7 @@ def get_input_file_names(file_input:List[str]):
|
|
230 |
Get list of input files to report to logs.
|
231 |
'''
|
232 |
|
233 |
-
all_relevant_files =
|
234 |
file_name_with_extension = ""
|
235 |
full_file_name = ""
|
236 |
total_pdf_page_count = 0
|
@@ -254,7 +258,7 @@ def get_input_file_names(file_input:List[str]):
|
|
254 |
file_extension = os.path.splitext(file_path)[1].lower()
|
255 |
|
256 |
# Check if the file is in acceptable types
|
257 |
-
if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext) & ("ocr_output" not in file_path_without_ext):
|
258 |
all_relevant_files.append(file_path_without_ext)
|
259 |
file_name_with_extension = file_path_without_ext + file_extension
|
260 |
full_file_name = file_path
|
@@ -415,8 +419,8 @@ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, cu
|
|
415 |
return whole_page_img_annotation_box
|
416 |
|
417 |
def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
|
418 |
-
page_sizes =
|
419 |
-
original_cropboxes =
|
420 |
|
421 |
for page_no, page in enumerate(pymupdf_doc):
|
422 |
reported_page_no = page_no + 1
|
@@ -439,9 +443,6 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
439 |
out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
|
440 |
|
441 |
# cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
|
442 |
-
# MediaBox top y = mediabox.y1
|
443 |
-
# CropBox top y = cropbox.y1
|
444 |
-
# The difference is mediabox.y1 - cropbox.y1
|
445 |
out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
|
446 |
|
447 |
if image_sizes_width and image_sizes_height:
|
@@ -452,23 +453,57 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
452 |
|
453 |
return page_sizes, original_cropboxes
|
454 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
455 |
def prepare_image_or_pdf(
|
456 |
file_paths: List[str],
|
457 |
text_extract_method: str,
|
458 |
all_line_level_ocr_results_df:pd.DataFrame,
|
|
|
459 |
latest_file_completed: int = 0,
|
460 |
-
out_message: List[str] =
|
461 |
first_loop_state: bool = False,
|
462 |
number_of_pages:int = 0,
|
463 |
-
all_annotations_object:List =
|
464 |
prepare_for_review:bool = False,
|
465 |
-
in_fully_redacted_list:List[int]=
|
466 |
output_folder:str=OUTPUT_FOLDER,
|
467 |
input_folder:str=INPUT_FOLDER,
|
468 |
prepare_images:bool=True,
|
469 |
-
page_sizes:list[dict]=
|
|
|
470 |
textract_output_found:bool = False,
|
471 |
-
relevant_ocr_output_with_words_found:bool = False,
|
472 |
progress: Progress = Progress(track_tqdm=True)
|
473 |
) -> tuple[List[str], List[str]]:
|
474 |
"""
|
@@ -490,6 +525,7 @@ def prepare_image_or_pdf(
|
|
490 |
output_folder (optional, str): The output folder for file save
|
491 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
492 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
|
|
493 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
494 |
relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
|
495 |
progress (optional, Progress): Progress tracker for the operation
|
@@ -501,11 +537,11 @@ def prepare_image_or_pdf(
|
|
501 |
|
502 |
tic = time.perf_counter()
|
503 |
json_from_csv = False
|
504 |
-
original_cropboxes =
|
505 |
-
converted_file_paths =
|
506 |
-
image_file_paths =
|
507 |
-
pymupdf_doc =
|
508 |
-
all_img_details =
|
509 |
review_file_csv = pd.DataFrame()
|
510 |
out_textract_path = ""
|
511 |
combined_out_message = ""
|
@@ -518,15 +554,15 @@ def prepare_image_or_pdf(
|
|
518 |
# If this is the first time around, set variables to 0/blank
|
519 |
if first_loop_state==True:
|
520 |
latest_file_completed = 0
|
521 |
-
out_message =
|
522 |
-
all_annotations_object =
|
523 |
else:
|
524 |
print("Now redacting file", str(latest_file_completed))
|
525 |
|
526 |
# If combined out message or converted_file_paths are blank, change to a list so it can be appended to
|
527 |
if isinstance(out_message, str): out_message = [out_message]
|
528 |
|
529 |
-
if not file_paths: file_paths =
|
530 |
|
531 |
if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
|
532 |
|
@@ -542,7 +578,8 @@ def prepare_image_or_pdf(
|
|
542 |
final_out_message = '\n'.join(out_message)
|
543 |
else:
|
544 |
final_out_message = out_message
|
545 |
-
|
|
|
546 |
|
547 |
progress(0.1, desc='Preparing file')
|
548 |
|
@@ -555,8 +592,8 @@ def prepare_image_or_pdf(
|
|
555 |
|
556 |
# Loop through files to load in
|
557 |
for file in file_paths_loop:
|
558 |
-
converted_file_path =
|
559 |
-
image_file_path =
|
560 |
|
561 |
if isinstance(file, str):
|
562 |
file_path = file
|
@@ -565,15 +602,18 @@ def prepare_image_or_pdf(
|
|
565 |
file_path_without_ext = get_file_name_without_type(file_path)
|
566 |
file_name_with_ext = os.path.basename(file_path)
|
567 |
|
|
|
|
|
568 |
if not file_path:
|
569 |
-
out_message = "Please select
|
570 |
print(out_message)
|
571 |
-
raise
|
572 |
|
573 |
file_extension = os.path.splitext(file_path)[1].lower()
|
574 |
|
575 |
# If a pdf, load as a pymupdf document
|
576 |
if is_pdf(file_path):
|
|
|
577 |
pymupdf_doc = pymupdf.open(file_path)
|
578 |
pymupdf_pages = pymupdf_doc.page_count
|
579 |
|
@@ -588,16 +628,17 @@ def prepare_image_or_pdf(
|
|
588 |
|
589 |
#Create base version of the annotation object that doesn't have any annotations in it
|
590 |
if (not all_annotations_object) & (prepare_for_review == True):
|
591 |
-
all_annotations_object =
|
592 |
|
593 |
for image_path in image_file_paths:
|
594 |
annotation = {}
|
595 |
annotation["image"] = image_path
|
596 |
-
annotation["boxes"] =
|
597 |
|
598 |
all_annotations_object.append(annotation)
|
599 |
|
600 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
|
|
601 |
# Check if the file is an image type and the user selected text ocr option
|
602 |
if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
603 |
text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
|
@@ -622,18 +663,25 @@ def prepare_image_or_pdf(
|
|
622 |
|
623 |
pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
|
624 |
|
|
|
625 |
elif file_extension in ['.csv']:
|
626 |
if '_review_file' in file_path_without_ext:
|
627 |
review_file_csv = read_file(file_path)
|
628 |
all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
|
629 |
-
json_from_csv = True
|
630 |
-
#print("Converted CSV review file to image annotation object")
|
631 |
elif '_ocr_output' in file_path_without_ext:
|
632 |
-
all_line_level_ocr_results_df = read_file(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
633 |
json_from_csv = False
|
634 |
|
635 |
# NEW IF STATEMENT
|
636 |
-
# If the file name ends with .json, check if we are loading for review. If yes, assume it is an
|
637 |
|
638 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
639 |
|
@@ -661,7 +709,7 @@ def prepare_image_or_pdf(
|
|
661 |
continue
|
662 |
|
663 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
664 |
-
print("Saving local OCR output")
|
665 |
# Copy it to the output folder so it can be used later.
|
666 |
output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
667 |
# if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
@@ -672,6 +720,15 @@ def prepare_image_or_pdf(
|
|
672 |
# Use shutil to copy the file directly
|
673 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
674 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
675 |
if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
|
676 |
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
|
677 |
if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
|
@@ -742,7 +799,7 @@ def prepare_image_or_pdf(
|
|
742 |
continue
|
743 |
|
744 |
# If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
|
745 |
-
|
746 |
|
747 |
# Assume it's a Textract response object. Copy it to the output folder so it can be used later.
|
748 |
out_folder = os.path.join(output_folder, file_path_without_ext + "_textract.json")
|
@@ -766,37 +823,25 @@ def prepare_image_or_pdf(
|
|
766 |
else:
|
767 |
print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
|
768 |
|
769 |
-
elif file_extension in ['.csv'] and "ocr_output" in file_path:
|
770 |
-
continue
|
771 |
-
|
772 |
-
# Must be something else, return with error message
|
773 |
-
else:
|
774 |
-
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
775 |
-
if is_pdf_or_image(file_path) == False:
|
776 |
-
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
777 |
-
print(out_message)
|
778 |
-
raise Exception(out_message)
|
779 |
-
|
780 |
-
elif text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
781 |
-
if is_pdf(file_path) == False:
|
782 |
-
out_message = "Please upload a PDF file for text analysis."
|
783 |
-
print(out_message)
|
784 |
-
raise Exception(out_message)
|
785 |
-
|
786 |
converted_file_paths.append(converted_file_path)
|
787 |
image_file_paths.extend(image_file_path)
|
788 |
|
789 |
toc = time.perf_counter()
|
790 |
-
out_time = f"File '{
|
791 |
|
792 |
print(out_time)
|
793 |
|
794 |
out_message.append(out_time)
|
795 |
combined_out_message = '\n'.join(out_message)
|
796 |
|
797 |
-
|
|
|
|
|
|
|
|
|
|
|
798 |
|
799 |
-
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found
|
800 |
|
801 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
802 |
"""
|
@@ -834,6 +879,8 @@ def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_fil
|
|
834 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
|
835 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
836 |
|
|
|
|
|
837 |
out_file_paths = out_text_file_path
|
838 |
|
839 |
# Convert annotated text pdf back to image to give genuine redactions
|
@@ -896,7 +943,7 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
|
|
896 |
image_groups[item['image']].append(item)
|
897 |
|
898 |
# Process each group to prioritize items with non-empty boxes
|
899 |
-
result =
|
900 |
for image, items in image_groups.items():
|
901 |
# Filter items with non-empty boxes
|
902 |
non_empty_boxes = [item for item in items if item.get('boxes')]
|
@@ -1035,7 +1082,6 @@ def divide_coordinates_by_page_sizes(
|
|
1035 |
else:
|
1036 |
print("Skipping coordinate division due to missing or non-numeric dimension columns.")
|
1037 |
|
1038 |
-
|
1039 |
# --- Combine Relative and Processed Absolute DataFrames ---
|
1040 |
dfs_to_concat = [df for df in [df_rel, df_abs] if not df.empty]
|
1041 |
|
@@ -1046,7 +1092,6 @@ def divide_coordinates_by_page_sizes(
|
|
1046 |
print("Warning: Both relative and absolute splits resulted in empty DataFrames.")
|
1047 |
final_df = pd.DataFrame(columns=review_file_df.columns)
|
1048 |
|
1049 |
-
|
1050 |
# --- Final Sort ---
|
1051 |
required_sort_columns = {"page", xmin, ymin}
|
1052 |
if not final_df.empty and required_sort_columns.issubset(final_df.columns):
|
@@ -1428,7 +1473,7 @@ def create_annotation_dicts_from_annotation_df(
|
|
1428 |
def convert_annotation_json_to_review_df(
|
1429 |
all_annotations: List[dict],
|
1430 |
redaction_decision_output: pd.DataFrame = pd.DataFrame(),
|
1431 |
-
page_sizes: List[dict] =
|
1432 |
do_proximity_match: bool = True
|
1433 |
) -> pd.DataFrame:
|
1434 |
'''
|
@@ -1615,7 +1660,7 @@ def convert_annotation_json_to_review_df(
|
|
1615 |
if 'color' in review_file_df.columns:
|
1616 |
# Check if the column actually contains lists before applying lambda
|
1617 |
if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
|
1618 |
-
review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
1619 |
|
1620 |
# Sort the results
|
1621 |
# Ensure sort columns exist before sorting
|
@@ -1642,6 +1687,86 @@ def convert_annotation_json_to_review_df(
|
|
1642 |
|
1643 |
return review_file_df
|
1644 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1645 |
def fill_missing_box_ids(data_input: dict) -> dict:
|
1646 |
"""
|
1647 |
Generates unique alphanumeric IDs for bounding boxes in an input dictionary
|
@@ -1873,7 +1998,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
|
|
1873 |
# --- Generate Unique IDs ---
|
1874 |
character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
|
1875 |
generated_ids_set = set() # Keep track of IDs generated *in this run*
|
1876 |
-
new_ids_list =
|
1877 |
|
1878 |
max_possible_ids = len(character_set) ** length
|
1879 |
if num_needed > max_possible_ids:
|
@@ -2080,14 +2205,14 @@ def convert_review_df_to_annotation_json(
|
|
2080 |
|
2081 |
|
2082 |
# --- Build JSON Structure ---
|
2083 |
-
json_data =
|
2084 |
output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
|
2085 |
|
2086 |
# Iterate through page_sizes_df to define the structure (one entry per image path)
|
2087 |
for _, row in page_sizes_df.iterrows():
|
2088 |
page_num = row['page'] # Already Int64
|
2089 |
pdf_image_path = row['image_path']
|
2090 |
-
annotation_boxes =
|
2091 |
|
2092 |
# Check if the page exists in the grouped annotations (using the faster set lookup)
|
2093 |
# Check pd.notna because page_num could be <NA> if conversion failed
|
@@ -2106,7 +2231,7 @@ def convert_review_df_to_annotation_json(
|
|
2106 |
|
2107 |
except KeyError:
|
2108 |
print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
|
2109 |
-
annotation_boxes =
|
2110 |
|
2111 |
# Append the structured data for this image/page
|
2112 |
json_data.append({
|
|
|
4 |
import re
|
5 |
import time
|
6 |
import json
|
7 |
+
import gradio as gr
|
8 |
import numpy as np
|
9 |
import pymupdf
|
10 |
from pymupdf import Document, Page, Rect
|
|
|
72 |
Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
|
73 |
'''
|
74 |
|
75 |
+
all_img_details = list()
|
76 |
page_num = 0
|
77 |
|
78 |
# Check file size and resize if necessary
|
|
|
134 |
elif pdf_path.lower().endswith(".jpg") or pdf_path.lower().endswith(".png") or pdf_path.lower().endswith(".jpeg"):
|
135 |
image = Image.open(pdf_path)
|
136 |
image.save(out_path, format="PNG")
|
137 |
+
else:
|
138 |
+
raise Warning("Could not create image.")
|
139 |
|
140 |
width, height = image.size
|
141 |
|
|
|
146 |
return page_num, out_path, width, height
|
147 |
|
148 |
except Exception as e:
|
149 |
+
|
150 |
print(f"Error processing page {page_num + 1}: {e}")
|
151 |
return page_num, out_path_placeholder, pd.NA, pd.NA
|
152 |
else:
|
|
|
163 |
else:
|
164 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
165 |
|
166 |
+
print(f"Creating images. Number of pages in PDF: {page_count}")
|
167 |
|
168 |
# Set page max to length of pdf if not specified
|
169 |
if page_max == 0: page_max = page_count
|
170 |
|
171 |
+
results = list()
|
172 |
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
173 |
+
futures = list()
|
174 |
for page_num in range(page_min, page_max):
|
175 |
futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
|
176 |
|
|
|
222 |
|
223 |
else:
|
224 |
print(f"{file_path} is not an image or PDF file.")
|
225 |
+
img_path = list()
|
226 |
+
image_sizes_width = list()
|
227 |
+
image_sizes_height = list()
|
228 |
+
all_img_details = list()
|
229 |
|
230 |
return img_path, image_sizes_width, image_sizes_height, all_img_details
|
231 |
|
|
|
234 |
Get list of input files to report to logs.
|
235 |
'''
|
236 |
|
237 |
+
all_relevant_files = list()
|
238 |
file_name_with_extension = ""
|
239 |
full_file_name = ""
|
240 |
total_pdf_page_count = 0
|
|
|
258 |
file_extension = os.path.splitext(file_path)[1].lower()
|
259 |
|
260 |
# Check if the file is in acceptable types
|
261 |
+
if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet', '.docx']) & ("review_file" not in file_path_without_ext) & ("ocr_output" not in file_path_without_ext) & ("ocr_results_with_words" not in file_path_without_ext):
|
262 |
all_relevant_files.append(file_path_without_ext)
|
263 |
file_name_with_extension = file_path_without_ext + file_extension
|
264 |
full_file_name = file_path
|
|
|
419 |
return whole_page_img_annotation_box
|
420 |
|
421 |
def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
|
422 |
+
page_sizes = list()
|
423 |
+
original_cropboxes = list()
|
424 |
|
425 |
for page_no, page in enumerate(pymupdf_doc):
|
426 |
reported_page_no = page_no + 1
|
|
|
443 |
out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
|
444 |
|
445 |
# cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
|
|
|
|
|
|
|
446 |
out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
|
447 |
|
448 |
if image_sizes_width and image_sizes_height:
|
|
|
453 |
|
454 |
return page_sizes, original_cropboxes
|
455 |
|
456 |
+
def word_level_ocr_output_to_dataframe(ocr_results: dict) -> pd.DataFrame:
|
457 |
+
'''
|
458 |
+
Convert a json of ocr results to a dataframe
|
459 |
+
'''
|
460 |
+
rows = list()
|
461 |
+
ocr_result_page = ocr_results[0]
|
462 |
+
|
463 |
+
for ocr_result in ocr_results:
|
464 |
+
|
465 |
+
page_number = int(ocr_result['page'])
|
466 |
+
|
467 |
+
for line_key, line_data in ocr_result['results'].items():
|
468 |
+
|
469 |
+
line_number = int(line_data['line'])
|
470 |
+
for word in line_data['words']:
|
471 |
+
rows.append({
|
472 |
+
'page': page_number,
|
473 |
+
'line': line_number,
|
474 |
+
'word_text': word['text'],
|
475 |
+
'word_x0': word['bounding_box'][0],
|
476 |
+
'word_y0': word['bounding_box'][1],
|
477 |
+
'word_x1': word['bounding_box'][2],
|
478 |
+
'word_y1': word['bounding_box'][3],
|
479 |
+
'line_text': "", #line_data['text'], # This data is too large to include
|
480 |
+
'line_x0': line_data['bounding_box'][0],
|
481 |
+
'line_y0': line_data['bounding_box'][1],
|
482 |
+
'line_x1': line_data['bounding_box'][2],
|
483 |
+
'line_y1': line_data['bounding_box'][3],
|
484 |
+
})
|
485 |
+
|
486 |
+
return pd.DataFrame(rows)
|
487 |
+
|
488 |
def prepare_image_or_pdf(
|
489 |
file_paths: List[str],
|
490 |
text_extract_method: str,
|
491 |
all_line_level_ocr_results_df:pd.DataFrame,
|
492 |
+
all_page_line_level_ocr_results_with_words_df:pd.DataFrame,
|
493 |
latest_file_completed: int = 0,
|
494 |
+
out_message: List[str] = list(),
|
495 |
first_loop_state: bool = False,
|
496 |
number_of_pages:int = 0,
|
497 |
+
all_annotations_object:List = list(),
|
498 |
prepare_for_review:bool = False,
|
499 |
+
in_fully_redacted_list:List[int]=list(),
|
500 |
output_folder:str=OUTPUT_FOLDER,
|
501 |
input_folder:str=INPUT_FOLDER,
|
502 |
prepare_images:bool=True,
|
503 |
+
page_sizes:list[dict]=list(),
|
504 |
+
pymupdf_doc:Document = list(),
|
505 |
textract_output_found:bool = False,
|
506 |
+
relevant_ocr_output_with_words_found:bool = False,
|
507 |
progress: Progress = Progress(track_tqdm=True)
|
508 |
) -> tuple[List[str], List[str]]:
|
509 |
"""
|
|
|
525 |
output_folder (optional, str): The output folder for file save
|
526 |
prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
|
527 |
page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
|
528 |
+
pymupdf_doc(optional, Document): A pymupdf document object that indicates the existing PDF document object.
|
529 |
textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
|
530 |
relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
|
531 |
progress (optional, Progress): Progress tracker for the operation
|
|
|
537 |
|
538 |
tic = time.perf_counter()
|
539 |
json_from_csv = False
|
540 |
+
original_cropboxes = list() # Store original CropBox values
|
541 |
+
converted_file_paths = list()
|
542 |
+
image_file_paths = list()
|
543 |
+
# pymupdf_doc = list()
|
544 |
+
all_img_details = list()
|
545 |
review_file_csv = pd.DataFrame()
|
546 |
out_textract_path = ""
|
547 |
combined_out_message = ""
|
|
|
554 |
# If this is the first time around, set variables to 0/blank
|
555 |
if first_loop_state==True:
|
556 |
latest_file_completed = 0
|
557 |
+
out_message = list()
|
558 |
+
all_annotations_object = list()
|
559 |
else:
|
560 |
print("Now redacting file", str(latest_file_completed))
|
561 |
|
562 |
# If combined out message or converted_file_paths are blank, change to a list so it can be appended to
|
563 |
if isinstance(out_message, str): out_message = [out_message]
|
564 |
|
565 |
+
if not file_paths: file_paths = list()
|
566 |
|
567 |
if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
|
568 |
|
|
|
578 |
final_out_message = '\n'.join(out_message)
|
579 |
else:
|
580 |
final_out_message = out_message
|
581 |
+
|
582 |
+
return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found, all_page_line_level_ocr_results_with_words_df
|
583 |
|
584 |
progress(0.1, desc='Preparing file')
|
585 |
|
|
|
592 |
|
593 |
# Loop through files to load in
|
594 |
for file in file_paths_loop:
|
595 |
+
converted_file_path = list()
|
596 |
+
image_file_path = list()
|
597 |
|
598 |
if isinstance(file, str):
|
599 |
file_path = file
|
|
|
602 |
file_path_without_ext = get_file_name_without_type(file_path)
|
603 |
file_name_with_ext = os.path.basename(file_path)
|
604 |
|
605 |
+
print("Loading file:", file_name_with_ext)
|
606 |
+
|
607 |
if not file_path:
|
608 |
+
out_message = "Please select at least one file."
|
609 |
print(out_message)
|
610 |
+
raise Warning(out_message)
|
611 |
|
612 |
file_extension = os.path.splitext(file_path)[1].lower()
|
613 |
|
614 |
# If a pdf, load as a pymupdf document
|
615 |
if is_pdf(file_path):
|
616 |
+
print(f"File {file_name_with_ext} is a PDF")
|
617 |
pymupdf_doc = pymupdf.open(file_path)
|
618 |
pymupdf_pages = pymupdf_doc.page_count
|
619 |
|
|
|
628 |
|
629 |
#Create base version of the annotation object that doesn't have any annotations in it
|
630 |
if (not all_annotations_object) & (prepare_for_review == True):
|
631 |
+
all_annotations_object = list()
|
632 |
|
633 |
for image_path in image_file_paths:
|
634 |
annotation = {}
|
635 |
annotation["image"] = image_path
|
636 |
+
annotation["boxes"] = list()
|
637 |
|
638 |
all_annotations_object.append(annotation)
|
639 |
|
640 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
641 |
+
print(f"File {file_name_with_ext} is an image")
|
642 |
# Check if the file is an image type and the user selected text ocr option
|
643 |
if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
644 |
text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
|
|
|
663 |
|
664 |
pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
|
665 |
|
666 |
+
# Loading in review files, ocr_outputs, or ocr_outputs_with_words
|
667 |
elif file_extension in ['.csv']:
|
668 |
if '_review_file' in file_path_without_ext:
|
669 |
review_file_csv = read_file(file_path)
|
670 |
all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
|
671 |
+
json_from_csv = True
|
|
|
672 |
elif '_ocr_output' in file_path_without_ext:
|
673 |
+
all_line_level_ocr_results_df = read_file(file_path)
|
674 |
+
|
675 |
+
if "line" not in all_line_level_ocr_results_df.columns:
|
676 |
+
all_line_level_ocr_results_df["line"] = ""
|
677 |
+
|
678 |
+
json_from_csv = False
|
679 |
+
elif '_ocr_results_with_words' in file_path_without_ext:
|
680 |
+
all_page_line_level_ocr_results_with_words_df = read_file(file_path)
|
681 |
json_from_csv = False
|
682 |
|
683 |
# NEW IF STATEMENT
|
684 |
+
# If the file name ends with .json, check if we are loading for review. If yes, assume it is an annotations object, overwrite the current annotations object. If false, assume this is a Textract object, load in to Textract
|
685 |
|
686 |
if (file_extension in ['.json']) | (json_from_csv == True):
|
687 |
|
|
|
709 |
continue
|
710 |
|
711 |
elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
|
712 |
+
print("Saving local OCR output with words")
|
713 |
# Copy it to the output folder so it can be used later.
|
714 |
output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
|
715 |
# if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
|
|
|
720 |
# Use shutil to copy the file directly
|
721 |
shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
|
722 |
|
723 |
+
if prepare_for_review == True:
|
724 |
+
print("Converting local OCR output with words to csv")
|
725 |
+
page_sizes_df = pd.DataFrame(page_sizes)
|
726 |
+
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(out_ocr_results_with_words_path, log_files_output_paths, page_sizes_df)
|
727 |
+
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
728 |
+
|
729 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
730 |
+
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
731 |
+
|
732 |
if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
|
733 |
if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
|
734 |
if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
|
|
|
799 |
continue
|
800 |
|
801 |
# If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
|
802 |
+
if file_extension in ['.zip']:
|
803 |
|
804 |
# Assume it's a Textract response object. Copy it to the output folder so it can be used later.
|
805 |
out_folder = os.path.join(output_folder, file_path_without_ext + "_textract.json")
|
|
|
823 |
else:
|
824 |
print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
|
825 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
826 |
converted_file_paths.append(converted_file_path)
|
827 |
image_file_paths.extend(image_file_path)
|
828 |
|
829 |
toc = time.perf_counter()
|
830 |
+
out_time = f"File '{file_name_with_ext}' prepared in {toc - tic:0.1f} seconds."
|
831 |
|
832 |
print(out_time)
|
833 |
|
834 |
out_message.append(out_time)
|
835 |
combined_out_message = '\n'.join(out_message)
|
836 |
|
837 |
+
if not page_sizes:
|
838 |
+
number_of_pages = 1
|
839 |
+
else:
|
840 |
+
number_of_pages = len(page_sizes)
|
841 |
+
|
842 |
+
print("Finished loading in files")
|
843 |
|
844 |
+
return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found, all_page_line_level_ocr_results_with_words_df
|
845 |
|
846 |
def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
|
847 |
"""
|
|
|
879 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
|
880 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
881 |
|
882 |
+
print("In convert_text_pdf_to_img_pdf function, file_path_without_ext:", file_path_without_ext)
|
883 |
+
|
884 |
out_file_paths = out_text_file_path
|
885 |
|
886 |
# Convert annotated text pdf back to image to give genuine redactions
|
|
|
943 |
image_groups[item['image']].append(item)
|
944 |
|
945 |
# Process each group to prioritize items with non-empty boxes
|
946 |
+
result = list()
|
947 |
for image, items in image_groups.items():
|
948 |
# Filter items with non-empty boxes
|
949 |
non_empty_boxes = [item for item in items if item.get('boxes')]
|
|
|
1082 |
else:
|
1083 |
print("Skipping coordinate division due to missing or non-numeric dimension columns.")
|
1084 |
|
|
|
1085 |
# --- Combine Relative and Processed Absolute DataFrames ---
|
1086 |
dfs_to_concat = [df for df in [df_rel, df_abs] if not df.empty]
|
1087 |
|
|
|
1092 |
print("Warning: Both relative and absolute splits resulted in empty DataFrames.")
|
1093 |
final_df = pd.DataFrame(columns=review_file_df.columns)
|
1094 |
|
|
|
1095 |
# --- Final Sort ---
|
1096 |
required_sort_columns = {"page", xmin, ymin}
|
1097 |
if not final_df.empty and required_sort_columns.issubset(final_df.columns):
|
|
|
1473 |
def convert_annotation_json_to_review_df(
|
1474 |
all_annotations: List[dict],
|
1475 |
redaction_decision_output: pd.DataFrame = pd.DataFrame(),
|
1476 |
+
page_sizes: List[dict] = list(),
|
1477 |
do_proximity_match: bool = True
|
1478 |
) -> pd.DataFrame:
|
1479 |
'''
|
|
|
1660 |
if 'color' in review_file_df.columns:
|
1661 |
# Check if the column actually contains lists before applying lambda
|
1662 |
if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
|
1663 |
+
review_file_df.loc[:, "color"] = review_file_df.loc[:, "color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
1664 |
|
1665 |
# Sort the results
|
1666 |
# Ensure sort columns exist before sorting
|
|
|
1687 |
|
1688 |
return review_file_df
|
1689 |
|
1690 |
+
def fill_missing_ids_in_list(data_list: list) -> list:
|
1691 |
+
"""
|
1692 |
+
Generates unique alphanumeric IDs for dictionaries in a list where the 'id' is
|
1693 |
+
missing, blank, or not a 12-character string.
|
1694 |
+
|
1695 |
+
Args:
|
1696 |
+
data_list (list): A list of dictionaries, each potentially with an 'id' key.
|
1697 |
+
|
1698 |
+
Returns:
|
1699 |
+
list: The input list with missing/invalid IDs filled.
|
1700 |
+
Note: The function modifies the input list in place.
|
1701 |
+
"""
|
1702 |
+
|
1703 |
+
# --- Input Validation ---
|
1704 |
+
if not isinstance(data_list, list):
|
1705 |
+
raise TypeError("Input 'data_list' must be a list.")
|
1706 |
+
|
1707 |
+
if not data_list:
|
1708 |
+
return data_list # Return empty list as-is
|
1709 |
+
|
1710 |
+
id_length = 12
|
1711 |
+
character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
|
1712 |
+
|
1713 |
+
# --- Get Existing IDs to Ensure Uniqueness ---
|
1714 |
+
# Collect all valid existing IDs first
|
1715 |
+
existing_ids = set()
|
1716 |
+
for item in data_list:
|
1717 |
+
if not isinstance(item, dict):
|
1718 |
+
continue # Skip non-dictionary items
|
1719 |
+
item_id = item.get('id')
|
1720 |
+
if isinstance(item_id, str) and len(item_id) == id_length:
|
1721 |
+
existing_ids.add(item_id)
|
1722 |
+
|
1723 |
+
# --- Identify and Fill Items Needing IDs ---
|
1724 |
+
generated_ids_set = set() # Keep track of IDs generated *in this run*
|
1725 |
+
num_filled = 0
|
1726 |
+
|
1727 |
+
for item in data_list:
|
1728 |
+
if not isinstance(item, dict):
|
1729 |
+
continue # Skip non-dictionary items
|
1730 |
+
|
1731 |
+
item_id = item.get('id')
|
1732 |
+
|
1733 |
+
# Check if ID needs to be generated
|
1734 |
+
# Needs ID if: key is missing, value is None, value is not a string,
|
1735 |
+
# value is an empty string after stripping whitespace, or value is a string
|
1736 |
+
# but not of the correct length.
|
1737 |
+
needs_new_id = (
|
1738 |
+
item_id is None or
|
1739 |
+
not isinstance(item_id, str) or
|
1740 |
+
item_id.strip() == "" or
|
1741 |
+
len(item_id) != id_length
|
1742 |
+
)
|
1743 |
+
|
1744 |
+
if needs_new_id:
|
1745 |
+
# Generate a unique ID
|
1746 |
+
attempts = 0
|
1747 |
+
while True:
|
1748 |
+
candidate_id = ''.join(random.choices(character_set, k=id_length))
|
1749 |
+
# Check against *all* existing valid IDs and *newly* generated ones in this run
|
1750 |
+
if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
|
1751 |
+
generated_ids_set.add(candidate_id)
|
1752 |
+
item['id'] = candidate_id # Assign the new ID directly to the item dict
|
1753 |
+
num_filled += 1
|
1754 |
+
break # Found a unique ID
|
1755 |
+
attempts += 1
|
1756 |
+
# Safety break for unlikely infinite loop (though highly improbable with 12 chars)
|
1757 |
+
if attempts > len(data_list) * 100 + 1000:
|
1758 |
+
raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs.")
|
1759 |
+
|
1760 |
+
if num_filled > 0:
|
1761 |
+
pass
|
1762 |
+
#print(f"Successfully filled {num_filled} missing or invalid IDs.")
|
1763 |
+
else:
|
1764 |
+
pass
|
1765 |
+
#print("No missing or invalid IDs found.")
|
1766 |
+
|
1767 |
+
# The input list 'data_list' has been modified in place
|
1768 |
+
return data_list
|
1769 |
+
|
1770 |
def fill_missing_box_ids(data_input: dict) -> dict:
|
1771 |
"""
|
1772 |
Generates unique alphanumeric IDs for bounding boxes in an input dictionary
|
|
|
1998 |
# --- Generate Unique IDs ---
|
1999 |
character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
|
2000 |
generated_ids_set = set() # Keep track of IDs generated *in this run*
|
2001 |
+
new_ids_list = list() # Store the generated IDs in order
|
2002 |
|
2003 |
max_possible_ids = len(character_set) ** length
|
2004 |
if num_needed > max_possible_ids:
|
|
|
2205 |
|
2206 |
|
2207 |
# --- Build JSON Structure ---
|
2208 |
+
json_data = list()
|
2209 |
output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
|
2210 |
|
2211 |
# Iterate through page_sizes_df to define the structure (one entry per image path)
|
2212 |
for _, row in page_sizes_df.iterrows():
|
2213 |
page_num = row['page'] # Already Int64
|
2214 |
pdf_image_path = row['image_path']
|
2215 |
+
annotation_boxes = list() # Default to empty list
|
2216 |
|
2217 |
# Check if the page exists in the grouped annotations (using the faster set lookup)
|
2218 |
# Check pd.notna because page_num could be <NA> if conversion failed
|
|
|
2231 |
|
2232 |
except KeyError:
|
2233 |
print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
|
2234 |
+
annotation_boxes = list() # Keep empty
|
2235 |
|
2236 |
# Append the structured data for this image/page
|
2237 |
json_data.append({
|
tools/file_redaction.py
CHANGED
@@ -15,14 +15,15 @@ from pdfminer.high_level import extract_pages
|
|
15 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
|
16 |
from pikepdf import Pdf, Dictionary, Name
|
17 |
from pymupdf import Rect, Page, Document
|
|
|
18 |
import gradio as gr
|
19 |
from gradio import Progress
|
20 |
from collections import defaultdict # For efficient grouping
|
21 |
|
22 |
-
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
|
23 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
|
24 |
-
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
|
25 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
26 |
from tools.helper_functions import get_file_name_without_type, clean_unicode_text
|
27 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
28 |
|
@@ -59,7 +60,15 @@ def sum_numbers_before_seconds(string:str):
|
|
59 |
|
60 |
return sum_of_numbers
|
61 |
|
62 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
merged = {}
|
64 |
|
65 |
for item in data:
|
@@ -76,79 +85,55 @@ def merge_page_results(data):
|
|
76 |
|
77 |
return list(merged.values())
|
78 |
|
79 |
-
def word_level_ocr_output_to_dataframe(ocr_result: dict) -> pd.DataFrame:
|
80 |
-
rows = []
|
81 |
-
ocr_result = ocr_result[0]
|
82 |
-
|
83 |
-
page_number = int(ocr_result['page'])
|
84 |
-
|
85 |
-
for line_key, line_data in ocr_result['results'].items():
|
86 |
-
line_number = int(line_data['line'])
|
87 |
-
for word in line_data['words']:
|
88 |
-
rows.append({
|
89 |
-
'page': page_number,
|
90 |
-
'line': line_number,
|
91 |
-
'word_text': word['text'],
|
92 |
-
'word_x0': word['bounding_box'][0],
|
93 |
-
'word_y0': word['bounding_box'][1],
|
94 |
-
'word_x1': word['bounding_box'][2],
|
95 |
-
'word_y1': word['bounding_box'][3],
|
96 |
-
'line_text': line_data['text'],
|
97 |
-
'line_x0': line_data['bounding_box'][0],
|
98 |
-
'line_y0': line_data['bounding_box'][1],
|
99 |
-
'line_x1': line_data['bounding_box'][2],
|
100 |
-
'line_y1': line_data['bounding_box'][3],
|
101 |
-
})
|
102 |
-
|
103 |
-
return pd.DataFrame(rows)
|
104 |
-
|
105 |
def choose_and_run_redactor(file_paths:List[str],
|
106 |
prepared_pdf_file_paths:List[str],
|
107 |
-
pdf_image_file_paths:List[str],
|
108 |
-
language:str,
|
109 |
chosen_redact_entities:List[str],
|
110 |
chosen_redact_comprehend_entities:List[str],
|
111 |
text_extraction_method:str,
|
112 |
-
in_allow_list:List[List[str]]=
|
113 |
-
custom_recogniser_word_list:List[str]=
|
114 |
-
redact_whole_page_list:List[str]=
|
115 |
latest_file_completed:int=0,
|
116 |
-
combined_out_message:List=
|
117 |
-
out_file_paths:List=
|
118 |
-
log_files_output_paths:List=
|
119 |
first_loop_state:bool=False,
|
120 |
page_min:int=0,
|
121 |
page_max:int=999,
|
122 |
estimated_time_taken_state:float=0.0,
|
123 |
-
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
124 |
all_request_metadata_str:str = "",
|
125 |
-
annotations_all_pages:List[dict]=
|
126 |
-
|
127 |
-
all_pages_decision_process_table:pd.DataFrame=
|
128 |
-
pymupdf_doc=
|
129 |
current_loop_page:int=0,
|
130 |
page_break_return:bool=False,
|
131 |
-
pii_identification_method:str="Local",
|
132 |
comprehend_query_number:int=0,
|
133 |
max_fuzzy_spelling_mistakes_num:int=1,
|
134 |
match_fuzzy_whole_phrase_bool:bool=True,
|
135 |
aws_access_key_textbox:str='',
|
136 |
aws_secret_key_textbox:str='',
|
137 |
annotate_max_pages:int=1,
|
138 |
-
review_file_state:pd.DataFrame=
|
139 |
output_folder:str=OUTPUT_FOLDER,
|
140 |
-
document_cropboxes:List=
|
141 |
-
page_sizes:List[dict]=
|
142 |
textract_output_found:bool=False,
|
143 |
text_extraction_only:bool=False,
|
144 |
-
duplication_file_path_outputs:list=
|
145 |
review_file_path:str="",
|
146 |
input_folder:str=INPUT_FOLDER,
|
147 |
total_textract_query_number:int=0,
|
148 |
ocr_file_path:str="",
|
149 |
-
all_page_line_level_ocr_results =
|
150 |
-
all_page_line_level_ocr_results_with_words =
|
151 |
-
|
|
|
|
|
|
|
152 |
RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
|
153 |
progress=gr.Progress(track_tqdm=True)):
|
154 |
'''
|
@@ -157,7 +142,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
157 |
- file_paths (List[str]): A list of paths to the files to be redacted.
|
158 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
|
159 |
- pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
|
160 |
-
|
161 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
|
162 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
|
163 |
- text_extraction_method (str): The method to use to extract text from documents.
|
@@ -175,7 +160,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
175 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
176 |
- all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
|
177 |
- annotations_all_pages (List[dict], optional): A list of dictionaries containing all image annotations. Defaults to an empty list.
|
178 |
-
-
|
179 |
- all_pages_decision_process_table (pd.DataFrame, optional): A DataFrame containing all decision process tables. Defaults to an empty DataFrame.
|
180 |
- pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
|
181 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
@@ -200,7 +185,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
200 |
- ocr_file_path (str, optional): The latest ocr file path created by the app.
|
201 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
202 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
203 |
-
-
|
|
|
|
|
|
|
|
|
204 |
- RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
|
205 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
206 |
|
@@ -210,11 +199,31 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
210 |
|
211 |
out_message = ""
|
212 |
pdf_file_name_with_ext = ""
|
213 |
-
pdf_file_name_without_ext = ""
|
214 |
-
|
|
|
215 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
216 |
-
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
|
|
|
|
|
|
217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
# Ensure all_pages_decision_process_table is in correct format for downstream processes
|
219 |
if isinstance(all_pages_decision_process_table,list):
|
220 |
if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
@@ -227,7 +236,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
227 |
#print("First_loop_state is True")
|
228 |
latest_file_completed = 0
|
229 |
current_loop_page = 0
|
230 |
-
out_file_paths =
|
|
|
231 |
estimate_total_processing_time = 0
|
232 |
estimated_time_taken_state = 0
|
233 |
comprehend_query_number = 0
|
@@ -239,7 +249,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
239 |
elif (first_loop_state == False) & (current_loop_page == 999):
|
240 |
current_loop_page = 0
|
241 |
total_textract_query_number = 0
|
242 |
-
comprehend_query_number = 0
|
243 |
|
244 |
# Choose the correct file to prepare
|
245 |
if isinstance(file_paths, str): file_paths_list = [os.path.abspath(file_paths)]
|
@@ -256,6 +266,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
256 |
# Check if any files were found and assign to file_paths_list
|
257 |
file_paths_list = filtered_files if filtered_files else []
|
258 |
|
|
|
|
|
259 |
# If latest_file_completed is used, get the specific file
|
260 |
if not isinstance(file_paths, (str, dict)): file_paths_loop = [file_paths_list[int(latest_file_completed)]] if len(file_paths_list) > latest_file_completed else []
|
261 |
else: file_paths_loop = file_paths_list
|
@@ -287,8 +299,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
287 |
# Only send across review file if redaction has been done
|
288 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
289 |
|
290 |
-
if len(review_out_file_paths) == 1:
|
291 |
-
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
292 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
293 |
|
294 |
if not isinstance(pymupdf_doc, list):
|
@@ -299,7 +310,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
299 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
300 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
301 |
|
302 |
-
|
|
|
|
|
303 |
|
304 |
#if first_loop_state == False:
|
305 |
# Prepare documents and images as required if they don't already exist
|
@@ -329,10 +342,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
329 |
|
330 |
# Call prepare_image_or_pdf only if needed
|
331 |
if prepare_images_flag is not None:
|
332 |
-
out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
|
333 |
-
file_paths_loop, text_extraction_method,
|
334 |
annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
|
335 |
-
output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
|
336 |
)
|
337 |
|
338 |
page_sizes_df = pd.DataFrame(page_sizes)
|
@@ -343,8 +356,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
343 |
|
344 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
345 |
|
346 |
-
number_of_pages = pymupdf_doc.page_count
|
347 |
-
|
348 |
|
349 |
# If we have reached the last page, return message and outputs
|
350 |
if current_loop_page >= number_of_pages:
|
@@ -361,11 +373,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
361 |
# Only send across review file if redaction has been done
|
362 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
363 |
# If only pdf currently in review outputs, add on the latest review file
|
364 |
-
if len(review_out_file_paths) == 1:
|
365 |
-
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
366 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
367 |
|
368 |
-
|
|
|
|
|
369 |
|
370 |
# Load/create allow list
|
371 |
# If string, assume file path
|
@@ -374,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
374 |
if not in_allow_list.empty:
|
375 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
376 |
else:
|
377 |
-
in_allow_list_flat =
|
378 |
|
379 |
# If string, assume file path
|
380 |
if isinstance(custom_recogniser_word_list, str):
|
@@ -383,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
383 |
if not custom_recogniser_word_list.empty:
|
384 |
custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
|
385 |
else:
|
386 |
-
custom_recogniser_word_list_flat =
|
387 |
|
388 |
# Sort the strings in order from the longest string to the shortest
|
389 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
@@ -399,7 +412,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
399 |
print("Could not convert whole page redaction data to number list due to:", e)
|
400 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
401 |
else:
|
402 |
-
redact_whole_page_list_flat =
|
403 |
|
404 |
|
405 |
|
@@ -452,13 +465,28 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
452 |
else:
|
453 |
textract_client = ""
|
454 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
455 |
# Check if output_folder exists, create it if it doesn't
|
456 |
if not os.path.exists(output_folder): os.makedirs(output_folder)
|
457 |
|
458 |
progress(0.5, desc="Extracting text and redacting document")
|
459 |
|
460 |
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
461 |
-
|
462 |
|
463 |
# Run through file loop, redact each file at a time
|
464 |
for file in file_paths_loop:
|
@@ -482,16 +510,16 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
482 |
raise Exception(out_message)
|
483 |
|
484 |
# Output file paths names
|
485 |
-
orig_pdf_file_path = output_folder +
|
486 |
review_file_path = orig_pdf_file_path + '_review_file.csv'
|
487 |
|
488 |
# Load in all_ocr_results_with_words if it exists as a file path and doesn't exist already
|
489 |
-
file_name = get_file_name_without_type(file_path)
|
490 |
|
491 |
-
if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "
|
492 |
-
elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "
|
493 |
-
elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "
|
494 |
-
all_page_line_level_ocr_results_with_words_json_file_path = output_folder +
|
495 |
|
496 |
if not all_page_line_level_ocr_results_with_words:
|
497 |
if local_ocr_output_found_checkbox == True and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):
|
@@ -509,7 +537,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
509 |
|
510 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
511 |
|
512 |
-
pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return,
|
513 |
pdf_image_file_paths,
|
514 |
language,
|
515 |
chosen_redact_entities,
|
@@ -523,7 +551,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
523 |
current_loop_page,
|
524 |
page_break_return,
|
525 |
annotations_all_pages,
|
526 |
-
|
527 |
all_pages_decision_process_table,
|
528 |
pymupdf_doc,
|
529 |
pii_identification_method,
|
@@ -538,14 +566,17 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
538 |
text_extraction_only,
|
539 |
all_page_line_level_ocr_results,
|
540 |
all_page_line_level_ocr_results_with_words,
|
|
|
541 |
log_files_output_paths=log_files_output_paths,
|
|
|
542 |
output_folder=output_folder)
|
543 |
-
|
544 |
-
# Save Textract request metadata (if exists)
|
545 |
|
|
|
|
|
|
|
|
|
546 |
if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
|
547 |
-
all_textract_request_metadata.extend(new_textract_request_metadata)
|
548 |
-
|
549 |
|
550 |
elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
551 |
|
@@ -556,7 +587,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
556 |
# Analyse text-based pdf
|
557 |
print('Redacting file as text-based PDF')
|
558 |
|
559 |
-
pymupdf_doc, all_pages_decision_process_table,
|
560 |
file_path,
|
561 |
language,
|
562 |
chosen_redact_entities,
|
@@ -567,7 +598,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
567 |
current_loop_page,
|
568 |
page_break_return,
|
569 |
annotations_all_pages,
|
570 |
-
|
571 |
all_pages_decision_process_table,
|
572 |
pymupdf_doc,
|
573 |
all_page_line_level_ocr_results_with_words,
|
@@ -615,43 +646,50 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
615 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
616 |
print("Saving redacted PDF file:", out_redacted_pdf_file_path)
|
617 |
save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
|
|
|
618 |
out_file_paths.append(out_redacted_pdf_file_path)
|
619 |
|
620 |
-
if not
|
621 |
-
|
622 |
-
else:
|
623 |
|
624 |
-
ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
625 |
-
|
|
|
|
|
626 |
|
627 |
-
all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
|
628 |
out_file_paths.append(ocr_file_path)
|
629 |
-
|
630 |
duplication_file_path_outputs.append(ocr_file_path)
|
631 |
|
632 |
if all_page_line_level_ocr_results_with_words:
|
633 |
-
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
634 |
-
#
|
635 |
-
#if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
636 |
-
|
637 |
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
638 |
-
|
639 |
-
# print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
640 |
-
|
641 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
642 |
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
643 |
|
644 |
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
645 |
|
646 |
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
|
|
647 |
|
648 |
-
|
|
|
|
|
649 |
|
650 |
-
|
651 |
-
|
652 |
-
|
|
|
653 |
|
654 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
655 |
|
656 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
657 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
@@ -659,8 +697,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
659 |
if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
|
660 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
|
661 |
|
|
|
|
|
|
|
662 |
# Convert the gradio annotation boxes to relative coordinates
|
663 |
-
# Convert annotations_all_pages to a consistent relative coordinate format output
|
664 |
progress(0.93, "Creating review file output")
|
665 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
666 |
all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
|
@@ -674,10 +714,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
674 |
# Don't need page sizes in outputs
|
675 |
review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
|
676 |
|
677 |
-
review_file_state.to_csv(review_file_path, index=None)
|
678 |
|
679 |
-
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
680 |
-
out_file_paths.append(review_file_path)
|
681 |
|
682 |
# Make a combined message for the file
|
683 |
if isinstance(out_message, list) and out_message:
|
@@ -699,18 +738,16 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
699 |
time_taken = toc - tic
|
700 |
estimated_time_taken_state += time_taken
|
701 |
|
702 |
-
# If textract requests made, write to logging file.
|
703 |
if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
|
704 |
all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
|
705 |
|
706 |
all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
707 |
|
708 |
-
with open(all_textract_request_metadata_file_path, "w") as f:
|
709 |
-
f.write(all_request_metadata_str)
|
710 |
|
711 |
# Add the request metadata to the log outputs if not there already
|
712 |
-
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
713 |
-
log_files_output_paths.append(all_textract_request_metadata_file_path)
|
714 |
|
715 |
new_textract_query_numbers = len(all_textract_request_metadata)
|
716 |
total_textract_query_number += new_textract_query_numbers
|
@@ -725,7 +762,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
725 |
|
726 |
if total_textract_query_number > number_of_pages: total_textract_query_number = number_of_pages
|
727 |
|
728 |
-
|
|
|
|
|
729 |
|
730 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
731 |
'''
|
@@ -1063,7 +1102,7 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
|
|
1063 |
else:
|
1064 |
page.set_cropbox(original_cropbox)
|
1065 |
|
1066 |
-
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=
|
1067 |
|
1068 |
rect_height = page.rect.height
|
1069 |
rect_width = page.rect.width
|
@@ -1090,7 +1129,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1090 |
image_dimensions = {}
|
1091 |
|
1092 |
out_annotation_boxes = {}
|
1093 |
-
all_image_annotation_boxes =
|
1094 |
|
1095 |
if isinstance(image, Image.Image):
|
1096 |
image_path = move_page_info(str(page))
|
@@ -1201,10 +1240,25 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1201 |
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
1202 |
###
|
1203 |
|
1204 |
-
def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_results=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1205 |
|
1206 |
-
all_bboxes =
|
1207 |
-
merged_bboxes =
|
1208 |
grouped_bboxes = defaultdict(list)
|
1209 |
|
1210 |
# Deep copy original bounding boxes to retain them
|
@@ -1219,7 +1273,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_r
|
|
1219 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1220 |
|
1221 |
# Reconstruct bounding boxes for substrings of interest
|
1222 |
-
reconstructed_bboxes =
|
1223 |
for bbox in bboxes:
|
1224 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
1225 |
for line_text, line_info in combined_results.items():
|
@@ -1229,7 +1283,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_r
|
|
1229 |
start_char = line_text.index(bbox.text)
|
1230 |
end_char = start_char + len(bbox.text)
|
1231 |
|
1232 |
-
relevant_words =
|
1233 |
current_char = 0
|
1234 |
for word in line_info['words']:
|
1235 |
word_end = current_char + len(word['text'])
|
@@ -1318,33 +1372,35 @@ def redact_image_pdf(file_path:str,
|
|
1318 |
page_max:int=999,
|
1319 |
text_extraction_method:str=TESSERACT_TEXT_EXTRACT_OPTION,
|
1320 |
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
1321 |
-
textract_request_metadata:list=
|
1322 |
current_loop_page:int=0,
|
1323 |
page_break_return:bool=False,
|
1324 |
-
annotations_all_pages:List=
|
1325 |
-
|
1326 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
|
1327 |
-
pymupdf_doc:Document =
|
1328 |
pii_identification_method:str="Local",
|
1329 |
comprehend_query_number:int=0,
|
1330 |
comprehend_client:str="",
|
1331 |
textract_client:str="",
|
1332 |
-
custom_recogniser_word_list:List[str]=
|
1333 |
-
redact_whole_page_list:List[str]=
|
1334 |
max_fuzzy_spelling_mistakes_num:int=1,
|
1335 |
match_fuzzy_whole_phrase_bool:bool=True,
|
1336 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
1337 |
text_extraction_only:bool=False,
|
1338 |
-
all_page_line_level_ocr_results =
|
1339 |
-
all_page_line_level_ocr_results_with_words =
|
|
|
1340 |
page_break_val:int=int(PAGE_BREAK_VALUE),
|
1341 |
-
log_files_output_paths:List=
|
1342 |
max_time:int=int(MAX_TIME_VALUE),
|
1343 |
-
|
|
|
1344 |
progress=Progress(track_tqdm=True)):
|
1345 |
|
1346 |
'''
|
1347 |
-
This function redacts sensitive information from a PDF document. It takes the following parameters:
|
1348 |
|
1349 |
- file_path (str): The path to the PDF file to be redacted.
|
1350 |
- pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
@@ -1357,9 +1413,10 @@ def redact_image_pdf(file_path:str,
|
|
1357 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
|
1358 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1359 |
- textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
|
|
|
1360 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
1361 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
1362 |
-
-
|
1363 |
- all_pages_decision_process_table (pd.DataFrame, optional): All redaction decisions for document as a Pandas dataframe.
|
1364 |
- pymupdf_doc (Document, optional): The document as a PyMupdf object.
|
1365 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
@@ -1372,10 +1429,14 @@ def redact_image_pdf(file_path:str,
|
|
1372 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
1373 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
|
1374 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
1375 |
-
-
|
|
|
|
|
|
|
1376 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
1377 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1378 |
-
-
|
|
|
1379 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
1380 |
|
1381 |
The function returns a redacted PDF document along with processing output objects.
|
@@ -1386,6 +1447,17 @@ def redact_image_pdf(file_path:str,
|
|
1386 |
file_name = get_file_name_without_type(file_path)
|
1387 |
comprehend_query_number_new = 0
|
1388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1389 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1390 |
if custom_recogniser_word_list:
|
1391 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
@@ -1396,7 +1468,11 @@ def redact_image_pdf(file_path:str,
|
|
1396 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1397 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1398 |
|
1399 |
-
|
|
|
|
|
|
|
|
|
1400 |
|
1401 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1402 |
out_message = "Connection to AWS Comprehend service unsuccessful."
|
@@ -1406,7 +1482,7 @@ def redact_image_pdf(file_path:str,
|
|
1406 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
|
1407 |
out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
|
1408 |
print(out_message_warning)
|
1409 |
-
#raise Exception(out_message)
|
1410 |
|
1411 |
number_of_pages = pymupdf_doc.page_count
|
1412 |
print("Number of pages:", str(number_of_pages))
|
@@ -1425,7 +1501,7 @@ def redact_image_pdf(file_path:str,
|
|
1425 |
textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
|
1426 |
original_textract_data = textract_data.copy()
|
1427 |
|
1428 |
-
print("Successfully loaded in Textract analysis results from file")
|
1429 |
|
1430 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1431 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
@@ -1433,7 +1509,7 @@ def redact_image_pdf(file_path:str,
|
|
1433 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1434 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1435 |
|
1436 |
-
print("Loaded in local OCR analysis results from file")
|
1437 |
|
1438 |
###
|
1439 |
if current_loop_page == 0: page_loop_start = 0
|
@@ -1442,11 +1518,11 @@ def redact_image_pdf(file_path:str,
|
|
1442 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1443 |
|
1444 |
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1445 |
-
all_line_level_ocr_results_list =
|
1446 |
-
all_pages_decision_process_list =
|
1447 |
|
1448 |
-
if not
|
1449 |
-
all_line_level_ocr_results_list.extend(
|
1450 |
if not all_pages_decision_process_table.empty:
|
1451 |
all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
|
1452 |
|
@@ -1454,10 +1530,10 @@ def redact_image_pdf(file_path:str,
|
|
1454 |
# Go through each page
|
1455 |
for page_no in progress_bar:
|
1456 |
|
1457 |
-
handwriting_or_signature_boxes =
|
1458 |
-
page_signature_recogniser_results =
|
1459 |
-
page_handwriting_recogniser_results =
|
1460 |
-
page_line_level_ocr_results_with_words =
|
1461 |
page_break_return = False
|
1462 |
reported_page_number = str(page_no + 1)
|
1463 |
|
@@ -1495,12 +1571,7 @@ def redact_image_pdf(file_path:str,
|
|
1495 |
print("Can't find original cropbox details for page, using current PyMuPDF page cropbox")
|
1496 |
original_cropbox = pymupdf_page.cropbox.irect
|
1497 |
|
1498 |
-
# Possibility to use different languages
|
1499 |
-
if language == 'en': ocr_lang = 'eng'
|
1500 |
-
else: ocr_lang = language
|
1501 |
-
|
1502 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
1503 |
-
|
1504 |
# If using Tesseract
|
1505 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1506 |
|
@@ -1513,7 +1584,7 @@ def redact_image_pdf(file_path:str,
|
|
1513 |
)
|
1514 |
|
1515 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
1516 |
-
else: page_line_level_ocr_results_with_words =
|
1517 |
|
1518 |
if page_line_level_ocr_results_with_words:
|
1519 |
print("Found OCR results for page in existing OCR with words object")
|
@@ -1523,11 +1594,14 @@ def redact_image_pdf(file_path:str,
|
|
1523 |
|
1524 |
page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
|
1525 |
|
|
|
|
|
|
|
1526 |
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
1527 |
|
1528 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
1529 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1530 |
-
text_blocks =
|
1531 |
|
1532 |
if not textract_data:
|
1533 |
try:
|
@@ -1565,7 +1639,7 @@ def redact_image_pdf(file_path:str,
|
|
1565 |
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1566 |
|
1567 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1568 |
-
if "pages" not in textract_data: textract_data["pages"] =
|
1569 |
|
1570 |
# Append the new page data
|
1571 |
textract_data["pages"].append(text_blocks)
|
@@ -1573,11 +1647,11 @@ def redact_image_pdf(file_path:str,
|
|
1573 |
except Exception as e:
|
1574 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
1575 |
print(out_message)
|
1576 |
-
text_blocks =
|
1577 |
new_textract_request_metadata = "Failed Textract API call"
|
1578 |
|
1579 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1580 |
-
if "pages" not in textract_data: textract_data["pages"] =
|
1581 |
|
1582 |
raise Exception(out_message)
|
1583 |
|
@@ -1589,6 +1663,9 @@ def redact_image_pdf(file_path:str,
|
|
1589 |
|
1590 |
page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1591 |
|
|
|
|
|
|
|
1592 |
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
1593 |
|
1594 |
# Convert to DataFrame and add to ongoing logging table
|
@@ -1598,7 +1675,8 @@ def redact_image_pdf(file_path:str,
|
|
1598 |
'left': result.left,
|
1599 |
'top': result.top,
|
1600 |
'width': result.width,
|
1601 |
-
'height': result.height
|
|
|
1602 |
} for result in page_line_level_ocr_results['results']])
|
1603 |
|
1604 |
if not line_level_ocr_results_df.empty: # Ensure there are records to add
|
@@ -1613,21 +1691,22 @@ def redact_image_pdf(file_path:str,
|
|
1613 |
page_line_level_ocr_results_with_words['results'],
|
1614 |
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
1615 |
pii_identification_method = pii_identification_method,
|
1616 |
-
comprehend_client=comprehend_client,
|
|
|
1617 |
language=language,
|
1618 |
-
entities=chosen_redact_entities,
|
1619 |
allow_list=allow_list,
|
1620 |
-
score_threshold=score_threshold
|
|
|
1621 |
)
|
1622 |
|
1623 |
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
1624 |
|
1625 |
-
else: page_redaction_bounding_boxes =
|
1626 |
|
1627 |
# Merge redaction bounding boxes that are close together
|
1628 |
page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
|
1629 |
|
1630 |
-
else: page_merged_redaction_bboxes =
|
1631 |
|
1632 |
# 3. Draw the merged boxes
|
1633 |
## Apply annotations to pdf with pymupdf
|
@@ -1654,7 +1733,7 @@ def redact_image_pdf(file_path:str,
|
|
1654 |
fill = (0, 0, 0) # Fill colour for redactions
|
1655 |
draw = ImageDraw.Draw(image)
|
1656 |
|
1657 |
-
all_image_annotations_boxes =
|
1658 |
|
1659 |
for box in page_merged_redaction_bboxes:
|
1660 |
|
@@ -1696,9 +1775,7 @@ def redact_image_pdf(file_path:str,
|
|
1696 |
|
1697 |
page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
|
1698 |
|
1699 |
-
redacted_image = image.copy()
|
1700 |
-
#redacted_image.save("test_out_image.png")
|
1701 |
-
|
1702 |
|
1703 |
# Convert decision process to table
|
1704 |
decision_process_table = pd.DataFrame([{
|
@@ -1720,7 +1797,6 @@ def redact_image_pdf(file_path:str,
|
|
1720 |
all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
|
1721 |
|
1722 |
decision_process_table = fill_missing_ids(decision_process_table)
|
1723 |
-
decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
|
1724 |
|
1725 |
toc = time.perf_counter()
|
1726 |
|
@@ -1855,224 +1931,225 @@ def get_text_container_characters(text_container:LTTextContainer):
|
|
1855 |
return characters
|
1856 |
return []
|
1857 |
|
1858 |
-
def create_line_level_ocr_results_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
|
1859 |
-
'''
|
1860 |
-
Create an OCRResult object based on a list of pdfminer LTChar objects.
|
1861 |
-
'''
|
1862 |
|
1863 |
-
|
1864 |
-
|
1865 |
-
|
1866 |
-
|
|
|
|
|
|
|
|
|
1867 |
|
1868 |
-
# Initialize variables
|
1869 |
full_text = ""
|
1870 |
-
|
1871 |
-
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
|
1872 |
-
line_bboxes = []
|
1873 |
-
|
1874 |
-
# Iterate through the character objects
|
1875 |
-
current_word = ""
|
1876 |
-
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
|
1877 |
|
1878 |
for char in char_objects:
|
1879 |
-
character_objects_out.append(char)
|
1880 |
-
|
1881 |
-
if not isinstance(char, LTAnno):
|
1882 |
-
character_text = char.get_text()
|
1883 |
-
# character_text_objects_out.append(character_text)
|
1884 |
|
1885 |
if isinstance(char, LTAnno):
|
1886 |
added_text = char.get_text()
|
1887 |
-
|
1888 |
-
# Handle double quotes
|
1889 |
-
#added_text = added_text.replace('"', '\\"') # Escape double quotes
|
1890 |
-
|
1891 |
-
# Handle space separately by finalizing the word
|
1892 |
-
full_text += added_text # Adds space or newline
|
1893 |
|
1894 |
-
if current_word: # Only finalise if there is a current word
|
1895 |
-
line_bboxes.append((current_word, current_word_bbox))
|
1896 |
-
current_word = ""
|
1897 |
-
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
|
1898 |
-
|
1899 |
-
# Check for line break (assuming a new line is indicated by a specific character)
|
1900 |
if '\n' in added_text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1901 |
|
1902 |
-
# finalise the current line
|
1903 |
-
if current_word:
|
1904 |
-
line_bboxes.append((current_word, current_word_bbox))
|
1905 |
-
# Create an OCRResult for the current line
|
1906 |
-
line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
|
1907 |
-
line_level_characters_out.append(character_objects_out)
|
1908 |
# Reset for the next line
|
1909 |
-
character_objects_out =
|
1910 |
full_text = ""
|
1911 |
overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
|
1912 |
-
|
1913 |
-
current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
|
1914 |
-
|
1915 |
continue
|
1916 |
|
1917 |
-
#
|
|
|
|
|
1918 |
|
1919 |
-
#full_text += char.get_text()
|
1920 |
-
#added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
|
1921 |
-
added_text = char.get_text()
|
1922 |
-
if re.search(r'[^\x00-\x7F]', added_text): # Matches any non-ASCII character
|
1923 |
-
#added_text.encode('latin1', errors='replace').decode('utf-8')
|
1924 |
-
added_text = clean_unicode_text(added_text)
|
1925 |
-
full_text += added_text # Adds space or newline, removing
|
1926 |
-
|
1927 |
-
# Update overall bounding box
|
1928 |
x0, y0, x1, y1 = char.bbox
|
1929 |
-
overall_bbox[0] = min(overall_bbox[0], x0)
|
1930 |
-
overall_bbox[1] = min(overall_bbox[1], y0)
|
1931 |
-
overall_bbox[2] = max(overall_bbox[2], x1)
|
1932 |
-
overall_bbox[3] = max(overall_bbox[3], y1)
|
1933 |
-
|
1934 |
-
|
1935 |
-
|
1936 |
-
|
1937 |
-
|
1938 |
-
|
1939 |
-
|
1940 |
-
|
1941 |
-
|
1942 |
-
|
1943 |
-
|
1944 |
-
|
1945 |
-
|
1946 |
-
|
1947 |
-
|
1948 |
-
if full_text:
|
1949 |
-
print("full_text found")
|
1950 |
-
if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
|
1951 |
-
# Convert special characters to a human-readable format
|
1952 |
-
|
1953 |
-
full_text = clean_unicode_text(full_text)
|
1954 |
-
full_text = full_text.strip()
|
1955 |
-
|
1956 |
-
line_ocr_result_bbox = round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)
|
1957 |
-
|
1958 |
-
line_ocr_result = OCRResult(full_text.strip(), line_ocr_result_bbox)
|
1959 |
-
|
1960 |
-
line_level_results_out.append(line_ocr_result)
|
1961 |
-
|
1962 |
-
else:
|
1963 |
-
line_ocr_result_bbox = []
|
1964 |
-
|
1965 |
-
# if line_ocr_result_bbox:
|
1966 |
-
# line_level_words_out["page"] = 1
|
1967 |
-
# line_level_words_out['results'] = {'text_line_1':{"line":1, "text":full_text, "bounding_box": line_ocr_result_bbox, "words": line_bboxes}}
|
1968 |
-
# else:
|
1969 |
-
# line_level_words_out = {}
|
1970 |
-
|
1971 |
|
1972 |
-
return line_level_results_out, line_level_characters_out
|
1973 |
|
1974 |
-
def
|
1975 |
"""
|
1976 |
-
Generates
|
1977 |
|
1978 |
-
This robust version
|
1979 |
-
1.
|
1980 |
-
2.
|
1981 |
-
3. Using
|
1982 |
|
1983 |
Args:
|
1984 |
-
|
1985 |
-
page_number: The page number where the characters are from.
|
1986 |
|
1987 |
Returns:
|
1988 |
-
A
|
1989 |
"""
|
1990 |
-
#
|
1991 |
-
text_chars = [c for c in
|
1992 |
|
1993 |
if not text_chars:
|
1994 |
-
return
|
1995 |
|
1996 |
-
# Sort
|
1997 |
-
text_chars.sort(key=lambda c:
|
1998 |
|
1999 |
-
|
2000 |
-
|
|
|
2001 |
|
2002 |
-
|
2003 |
-
|
2004 |
-
|
2005 |
prev_char = None
|
2006 |
|
2007 |
def finalize_word():
|
2008 |
nonlocal current_word_text, current_word_bbox
|
2009 |
-
|
2010 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2011 |
line_words.append({
|
2012 |
-
"text":
|
2013 |
-
"bounding_box":
|
2014 |
})
|
|
|
2015 |
current_word_text = ""
|
2016 |
current_word_bbox = [float('inf'), float('inf'), -1, -1]
|
2017 |
|
2018 |
-
def finalize_line():
|
2019 |
-
nonlocal line_text, line_bbox, line_words, line_number, prev_char
|
2020 |
-
finalize_word()
|
2021 |
-
if line_text.strip():
|
2022 |
-
page_data["results"][f"text_line_{line_number}"] = {
|
2023 |
-
"line": line_number,
|
2024 |
-
"text": line_text.strip(),
|
2025 |
-
"bounding_box": [round(b, 2) for b in line_bbox],
|
2026 |
-
"words": line_words
|
2027 |
-
}
|
2028 |
-
line_number += 1
|
2029 |
-
line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
|
2030 |
-
prev_char = None
|
2031 |
-
|
2032 |
for char in text_chars:
|
2033 |
char_text = clean_unicode_text(char.get_text())
|
2034 |
|
2035 |
-
|
2036 |
-
|
2037 |
-
|
2038 |
-
|
2039 |
-
# Line break detection
|
2040 |
-
if vertical_gap > char_height * 0.7:
|
2041 |
-
finalize_line()
|
2042 |
-
else:
|
2043 |
-
# Check for spacing between characters
|
2044 |
-
space_threshold = char.size * 0.5
|
2045 |
-
gap = char.bbox[0] - prev_char.bbox[2]
|
2046 |
-
if gap > max(space_threshold, 1.0):
|
2047 |
-
finalize_word()
|
2048 |
-
line_text += " "
|
2049 |
-
|
2050 |
-
# ✅ Explicitly finalize if space character
|
2051 |
-
if char_text == " ":
|
2052 |
finalize_word()
|
2053 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2054 |
prev_char = char
|
2055 |
-
continue
|
2056 |
|
2057 |
-
|
2058 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2059 |
|
2060 |
-
|
2061 |
-
|
2062 |
-
current_word_bbox[1] = min(current_word_bbox[1], char.bbox[1])
|
2063 |
-
current_word_bbox[2] = max(current_word_bbox[2], char.bbox[2])
|
2064 |
-
current_word_bbox[3] = max(current_word_bbox[3], char.bbox[3])
|
2065 |
|
2066 |
-
|
2067 |
-
|
2068 |
-
|
2069 |
-
|
|
|
|
|
|
|
|
|
2070 |
|
2071 |
prev_char = char
|
2072 |
|
2073 |
-
|
|
|
|
|
|
|
2074 |
|
2075 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2076 |
|
2077 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
2078 |
decision_process_table = pd.DataFrame()
|
@@ -2098,7 +2175,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
2098 |
return decision_process_table
|
2099 |
|
2100 |
def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
2101 |
-
pikepdf_redaction_annotations_on_page =
|
2102 |
for analysed_bounding_box in analysed_bounding_boxes:
|
2103 |
|
2104 |
bounding_box = analysed_bounding_box["boundingBox"]
|
@@ -2131,27 +2208,27 @@ def redact_text_pdf(
|
|
2131 |
page_max: int = 999, # Maximum page number to end redaction
|
2132 |
current_loop_page: int = 0, # Current page being processed in the loop
|
2133 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
2134 |
-
annotations_all_pages: List[dict] =
|
2135 |
-
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
2136 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
2137 |
-
pymupdf_doc: List =
|
2138 |
-
all_page_line_level_ocr_results_with_words: List =
|
2139 |
-
pii_identification_method: str = "Local",
|
2140 |
comprehend_query_number:int = 0,
|
2141 |
comprehend_client="",
|
2142 |
-
custom_recogniser_word_list:List[str]=
|
2143 |
-
redact_whole_page_list:List[str]=
|
2144 |
max_fuzzy_spelling_mistakes_num:int=1,
|
2145 |
match_fuzzy_whole_phrase_bool:bool=True,
|
2146 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
2147 |
-
original_cropboxes:List[dict]=
|
2148 |
text_extraction_only:bool=False,
|
2149 |
output_folder:str=OUTPUT_FOLDER,
|
2150 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
2151 |
-
max_time: int = int(MAX_TIME_VALUE),
|
|
|
2152 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
2153 |
-
):
|
2154 |
-
|
2155 |
'''
|
2156 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
2157 |
|
@@ -2177,16 +2254,18 @@ def redact_text_pdf(
|
|
2177 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
2178 |
- max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
|
2179 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
2180 |
-
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe
|
2181 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
2182 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
|
|
2183 |
- output_folder (str, optional): The output folder for the function
|
2184 |
- page_break_val: Value for page break
|
2185 |
-
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
|
2186 |
- progress: Progress tracking object
|
2187 |
'''
|
2188 |
|
2189 |
-
tic = time.perf_counter()
|
2190 |
|
2191 |
if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
|
2192 |
all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
|
@@ -2198,6 +2277,17 @@ def redact_text_pdf(
|
|
2198 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
2199 |
out_message = "Connection to AWS Comprehend service not found."
|
2200 |
raise Exception(out_message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2201 |
|
2202 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
2203 |
if custom_recogniser_word_list:
|
@@ -2207,20 +2297,18 @@ def redact_text_pdf(
|
|
2207 |
|
2208 |
nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
|
2209 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
2210 |
-
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
2211 |
|
2212 |
# Open with Pikepdf to get text lines
|
2213 |
pikepdf_pdf = Pdf.open(file_path)
|
2214 |
number_of_pages = len(pikepdf_pdf.pages)
|
2215 |
|
2216 |
-
file_name = get_file_name_without_type(file_path)
|
|
|
|
|
2217 |
|
2218 |
-
if not all_page_line_level_ocr_results_with_words:
|
2219 |
-
all_page_line_level_ocr_results_with_words = []
|
2220 |
-
|
2221 |
# Check that page_min and page_max are within expected ranges
|
2222 |
-
if page_max > number_of_pages or page_max == 0:
|
2223 |
-
page_max = number_of_pages
|
2224 |
|
2225 |
if page_min <= 0: page_min = 0
|
2226 |
else: page_min = page_min - 1
|
@@ -2250,28 +2338,33 @@ def redact_text_pdf(
|
|
2250 |
# Go page by page
|
2251 |
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
2252 |
|
2253 |
-
all_page_line_text_extraction_characters =
|
2254 |
-
all_page_line_level_text_extraction_results_list =
|
2255 |
-
page_analyser_results =
|
2256 |
-
page_redaction_bounding_boxes =
|
2257 |
|
2258 |
-
characters =
|
2259 |
-
pikepdf_redaction_annotations_on_page =
|
2260 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2261 |
-
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
|
|
2262 |
|
2263 |
-
text_line_no =
|
2264 |
for n, text_container in enumerate(page_layout):
|
2265 |
-
characters =
|
2266 |
|
2267 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2268 |
characters = get_text_container_characters(text_container)
|
2269 |
-
text_line_no += 1
|
2270 |
|
2271 |
# Create dataframe for all the text on the page
|
2272 |
-
line_level_text_results_list, line_characters
|
|
|
|
|
|
|
|
|
2273 |
|
2274 |
-
|
2275 |
|
2276 |
### Create page_text_ocr_outputs (OCR format outputs)
|
2277 |
if line_level_text_results_list:
|
@@ -2282,14 +2375,19 @@ def redact_text_pdf(
|
|
2282 |
'left': result.left,
|
2283 |
'top': result.top,
|
2284 |
'width': result.width,
|
2285 |
-
'height': result.height
|
|
|
2286 |
} for result in line_level_text_results_list])
|
2287 |
|
2288 |
-
|
2289 |
|
2290 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
2291 |
all_page_line_text_extraction_characters.extend(line_characters)
|
2292 |
-
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
|
|
|
|
|
|
|
|
2293 |
|
2294 |
### REDACTION
|
2295 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
@@ -2315,7 +2413,7 @@ def redact_text_pdf(
|
|
2315 |
# Annotate redactions on page
|
2316 |
pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
|
2317 |
|
2318 |
-
else: pikepdf_redaction_annotations_on_page =
|
2319 |
|
2320 |
# Make pymupdf page redactions
|
2321 |
if redact_whole_page_list:
|
@@ -2339,8 +2437,8 @@ def redact_text_pdf(
|
|
2339 |
|
2340 |
# Join extracted text outputs for all lines together
|
2341 |
if not page_text_ocr_outputs.empty:
|
2342 |
-
|
2343 |
-
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2344 |
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2345 |
|
2346 |
toc = time.perf_counter()
|
@@ -2366,7 +2464,8 @@ def redact_text_pdf(
|
|
2366 |
# Write logs
|
2367 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2368 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
2369 |
-
|
|
|
2370 |
|
2371 |
current_loop_page += 1
|
2372 |
|
@@ -2395,19 +2494,14 @@ def redact_text_pdf(
|
|
2395 |
|
2396 |
# Write all page outputs
|
2397 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2398 |
-
|
2399 |
-
#print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
|
2400 |
-
|
2401 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
2402 |
-
|
2403 |
-
#print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
|
2404 |
|
2405 |
# Convert decision table to relative coordinates
|
2406 |
all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
2407 |
|
2408 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
2409 |
-
all_pages_decision_process_table['ymin'] =
|
2410 |
-
all_pages_decision_process_table['ymax'] =
|
2411 |
|
2412 |
# Convert decision table to relative coordinates
|
2413 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
@@ -2416,13 +2510,9 @@ def redact_text_pdf(
|
|
2416 |
|
2417 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
2418 |
if not all_line_level_ocr_results_df.empty:
|
2419 |
-
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df
|
2420 |
-
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
2421 |
-
|
2422 |
-
#all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
|
2423 |
|
2424 |
-
#
|
2425 |
-
|
2426 |
-
# json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
2427 |
|
2428 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
|
|
15 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
|
16 |
from pikepdf import Pdf, Dictionary, Name
|
17 |
from pymupdf import Rect, Page, Document
|
18 |
+
from presidio_analyzer import AnalyzerEngine
|
19 |
import gradio as gr
|
20 |
from gradio import Progress
|
21 |
from collections import defaultdict # For efficient grouping
|
22 |
|
23 |
+
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
|
24 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
|
25 |
+
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
|
26 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
|
27 |
from tools.helper_functions import get_file_name_without_type, clean_unicode_text
|
28 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
29 |
|
|
|
60 |
|
61 |
return sum_of_numbers
|
62 |
|
63 |
+
def reverse_y_coords(df:pd.DataFrame, column:str):
|
64 |
+
df[column] = df[column]
|
65 |
+
df[column] = 1 - df[column].astype(float)
|
66 |
+
|
67 |
+
df[column] = df[column].round(6)
|
68 |
+
|
69 |
+
return df[column]
|
70 |
+
|
71 |
+
def merge_page_results(data:list):
|
72 |
merged = {}
|
73 |
|
74 |
for item in data:
|
|
|
85 |
|
86 |
return list(merged.values())
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
def choose_and_run_redactor(file_paths:List[str],
|
89 |
prepared_pdf_file_paths:List[str],
|
90 |
+
pdf_image_file_paths:List[str],
|
|
|
91 |
chosen_redact_entities:List[str],
|
92 |
chosen_redact_comprehend_entities:List[str],
|
93 |
text_extraction_method:str,
|
94 |
+
in_allow_list:List[List[str]]=list(),
|
95 |
+
custom_recogniser_word_list:List[str]=list(),
|
96 |
+
redact_whole_page_list:List[str]=list(),
|
97 |
latest_file_completed:int=0,
|
98 |
+
combined_out_message:List=list(),
|
99 |
+
out_file_paths:List=list(),
|
100 |
+
log_files_output_paths:List=list(),
|
101 |
first_loop_state:bool=False,
|
102 |
page_min:int=0,
|
103 |
page_max:int=999,
|
104 |
estimated_time_taken_state:float=0.0,
|
105 |
+
handwrite_signature_checkbox:List[str]=list(["Extract handwriting", "Extract signatures"]),
|
106 |
all_request_metadata_str:str = "",
|
107 |
+
annotations_all_pages:List[dict]=list(),
|
108 |
+
all_page_line_level_ocr_results_df:pd.DataFrame=None,
|
109 |
+
all_pages_decision_process_table:pd.DataFrame=None,
|
110 |
+
pymupdf_doc=list(),
|
111 |
current_loop_page:int=0,
|
112 |
page_break_return:bool=False,
|
113 |
+
pii_identification_method:str="Local",
|
114 |
comprehend_query_number:int=0,
|
115 |
max_fuzzy_spelling_mistakes_num:int=1,
|
116 |
match_fuzzy_whole_phrase_bool:bool=True,
|
117 |
aws_access_key_textbox:str='',
|
118 |
aws_secret_key_textbox:str='',
|
119 |
annotate_max_pages:int=1,
|
120 |
+
review_file_state:pd.DataFrame=list(),
|
121 |
output_folder:str=OUTPUT_FOLDER,
|
122 |
+
document_cropboxes:List=list(),
|
123 |
+
page_sizes:List[dict]=list(),
|
124 |
textract_output_found:bool=False,
|
125 |
text_extraction_only:bool=False,
|
126 |
+
duplication_file_path_outputs:list=list(),
|
127 |
review_file_path:str="",
|
128 |
input_folder:str=INPUT_FOLDER,
|
129 |
total_textract_query_number:int=0,
|
130 |
ocr_file_path:str="",
|
131 |
+
all_page_line_level_ocr_results:list[dict] = list(),
|
132 |
+
all_page_line_level_ocr_results_with_words:list[dict] = list(),
|
133 |
+
all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
|
134 |
+
chosen_local_model:str="tesseract",
|
135 |
+
language:str=DEFAULT_LANGUAGE,
|
136 |
+
prepare_images:bool=True,
|
137 |
RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
|
138 |
progress=gr.Progress(track_tqdm=True)):
|
139 |
'''
|
|
|
142 |
- file_paths (List[str]): A list of paths to the files to be redacted.
|
143 |
- prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
|
144 |
- pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
|
145 |
+
|
146 |
- chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
|
147 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
|
148 |
- text_extraction_method (str): The method to use to extract text from documents.
|
|
|
160 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
161 |
- all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
|
162 |
- annotations_all_pages (List[dict], optional): A list of dictionaries containing all image annotations. Defaults to an empty list.
|
163 |
+
- all_page_line_level_ocr_results_df (pd.DataFrame, optional): A DataFrame containing all line-level OCR results. Defaults to an empty DataFrame.
|
164 |
- all_pages_decision_process_table (pd.DataFrame, optional): A DataFrame containing all decision process tables. Defaults to an empty DataFrame.
|
165 |
- pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
|
166 |
- current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
|
|
|
185 |
- ocr_file_path (str, optional): The latest ocr file path created by the app.
|
186 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
187 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
188 |
+
- all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
|
189 |
+
- chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
|
190 |
+
- language (str, optional): The language of the text in the files. Defaults to English.
|
191 |
+
- language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
|
192 |
+
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
193 |
- RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
|
194 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
195 |
|
|
|
199 |
|
200 |
out_message = ""
|
201 |
pdf_file_name_with_ext = ""
|
202 |
+
pdf_file_name_without_ext = ""
|
203 |
+
page_break_return = False
|
204 |
+
blank_request_metadata = list()
|
205 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
206 |
+
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
207 |
+
|
208 |
+
# Use provided language or default
|
209 |
+
language = language or DEFAULT_LANGUAGE
|
210 |
|
211 |
+
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
212 |
+
if language not in textract_language_choices:
|
213 |
+
out_message = f"Language '{language}' is not supported by AWS Textract. Please select a different language."
|
214 |
+
raise Warning(out_message)
|
215 |
+
elif pii_identification_method == AWS_PII_OPTION:
|
216 |
+
if language not in aws_comprehend_language_choices:
|
217 |
+
out_message = f"Language '{language}' is not supported by AWS Comprehend. Please select a different language."
|
218 |
+
raise Warning(out_message)
|
219 |
+
|
220 |
+
if all_page_line_level_ocr_results_with_words_df is None:
|
221 |
+
all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
|
222 |
+
|
223 |
+
# Create copies of out_file_path objects to avoid overwriting each other on append actions
|
224 |
+
out_file_paths = out_file_paths.copy()
|
225 |
+
log_files_output_paths = log_files_output_paths.copy()
|
226 |
+
|
227 |
# Ensure all_pages_decision_process_table is in correct format for downstream processes
|
228 |
if isinstance(all_pages_decision_process_table,list):
|
229 |
if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
|
|
236 |
#print("First_loop_state is True")
|
237 |
latest_file_completed = 0
|
238 |
current_loop_page = 0
|
239 |
+
out_file_paths = list()
|
240 |
+
log_files_output_paths = list()
|
241 |
estimate_total_processing_time = 0
|
242 |
estimated_time_taken_state = 0
|
243 |
comprehend_query_number = 0
|
|
|
249 |
elif (first_loop_state == False) & (current_loop_page == 999):
|
250 |
current_loop_page = 0
|
251 |
total_textract_query_number = 0
|
252 |
+
comprehend_query_number = 0
|
253 |
|
254 |
# Choose the correct file to prepare
|
255 |
if isinstance(file_paths, str): file_paths_list = [os.path.abspath(file_paths)]
|
|
|
266 |
# Check if any files were found and assign to file_paths_list
|
267 |
file_paths_list = filtered_files if filtered_files else []
|
268 |
|
269 |
+
print("Latest file completed:", latest_file_completed)
|
270 |
+
|
271 |
# If latest_file_completed is used, get the specific file
|
272 |
if not isinstance(file_paths, (str, dict)): file_paths_loop = [file_paths_list[int(latest_file_completed)]] if len(file_paths_list) > latest_file_completed else []
|
273 |
else: file_paths_loop = file_paths_list
|
|
|
299 |
# Only send across review file if redaction has been done
|
300 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
301 |
|
302 |
+
if len(review_out_file_paths) == 1:
|
|
|
303 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
304 |
|
305 |
if not isinstance(pymupdf_doc, list):
|
|
|
310 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
311 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
312 |
|
313 |
+
page_break_return = True
|
314 |
+
|
315 |
+
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
|
316 |
|
317 |
#if first_loop_state == False:
|
318 |
# Prepare documents and images as required if they don't already exist
|
|
|
342 |
|
343 |
# Call prepare_image_or_pdf only if needed
|
344 |
if prepare_images_flag is not None:
|
345 |
+
out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox, all_page_line_level_ocr_results_with_words_df = prepare_image_or_pdf(
|
346 |
+
file_paths_loop, text_extraction_method, all_page_line_level_ocr_results_df, all_page_line_level_ocr_results_with_words_df, 0, out_message, True,
|
347 |
annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
|
348 |
+
output_folder=output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, pymupdf_doc=pymupdf_doc, input_folder=input_folder
|
349 |
)
|
350 |
|
351 |
page_sizes_df = pd.DataFrame(page_sizes)
|
|
|
356 |
|
357 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
358 |
|
359 |
+
number_of_pages = pymupdf_doc.page_count
|
|
|
360 |
|
361 |
# If we have reached the last page, return message and outputs
|
362 |
if current_loop_page >= number_of_pages:
|
|
|
373 |
# Only send across review file if redaction has been done
|
374 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
375 |
# If only pdf currently in review outputs, add on the latest review file
|
376 |
+
if len(review_out_file_paths) == 1:
|
|
|
377 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
378 |
|
379 |
+
page_break_return = False
|
380 |
+
|
381 |
+
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
|
382 |
|
383 |
# Load/create allow list
|
384 |
# If string, assume file path
|
|
|
387 |
if not in_allow_list.empty:
|
388 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
389 |
else:
|
390 |
+
in_allow_list_flat = list()
|
391 |
|
392 |
# If string, assume file path
|
393 |
if isinstance(custom_recogniser_word_list, str):
|
|
|
396 |
if not custom_recogniser_word_list.empty:
|
397 |
custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
|
398 |
else:
|
399 |
+
custom_recogniser_word_list_flat = list()
|
400 |
|
401 |
# Sort the strings in order from the longest string to the shortest
|
402 |
custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
|
|
|
412 |
print("Could not convert whole page redaction data to number list due to:", e)
|
413 |
redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
|
414 |
else:
|
415 |
+
redact_whole_page_list_flat = list()
|
416 |
|
417 |
|
418 |
|
|
|
465 |
else:
|
466 |
textract_client = ""
|
467 |
|
468 |
+
### Language check - check if selected language packs exist
|
469 |
+
try:
|
470 |
+
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
|
471 |
+
if language != "en":
|
472 |
+
progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
|
473 |
+
download_tesseract_lang_pack(language)
|
474 |
+
|
475 |
+
if language != "en":
|
476 |
+
progress(0.1, desc=f"Loading SpaCy model for {language}")
|
477 |
+
load_spacy_model(language)
|
478 |
+
|
479 |
+
except Exception as e:
|
480 |
+
print(f"Error downloading language packs for {language}: {e}")
|
481 |
+
raise Exception(f"Error downloading language packs for {language}: {e}")
|
482 |
+
|
483 |
# Check if output_folder exists, create it if it doesn't
|
484 |
if not os.path.exists(output_folder): os.makedirs(output_folder)
|
485 |
|
486 |
progress(0.5, desc="Extracting text and redacting document")
|
487 |
|
488 |
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
489 |
+
all_page_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
|
490 |
|
491 |
# Run through file loop, redact each file at a time
|
492 |
for file in file_paths_loop:
|
|
|
510 |
raise Exception(out_message)
|
511 |
|
512 |
# Output file paths names
|
513 |
+
orig_pdf_file_path = output_folder + pdf_file_name_without_ext
|
514 |
review_file_path = orig_pdf_file_path + '_review_file.csv'
|
515 |
|
516 |
# Load in all_ocr_results_with_words if it exists as a file path and doesn't exist already
|
517 |
+
#file_name = get_file_name_without_type(file_path)
|
518 |
|
519 |
+
if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "local_text"
|
520 |
+
elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "local_ocr"
|
521 |
+
elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "textract"
|
522 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + pdf_file_name_without_ext + "_ocr_results_with_words_" + file_ending + ".json"
|
523 |
|
524 |
if not all_page_line_level_ocr_results_with_words:
|
525 |
if local_ocr_output_found_checkbox == True and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):
|
|
|
537 |
|
538 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
539 |
|
540 |
+
pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
|
541 |
pdf_image_file_paths,
|
542 |
language,
|
543 |
chosen_redact_entities,
|
|
|
551 |
current_loop_page,
|
552 |
page_break_return,
|
553 |
annotations_all_pages,
|
554 |
+
all_page_line_level_ocr_results_df,
|
555 |
all_pages_decision_process_table,
|
556 |
pymupdf_doc,
|
557 |
pii_identification_method,
|
|
|
566 |
text_extraction_only,
|
567 |
all_page_line_level_ocr_results,
|
568 |
all_page_line_level_ocr_results_with_words,
|
569 |
+
chosen_local_model,
|
570 |
log_files_output_paths=log_files_output_paths,
|
571 |
+
nlp_analyser=nlp_analyser,
|
572 |
output_folder=output_folder)
|
|
|
|
|
573 |
|
574 |
+
# This line creates a copy of out_file_paths to break potential links with log_files_output_paths
|
575 |
+
out_file_paths = out_file_paths.copy()
|
576 |
+
|
577 |
+
# Save Textract request metadata (if exists)
|
578 |
if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
|
579 |
+
all_textract_request_metadata.extend(new_textract_request_metadata)
|
|
|
580 |
|
581 |
elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
582 |
|
|
|
587 |
# Analyse text-based pdf
|
588 |
print('Redacting file as text-based PDF')
|
589 |
|
590 |
+
pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
|
591 |
file_path,
|
592 |
language,
|
593 |
chosen_redact_entities,
|
|
|
598 |
current_loop_page,
|
599 |
page_break_return,
|
600 |
annotations_all_pages,
|
601 |
+
all_page_line_level_ocr_results_df,
|
602 |
all_pages_decision_process_table,
|
603 |
pymupdf_doc,
|
604 |
all_page_line_level_ocr_results_with_words,
|
|
|
646 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
647 |
print("Saving redacted PDF file:", out_redacted_pdf_file_path)
|
648 |
save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
|
649 |
+
|
650 |
out_file_paths.append(out_redacted_pdf_file_path)
|
651 |
|
652 |
+
if not all_page_line_level_ocr_results_df.empty:
|
653 |
+
all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height", "line"]]
|
654 |
+
else: all_page_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
|
655 |
|
656 |
+
#ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
657 |
+
ocr_file_path = (output_folder + pdf_file_name_without_ext + "_ocr_output_" + file_ending + ".csv")
|
658 |
+
all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
|
659 |
+
all_page_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8-sig")
|
660 |
|
|
|
661 |
out_file_paths.append(ocr_file_path)
|
|
|
662 |
duplication_file_path_outputs.append(ocr_file_path)
|
663 |
|
664 |
if all_page_line_level_ocr_results_with_words:
|
|
|
|
|
|
|
|
|
665 |
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
666 |
+
|
|
|
|
|
667 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
668 |
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
669 |
|
670 |
all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
|
671 |
|
672 |
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
|
673 |
+
# all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
674 |
|
675 |
+
if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
|
676 |
+
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
677 |
+
if not all_page_line_level_ocr_results_with_words_df.empty:
|
678 |
|
679 |
+
# all_page_line_level_ocr_results_with_words_df['line_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y0')
|
680 |
+
# all_page_line_level_ocr_results_with_words_df['line_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y1')
|
681 |
+
all_page_line_level_ocr_results_with_words_df['word_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y0')
|
682 |
+
all_page_line_level_ocr_results_with_words_df['word_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y1')
|
683 |
|
684 |
+
all_page_line_level_ocr_results_with_words_df['line_text'] = ""
|
685 |
+
all_page_line_level_ocr_results_with_words_df['line_x0'] = ""
|
686 |
+
all_page_line_level_ocr_results_with_words_df['line_x1'] = ""
|
687 |
+
all_page_line_level_ocr_results_with_words_df['line_y0'] = ""
|
688 |
+
all_page_line_level_ocr_results_with_words_df['line_y1'] = ""
|
689 |
+
|
690 |
+
all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
|
691 |
+
all_page_line_level_ocr_results_with_words_df_file_path = all_page_line_level_ocr_results_with_words_json_file_path.replace(".json", ".csv")
|
692 |
+
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None, encoding="utf-8-sig")
|
693 |
|
694 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
695 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
|
|
697 |
if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
|
698 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
|
699 |
|
700 |
+
if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
|
701 |
+
out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
|
702 |
+
|
703 |
# Convert the gradio annotation boxes to relative coordinates
|
|
|
704 |
progress(0.93, "Creating review file output")
|
705 |
page_sizes = page_sizes_df.to_dict(orient="records")
|
706 |
all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
|
|
|
714 |
# Don't need page sizes in outputs
|
715 |
review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
|
716 |
|
717 |
+
review_file_state.to_csv(review_file_path, index=None, encoding="utf-8-sig")
|
718 |
|
719 |
+
if pii_identification_method != NO_REDACTION_PII_OPTION: out_file_paths.append(review_file_path)
|
|
|
720 |
|
721 |
# Make a combined message for the file
|
722 |
if isinstance(out_message, list) and out_message:
|
|
|
738 |
time_taken = toc - tic
|
739 |
estimated_time_taken_state += time_taken
|
740 |
|
741 |
+
# If textract requests made, write to logging file. Also record number of Textract requests
|
742 |
if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
|
743 |
all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
|
744 |
|
745 |
all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
746 |
|
747 |
+
with open(all_textract_request_metadata_file_path, "w") as f: f.write(all_request_metadata_str)
|
|
|
748 |
|
749 |
# Add the request metadata to the log outputs if not there already
|
750 |
+
if all_textract_request_metadata_file_path not in log_files_output_paths: log_files_output_paths.append(all_textract_request_metadata_file_path)
|
|
|
751 |
|
752 |
new_textract_query_numbers = len(all_textract_request_metadata)
|
753 |
total_textract_query_number += new_textract_query_numbers
|
|
|
762 |
|
763 |
if total_textract_query_number > number_of_pages: total_textract_query_number = number_of_pages
|
764 |
|
765 |
+
page_break_return = True
|
766 |
+
|
767 |
+
return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state
|
768 |
|
769 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
770 |
'''
|
|
|
1102 |
else:
|
1103 |
page.set_cropbox(original_cropbox)
|
1104 |
|
1105 |
+
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]= list(), page_sizes_df:pd.DataFrame=pd.DataFrame()):
|
1106 |
|
1107 |
rect_height = page.rect.height
|
1108 |
rect_width = page.rect.width
|
|
|
1129 |
image_dimensions = {}
|
1130 |
|
1131 |
out_annotation_boxes = {}
|
1132 |
+
all_image_annotation_boxes = list()
|
1133 |
|
1134 |
if isinstance(image, Image.Image):
|
1135 |
image_path = move_page_info(str(page))
|
|
|
1240 |
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
1241 |
###
|
1242 |
|
1243 |
+
def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogniser_results: list = list(), page_handwriting_recogniser_results: list = list(), handwrite_signature_checkbox: List[str] = ["Extract handwriting", "Extract signatures"], horizontal_threshold: int = 50, vertical_threshold: int = 12):
|
1244 |
+
"""
|
1245 |
+
Merges bounding boxes for image annotations based on the provided results from signature and handwriting recognizers.
|
1246 |
+
|
1247 |
+
Args:
|
1248 |
+
bboxes (list): A list of bounding boxes to be merged.
|
1249 |
+
combined_results (Dict): A dictionary containing combined results with line text and their corresponding bounding boxes.
|
1250 |
+
page_signature_recogniser_results (list, optional): A list of results from the signature recognizer. Defaults to an empty list.
|
1251 |
+
page_handwriting_recogniser_results (list, optional): A list of results from the handwriting recognizer. Defaults to an empty list.
|
1252 |
+
handwrite_signature_checkbox (List[str], optional): A list of options indicating whether to extract handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1253 |
+
horizontal_threshold (int, optional): The threshold for merging bounding boxes horizontally. Defaults to 50.
|
1254 |
+
vertical_threshold (int, optional): The threshold for merging bounding boxes vertically. Defaults to 12.
|
1255 |
+
|
1256 |
+
Returns:
|
1257 |
+
None: This function modifies the bounding boxes in place and does not return a value.
|
1258 |
+
"""
|
1259 |
|
1260 |
+
all_bboxes = list()
|
1261 |
+
merged_bboxes = list()
|
1262 |
grouped_bboxes = defaultdict(list)
|
1263 |
|
1264 |
# Deep copy original bounding boxes to retain them
|
|
|
1273 |
merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
|
1274 |
|
1275 |
# Reconstruct bounding boxes for substrings of interest
|
1276 |
+
reconstructed_bboxes = list()
|
1277 |
for bbox in bboxes:
|
1278 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
1279 |
for line_text, line_info in combined_results.items():
|
|
|
1283 |
start_char = line_text.index(bbox.text)
|
1284 |
end_char = start_char + len(bbox.text)
|
1285 |
|
1286 |
+
relevant_words = list()
|
1287 |
current_char = 0
|
1288 |
for word in line_info['words']:
|
1289 |
word_end = current_char + len(word['text'])
|
|
|
1372 |
page_max:int=999,
|
1373 |
text_extraction_method:str=TESSERACT_TEXT_EXTRACT_OPTION,
|
1374 |
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
1375 |
+
textract_request_metadata:list=list(),
|
1376 |
current_loop_page:int=0,
|
1377 |
page_break_return:bool=False,
|
1378 |
+
annotations_all_pages:List=list(),
|
1379 |
+
all_page_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"]),
|
1380 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
|
1381 |
+
pymupdf_doc:Document = list(),
|
1382 |
pii_identification_method:str="Local",
|
1383 |
comprehend_query_number:int=0,
|
1384 |
comprehend_client:str="",
|
1385 |
textract_client:str="",
|
1386 |
+
custom_recogniser_word_list:List[str]=list(),
|
1387 |
+
redact_whole_page_list:List[str]=list(),
|
1388 |
max_fuzzy_spelling_mistakes_num:int=1,
|
1389 |
match_fuzzy_whole_phrase_bool:bool=True,
|
1390 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
1391 |
text_extraction_only:bool=False,
|
1392 |
+
all_page_line_level_ocr_results = list(),
|
1393 |
+
all_page_line_level_ocr_results_with_words = list(),
|
1394 |
+
chosen_local_model:str="tesseract",
|
1395 |
page_break_val:int=int(PAGE_BREAK_VALUE),
|
1396 |
+
log_files_output_paths:List=list(),
|
1397 |
max_time:int=int(MAX_TIME_VALUE),
|
1398 |
+
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
1399 |
+
output_folder:str=OUTPUT_FOLDER,
|
1400 |
progress=Progress(track_tqdm=True)):
|
1401 |
|
1402 |
'''
|
1403 |
+
This function redacts sensitive information from a PDF document. It takes the following parameters in order:
|
1404 |
|
1405 |
- file_path (str): The path to the PDF file to be redacted.
|
1406 |
- pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
|
|
|
1413 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
|
1414 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1415 |
- textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
|
1416 |
+
- current_loop_page (int, optional): The current page being processed. Defaults to 0.
|
1417 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
1418 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
1419 |
+
- all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
1420 |
- all_pages_decision_process_table (pd.DataFrame, optional): All redaction decisions for document as a Pandas dataframe.
|
1421 |
- pymupdf_doc (Document, optional): The document as a PyMupdf object.
|
1422 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
|
|
1429 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
1430 |
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
|
1431 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
1432 |
+
- all_page_line_level_ocr_results (optional): List of all page line level OCR results.
|
1433 |
+
- all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
|
1434 |
+
- chosen_local_model (str, optional): The local model chosen for OCR. Defaults to "tesseract", other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
|
1435 |
+
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
|
1436 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
1437 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1438 |
+
- nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
|
1439 |
+
- output_folder (str, optional): The folder for file outputs.
|
1440 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
1441 |
|
1442 |
The function returns a redacted PDF document along with processing output objects.
|
|
|
1447 |
file_name = get_file_name_without_type(file_path)
|
1448 |
comprehend_query_number_new = 0
|
1449 |
|
1450 |
+
# Try updating the supported languages for the spacy analyser
|
1451 |
+
try:
|
1452 |
+
nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
|
1453 |
+
# Check list of nlp_analyser recognisers and languages
|
1454 |
+
if language != "en":
|
1455 |
+
gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
|
1456 |
+
|
1457 |
+
except Exception as e:
|
1458 |
+
print(f"Error creating nlp_analyser for {language}: {e}")
|
1459 |
+
raise Exception(f"Error creating nlp_analyser for {language}: {e}")
|
1460 |
+
|
1461 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1462 |
if custom_recogniser_word_list:
|
1463 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
|
|
1468 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
1469 |
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
1470 |
|
1471 |
+
# Only load in PaddleOCR models if not running Textract
|
1472 |
+
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1473 |
+
image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine="tesseract", language=language)
|
1474 |
+
else:
|
1475 |
+
image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine=chosen_local_model, language=language)
|
1476 |
|
1477 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1478 |
out_message = "Connection to AWS Comprehend service unsuccessful."
|
|
|
1482 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
|
1483 |
out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
|
1484 |
print(out_message_warning)
|
1485 |
+
#raise Exception(out_message)
|
1486 |
|
1487 |
number_of_pages = pymupdf_doc.page_count
|
1488 |
print("Number of pages:", str(number_of_pages))
|
|
|
1501 |
textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
|
1502 |
original_textract_data = textract_data.copy()
|
1503 |
|
1504 |
+
#print("Successfully loaded in Textract analysis results from file")
|
1505 |
|
1506 |
# If running local OCR option, check if file already exists. If it does, load in existing data
|
1507 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
|
|
1509 |
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
1510 |
original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
1511 |
|
1512 |
+
#print("Loaded in local OCR analysis results from file")
|
1513 |
|
1514 |
###
|
1515 |
if current_loop_page == 0: page_loop_start = 0
|
|
|
1518 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1519 |
|
1520 |
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1521 |
+
all_line_level_ocr_results_list = list()
|
1522 |
+
all_pages_decision_process_list = list()
|
1523 |
|
1524 |
+
if not all_page_line_level_ocr_results_df.empty:
|
1525 |
+
all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
|
1526 |
if not all_pages_decision_process_table.empty:
|
1527 |
all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
|
1528 |
|
|
|
1530 |
# Go through each page
|
1531 |
for page_no in progress_bar:
|
1532 |
|
1533 |
+
handwriting_or_signature_boxes = list()
|
1534 |
+
page_signature_recogniser_results = list()
|
1535 |
+
page_handwriting_recogniser_results = list()
|
1536 |
+
page_line_level_ocr_results_with_words = list()
|
1537 |
page_break_return = False
|
1538 |
reported_page_number = str(page_no + 1)
|
1539 |
|
|
|
1571 |
print("Can't find original cropbox details for page, using current PyMuPDF page cropbox")
|
1572 |
original_cropbox = pymupdf_page.cropbox.irect
|
1573 |
|
|
|
|
|
|
|
|
|
1574 |
# Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
|
|
|
1575 |
# If using Tesseract
|
1576 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1577 |
|
|
|
1584 |
)
|
1585 |
|
1586 |
page_line_level_ocr_results_with_words = matching_page if matching_page else []
|
1587 |
+
else: page_line_level_ocr_results_with_words = list()
|
1588 |
|
1589 |
if page_line_level_ocr_results_with_words:
|
1590 |
print("Found OCR results for page in existing OCR with words object")
|
|
|
1594 |
|
1595 |
page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
|
1596 |
|
1597 |
+
if all_page_line_level_ocr_results_with_words is None:
|
1598 |
+
all_page_line_level_ocr_results_with_words = list()
|
1599 |
+
|
1600 |
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
1601 |
|
1602 |
# Check if page exists in existing textract data. If not, send to service to analyse
|
1603 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
1604 |
+
text_blocks = list()
|
1605 |
|
1606 |
if not textract_data:
|
1607 |
try:
|
|
|
1639 |
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1640 |
|
1641 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1642 |
+
if "pages" not in textract_data: textract_data["pages"] = list()
|
1643 |
|
1644 |
# Append the new page data
|
1645 |
textract_data["pages"].append(text_blocks)
|
|
|
1647 |
except Exception as e:
|
1648 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
1649 |
print(out_message)
|
1650 |
+
text_blocks = list()
|
1651 |
new_textract_request_metadata = "Failed Textract API call"
|
1652 |
|
1653 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1654 |
+
if "pages" not in textract_data: textract_data["pages"] = list()
|
1655 |
|
1656 |
raise Exception(out_message)
|
1657 |
|
|
|
1663 |
|
1664 |
page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1665 |
|
1666 |
+
if all_page_line_level_ocr_results_with_words is None:
|
1667 |
+
all_page_line_level_ocr_results_with_words = list()
|
1668 |
+
|
1669 |
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
1670 |
|
1671 |
# Convert to DataFrame and add to ongoing logging table
|
|
|
1675 |
'left': result.left,
|
1676 |
'top': result.top,
|
1677 |
'width': result.width,
|
1678 |
+
'height': result.height,
|
1679 |
+
'line': result.line
|
1680 |
} for result in page_line_level_ocr_results['results']])
|
1681 |
|
1682 |
if not line_level_ocr_results_df.empty: # Ensure there are records to add
|
|
|
1691 |
page_line_level_ocr_results_with_words['results'],
|
1692 |
chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
|
1693 |
pii_identification_method = pii_identification_method,
|
1694 |
+
comprehend_client=comprehend_client,
|
1695 |
+
custom_entities=chosen_redact_entities,
|
1696 |
language=language,
|
|
|
1697 |
allow_list=allow_list,
|
1698 |
+
score_threshold=score_threshold,
|
1699 |
+
nlp_analyser=nlp_analyser
|
1700 |
)
|
1701 |
|
1702 |
comprehend_query_number = comprehend_query_number + comprehend_query_number_new
|
1703 |
|
1704 |
+
else: page_redaction_bounding_boxes = list()
|
1705 |
|
1706 |
# Merge redaction bounding boxes that are close together
|
1707 |
page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
|
1708 |
|
1709 |
+
else: page_merged_redaction_bboxes = list()
|
1710 |
|
1711 |
# 3. Draw the merged boxes
|
1712 |
## Apply annotations to pdf with pymupdf
|
|
|
1733 |
fill = (0, 0, 0) # Fill colour for redactions
|
1734 |
draw = ImageDraw.Draw(image)
|
1735 |
|
1736 |
+
all_image_annotations_boxes = list()
|
1737 |
|
1738 |
for box in page_merged_redaction_bboxes:
|
1739 |
|
|
|
1775 |
|
1776 |
page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
|
1777 |
|
1778 |
+
redacted_image = image.copy()
|
|
|
|
|
1779 |
|
1780 |
# Convert decision process to table
|
1781 |
decision_process_table = pd.DataFrame([{
|
|
|
1797 |
all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
|
1798 |
|
1799 |
decision_process_table = fill_missing_ids(decision_process_table)
|
|
|
1800 |
|
1801 |
toc = time.perf_counter()
|
1802 |
|
|
|
1931 |
return characters
|
1932 |
return []
|
1933 |
|
|
|
|
|
|
|
|
|
1934 |
|
1935 |
+
def create_line_level_ocr_results_from_characters(char_objects: List, line_number: int) -> Tuple[List[OCRResult], List[List]]:
    """
    Create line-level OCRResult objects from a list of pdfminer LTChar/LTAnno objects.

    Characters are accumulated into a running line buffer; an LTAnno containing a
    newline terminates the current line, at which point an OCRResult is emitted
    carrying the line's stripped text and overall bounding box.

    Args:
        char_objects: pdfminer.six LTChar/LTAnno objects for (part of) a page.
        line_number: The line number to assign to the first emitted line; it is
            incremented for each subsequent line.

    Returns:
        A tuple of:
        1. A list of OCRResult objects, one per detected text line.
        2. A parallel list of lists holding the character objects for each line.
    """
    line_level_results_out = list()
    line_level_characters_out = list()
    character_objects_out = list()

    full_text = ""
    # Running line bounding box as [x0, y0, x1, y1] in pdfminer coordinates.
    overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]

    for char in char_objects:
        character_objects_out.append(char)

        if isinstance(char, LTAnno):
            added_text = char.get_text()
            full_text += added_text

            if '\n' in added_text:
                if full_text.strip():
                    # Newline seen: emit an OCRResult for the completed line.
                    line_level_results_out.append(OCRResult(
                        text=full_text.strip(),
                        left=round(overall_bbox[0], 2),
                        top=round(overall_bbox[1], 2),
                        width=round(overall_bbox[2] - overall_bbox[0], 2),
                        height=round(overall_bbox[3] - overall_bbox[1], 2),
                        line=line_number
                    ))
                    line_level_characters_out.append(character_objects_out)

                # Reset accumulators for the next line.
                character_objects_out = list()
                full_text = ""
                overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
                line_number += 1
            continue

        # LTChar: append its (cleaned) text and grow the line bounding box.
        added_text = clean_unicode_text(char.get_text())
        full_text += added_text

        x0, y0, x1, y1 = char.bbox
        overall_bbox[0] = min(overall_bbox[0], x0)
        overall_bbox[1] = min(overall_bbox[1], y0)
        overall_bbox[2] = max(overall_bbox[2], x1)
        overall_bbox[3] = max(overall_bbox[3], y1)

    # Process any trailing line that was not terminated by a newline.
    if full_text.strip():
        # Fix: emit with the CURRENT line_number. The original pre-incremented
        # here, which skipped a number for the final line and could collide
        # with the caller's running text_line_no bookkeeping for the next
        # container (duplicate "text_line_N" keys downstream).
        line_ocr_result = OCRResult(
            text=full_text.strip(),
            left=round(overall_bbox[0], 2),
            top=round(overall_bbox[1], 2),
            width=round(overall_bbox[2] - overall_bbox[0], 2),
            height=round(overall_bbox[3] - overall_bbox[1], 2),
            line=line_number
        )
        line_level_results_out.append(line_ocr_result)
        line_level_characters_out.append(character_objects_out)

    return line_level_results_out, line_level_characters_out
|
2000 |
|
2001 |
+
def generate_words_for_line(line_chars: List) -> List[Dict[str, Any]]:
    """
    Generate word-level results for a single, pre-defined line of characters.

    Word breaks are identified by:
    1. Treating specific punctuation characters as standalone words.
    2. Explicitly using space characters (' ') as a primary word separator.
    3. Using a geometric gap between characters as a secondary, heuristic separator.

    Args:
        line_chars: A list of pdfminer.six LTChar/LTAnno objects for one line.

    Returns:
        A list of dictionaries, where each dictionary represents an individual
        word with keys "text" and "bounding_box".
    """
    # We only care about characters with coordinates and text for word building.
    text_chars = [c for c in line_chars if hasattr(c, 'bbox') and c.get_text()]

    if not text_chars:
        return []

    # Sort characters by horizontal position for correct processing.
    text_chars.sort(key=lambda c: c.bbox[0])

    # Punctuation that should be split into separate words. The hyphen '-' is
    # intentionally excluded to keep words like 'high-tech' together.
    PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}

    line_words = list()
    current_word_text = ""
    current_word_bbox = [float('inf'), float('inf'), -1, -1]  # [x0, y0, x1, y1]
    prev_char = None

    def finalize_word():
        # Flush the word being built into line_words, then reset the buffers.
        nonlocal current_word_text, current_word_bbox
        # Only add the word if it contains non-space text.
        if current_word_text.strip():
            # Reorder bbox from pdfminer [x0, y0, x1, y1] into the required
            # output format (top coordinate taken from pdfminer's y1).
            final_bbox = [
                round(current_word_bbox[0], 2),
                round(current_word_bbox[3], 2),  # y1 from pdfminer bbox
                round(current_word_bbox[2], 2),
                round(current_word_bbox[1], 2),  # y0 from pdfminer bbox
            ]
            line_words.append({
                "text": current_word_text.strip(),
                "bounding_box": final_bbox
            })
        # Reset for the next word.
        current_word_text = ""
        current_word_bbox = [float('inf'), float('inf'), -1, -1]

    for char in text_chars:
        char_text = clean_unicode_text(char.get_text())

        # 1. Check for splitting punctuation first.
        if char_text in PUNCTUATION_TO_SPLIT:
            # Finalize any word that came immediately before the punctuation.
            finalize_word()

            # Treat the punctuation itself as a separate word.
            px0, py0, px1, py1 = char.bbox
            punc_bbox = [round(px0, 2), round(py1, 2), round(px1, 2), round(py0, 2)]
            line_words.append({
                "text": char_text,
                "bounding_box": punc_bbox
            })

            prev_char = char
            continue  # Skip to the next character

        # 2. Primary signal: a space always ends the current word.
        if char_text.isspace():
            finalize_word()
            prev_char = char
            continue  # The space itself is not added to any word

        # 3. Secondary signal: a large geometric gap also ends the word.
        if prev_char:
            # A gap is a word break if larger than a fraction of the font size.
            space_threshold = prev_char.size * 0.25  # 25% of the char size
            min_gap = 1.0  # Or at least 1.0 unit
            gap = char.bbox[0] - prev_char.bbox[2]  # current x0 - previous x1

            if gap > max(space_threshold, min_gap):
                finalize_word()  # Found a gap, so end the previous word.

        # Append the character's text and update the current word's bounding box.
        current_word_text += char_text

        x0, y0, x1, y1 = char.bbox
        current_word_bbox[0] = min(current_word_bbox[0], x0)
        # Fix: the original accumulated the vertical extent against the wrong
        # indices (min over index 3, max over the just-written index 1), which
        # left the bottom coordinate stuck at the -1 sentinel and produced
        # invalid word bounding boxes.
        current_word_bbox[1] = min(current_word_bbox[1], y0)  # pdfminer y0 is bottom
        current_word_bbox[2] = max(current_word_bbox[2], x1)
        current_word_bbox[3] = max(current_word_bbox[3], y1)  # pdfminer y1 is top

        prev_char = char

    # After the loop, finalize the last word that was being built.
    finalize_word()

    return line_words
|
2103 |
|
2104 |
+
def process_page_to_structured_ocr(
    all_char_objects: List,
    page_number: int,
    text_line_number: int,  # Treated as the STARTING line number for this batch
) -> Tuple[Dict[str, Any], List[OCRResult], List[List]]:
    """
    Orchestrate the character-to-OCR conversion for a page, correctly
    handling multiple lines.

    Returns:
        A tuple containing:
        1. A dictionary with detailed line/word results for the page.
        2. A list of the complete OCRResult objects for each line.
        3. A list of lists, containing the character objects for each line.
    """
    page_data = {"page": str(page_number), "results": {}}

    # Step 1: derive the definitive lines and their character groups. This
    # helper returns every line found in the supplied characters.
    line_results, lines_char_groups = create_line_level_ocr_results_from_characters(
        all_char_objects, text_line_number
    )

    if not line_results:
        return {}, [], []

    # Step 2: build word-level results for every detected line and record
    # each line under its own unique key.
    for line_info, char_group in zip(line_results, lines_char_groups):
        current_line_number = line_info.line
        word_level_results = generate_words_for_line(char_group)

        line_key = f"text_line_{current_line_number}"
        line_bbox = [
            line_info.left,
            line_info.top,
            line_info.left + line_info.width,
            line_info.top + line_info.height,
        ]

        page_data["results"][line_key] = {
            "line": current_line_number,
            "text": line_info.text,
            "bounding_box": line_bbox,
            "words": word_level_results,
        }

    # The list of OCRResult objects is already in its final form; return it
    # alongside the structured dictionary and the per-line character groups.
    return page_data, line_results, lines_char_groups
|
2153 |
|
2154 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
2155 |
decision_process_table = pd.DataFrame()
|
|
|
2175 |
return decision_process_table
|
2176 |
|
2177 |
def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
2178 |
+
pikepdf_redaction_annotations_on_page = list()
|
2179 |
for analysed_bounding_box in analysed_bounding_boxes:
|
2180 |
|
2181 |
bounding_box = analysed_bounding_box["boundingBox"]
|
|
|
2208 |
page_max: int = 999, # Maximum page number to end redaction
|
2209 |
current_loop_page: int = 0, # Current page being processed in the loop
|
2210 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
2211 |
+
annotations_all_pages: List[dict] = list(), # List of annotations across all pages
|
2212 |
+
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"]), # DataFrame for OCR results
|
2213 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
2214 |
+
pymupdf_doc: List = list(), # List of PyMuPDF documents
|
2215 |
+
all_page_line_level_ocr_results_with_words: List = list(),
|
2216 |
+
pii_identification_method: str = "Local",
|
2217 |
comprehend_query_number:int = 0,
|
2218 |
comprehend_client="",
|
2219 |
+
custom_recogniser_word_list:List[str]=list(),
|
2220 |
+
redact_whole_page_list:List[str]=list(),
|
2221 |
max_fuzzy_spelling_mistakes_num:int=1,
|
2222 |
match_fuzzy_whole_phrase_bool:bool=True,
|
2223 |
page_sizes_df:pd.DataFrame=pd.DataFrame(),
|
2224 |
+
original_cropboxes:List[dict]=list(),
|
2225 |
text_extraction_only:bool=False,
|
2226 |
output_folder:str=OUTPUT_FOLDER,
|
2227 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
2228 |
+
max_time: int = int(MAX_TIME_VALUE),
|
2229 |
+
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
2230 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
2231 |
+
):
|
|
|
2232 |
'''
|
2233 |
Redact chosen entities from a PDF that is made up of multiple pages that are not images.
|
2234 |
|
|
|
2254 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
2255 |
- max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
|
2256 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
2257 |
+
- page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
|
2258 |
- original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
|
2259 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
2260 |
+
- language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
|
2261 |
- output_folder (str, optional): The output folder for the function
|
2262 |
- page_break_val: Value for page break
|
2263 |
+
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
2264 |
+
- nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
|
2265 |
- progress: Progress tracking object
|
2266 |
'''
|
2267 |
|
2268 |
+
tic = time.perf_counter()
|
2269 |
|
2270 |
if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
|
2271 |
all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
|
|
|
2277 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
2278 |
out_message = "Connection to AWS Comprehend service not found."
|
2279 |
raise Exception(out_message)
|
2280 |
+
|
2281 |
+
# Try updating the supported languages for the spacy analyser
|
2282 |
+
try:
|
2283 |
+
nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
|
2284 |
+
# Check list of nlp_analyser recognisers and languages
|
2285 |
+
if language != "en":
|
2286 |
+
gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
|
2287 |
+
|
2288 |
+
except Exception as e:
|
2289 |
+
print(f"Error creating nlp_analyser for {language}: {e}")
|
2290 |
+
raise Exception(f"Error creating nlp_analyser for {language}: {e}")
|
2291 |
|
2292 |
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
2293 |
if custom_recogniser_word_list:
|
|
|
2297 |
|
2298 |
nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
|
2299 |
new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
|
2300 |
+
nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
|
2301 |
|
2302 |
# Open with Pikepdf to get text lines
|
2303 |
pikepdf_pdf = Pdf.open(file_path)
|
2304 |
number_of_pages = len(pikepdf_pdf.pages)
|
2305 |
|
2306 |
+
#file_name = get_file_name_without_type(file_path)
|
2307 |
+
|
2308 |
+
if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words = list()
|
2309 |
|
|
|
|
|
|
|
2310 |
# Check that page_min and page_max are within expected ranges
|
2311 |
+
if page_max > number_of_pages or page_max == 0: page_max = number_of_pages
|
|
|
2312 |
|
2313 |
if page_min <= 0: page_min = 0
|
2314 |
else: page_min = page_min - 1
|
|
|
2338 |
# Go page by page
|
2339 |
for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
|
2340 |
|
2341 |
+
all_page_line_text_extraction_characters = list()
|
2342 |
+
all_page_line_level_text_extraction_results_list = list()
|
2343 |
+
page_analyser_results = list()
|
2344 |
+
page_redaction_bounding_boxes = list()
|
2345 |
|
2346 |
+
characters = list()
|
2347 |
+
pikepdf_redaction_annotations_on_page = list()
|
2348 |
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2349 |
+
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
|
2350 |
+
page_text_ocr_outputs_list = list()
|
2351 |
|
2352 |
+
text_line_no = 1
|
2353 |
for n, text_container in enumerate(page_layout):
|
2354 |
+
characters = list()
|
2355 |
|
2356 |
if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
|
2357 |
characters = get_text_container_characters(text_container)
|
2358 |
+
#text_line_no += 1
|
2359 |
|
2360 |
# Create dataframe for all the text on the page
|
2361 |
+
# line_level_text_results_list, line_characters = create_line_level_ocr_results_from_characters(characters)
|
2362 |
+
|
2363 |
+
# line_level_ocr_results_with_words = generate_word_level_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
|
2364 |
+
|
2365 |
+
line_level_ocr_results_with_words, line_level_text_results_list, line_characters = process_page_to_structured_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
|
2366 |
|
2367 |
+
text_line_no += len(line_level_text_results_list)
|
2368 |
|
2369 |
### Create page_text_ocr_outputs (OCR format outputs)
|
2370 |
if line_level_text_results_list:
|
|
|
2375 |
'left': result.left,
|
2376 |
'top': result.top,
|
2377 |
'width': result.width,
|
2378 |
+
'height': result.height,
|
2379 |
+
'line': result.line
|
2380 |
} for result in line_level_text_results_list])
|
2381 |
|
2382 |
+
page_text_ocr_outputs_list.append(line_level_text_results_df)
|
2383 |
|
2384 |
all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
|
2385 |
all_page_line_text_extraction_characters.extend(line_characters)
|
2386 |
+
all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
|
2387 |
+
|
2388 |
+
#print("page_text_ocr_outputs_list:", page_text_ocr_outputs_list)
|
2389 |
+
page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
|
2390 |
+
#page_text_ocr_outputs.to_csv("output/page_text_ocr_outputs.csv")
|
2391 |
|
2392 |
### REDACTION
|
2393 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
|
|
2413 |
# Annotate redactions on page
|
2414 |
pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
|
2415 |
|
2416 |
+
else: pikepdf_redaction_annotations_on_page = list()
|
2417 |
|
2418 |
# Make pymupdf page redactions
|
2419 |
if redact_whole_page_list:
|
|
|
2437 |
|
2438 |
# Join extracted text outputs for all lines together
|
2439 |
if not page_text_ocr_outputs.empty:
|
2440 |
+
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["line"]).reset_index(drop=True)
|
2441 |
+
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height", "line"]]
|
2442 |
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2443 |
|
2444 |
toc = time.perf_counter()
|
|
|
2464 |
# Write logs
|
2465 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2466 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
2467 |
+
|
2468 |
+
print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
|
2469 |
|
2470 |
current_loop_page += 1
|
2471 |
|
|
|
2494 |
|
2495 |
# Write all page outputs
|
2496 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
|
|
|
|
|
|
2497 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
|
|
|
|
2498 |
|
2499 |
# Convert decision table to relative coordinates
|
2500 |
all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
2501 |
|
2502 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
2503 |
+
all_pages_decision_process_table['ymin'] = reverse_y_coords(all_pages_decision_process_table,'ymin')
|
2504 |
+
all_pages_decision_process_table['ymax'] = reverse_y_coords(all_pages_decision_process_table,'ymax')
|
2505 |
|
2506 |
# Convert decision table to relative coordinates
|
2507 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
|
|
2510 |
|
2511 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
2512 |
if not all_line_level_ocr_results_df.empty:
|
2513 |
+
all_line_level_ocr_results_df['top'] = reverse_y_coords(all_line_level_ocr_results_df,'top')
|
|
|
|
|
|
|
2514 |
|
2515 |
+
# Remove empty dictionary items from ocr results with words
|
2516 |
+
all_page_line_level_ocr_results_with_words = [d for d in all_page_line_level_ocr_results_with_words if d]
|
|
|
2517 |
|
2518 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
tools/find_duplicate_pages.py
CHANGED
@@ -1,91 +1,482 @@
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
import re
|
4 |
-
|
|
|
|
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
-
from typing import List, Tuple, Optional, Dict
|
8 |
from collections import defaultdict
|
9 |
import gradio as gr
|
10 |
from gradio import Progress
|
11 |
from pathlib import Path
|
12 |
-
from
|
|
|
13 |
from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
|
14 |
-
import
|
15 |
-
|
16 |
-
nlp = en_core_web_lg.load()
|
17 |
|
18 |
similarity_threshold = 0.95
|
|
|
|
|
19 |
|
20 |
-
def
|
21 |
"""
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
|
|
|
|
|
|
28 |
Returns:
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
"""
|
31 |
-
all_data = []
|
32 |
-
output_files = []
|
33 |
|
34 |
-
if
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
if 'page' not in df.columns or 'text' not in df.columns:
|
51 |
-
print(f"Warning: Skipping {
|
52 |
continue
|
53 |
|
|
|
54 |
df['text'] = df['text'].fillna('').astype(str)
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
else:
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
-
grouped = df #.drop('line_number_by_page', axis=1)
|
66 |
-
|
67 |
-
# Add filename column
|
68 |
-
grouped['file'] = os.path.basename(file_path)
|
69 |
-
|
70 |
-
all_data.append(grouped)
|
71 |
-
|
72 |
if not all_data:
|
73 |
-
raise ValueError("No valid
|
74 |
-
|
75 |
-
#
|
76 |
combined_df = pd.concat(all_data, ignore_index=True)
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
-
#
|
79 |
-
|
80 |
|
81 |
-
|
82 |
-
combined_df.
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
'''
|
90 |
Clean and stem text columns in a data frame
|
91 |
'''
|
@@ -176,7 +567,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
|
|
176 |
|
177 |
# 1. Save the main results DataFrame
|
178 |
similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
|
179 |
-
final_df.to_csv(similarity_file_output_path, index=False)
|
180 |
|
181 |
output_paths.append(str(similarity_file_output_path))
|
182 |
print(f"Main results saved to {similarity_file_output_path}")
|
@@ -213,156 +604,254 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
|
|
213 |
|
214 |
return output_paths
|
215 |
|
216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
df_combined: pd.DataFrame,
|
218 |
-
similarity_threshold: float =
|
219 |
-
min_word_count: int =
|
220 |
min_consecutive_pages: int = 1,
|
221 |
-
greedy_match: bool =
|
222 |
-
combine_pages:bool=
|
223 |
-
|
|
|
|
|
|
|
|
|
224 |
progress=Progress(track_tqdm=True)
|
225 |
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
|
226 |
"""
|
227 |
-
Identifies similar pages
|
228 |
-
1. Single Page: If greedy_match=False and min_consecutive_pages=1.
|
229 |
-
2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
|
230 |
-
3. Greedy Consecutive Match: If greedy_match=True.
|
231 |
"""
|
232 |
-
|
233 |
-
output_paths = []
|
234 |
progress(0.1, desc="Processing and filtering text")
|
235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
original_row_count = len(df)
|
238 |
df_filtered = df[df['word_count'] >= min_word_count].copy()
|
239 |
df_filtered.reset_index(drop=True, inplace=True)
|
240 |
|
241 |
print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
|
242 |
-
|
243 |
if len(df_filtered) < 2:
|
244 |
return pd.DataFrame(), [], df_combined
|
245 |
-
|
246 |
-
vectorizer = TfidfVectorizer()
|
247 |
-
tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
|
248 |
|
249 |
-
progress(0.3, desc="Calculating text similarity")
|
250 |
-
similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
|
251 |
-
coo_matrix = similarity_matrix.tocoo()
|
252 |
|
253 |
-
# Create a DataFrame of all individual page pairs above the threshold.
|
254 |
-
# This is the base for all three matching strategies.
|
255 |
-
similar_pages = [
|
256 |
-
(r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
|
257 |
-
if r < c and v >= similarity_threshold
|
258 |
-
]
|
259 |
-
|
260 |
-
if not similar_pages:
|
261 |
-
return pd.DataFrame(), [], df_combined
|
262 |
|
263 |
-
|
|
|
264 |
|
265 |
-
|
266 |
-
|
267 |
-
if greedy_match:
|
268 |
-
print("Finding matches using greedy consecutive strategy.")
|
269 |
-
|
270 |
-
# A set of pairs for fast lookups of (page1_idx, page2_idx)
|
271 |
-
valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
|
272 |
|
273 |
-
#
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
all_sequences = []
|
278 |
-
|
279 |
-
# Iterate through all potential starting pairs, sorted for consistent results
|
280 |
-
sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
|
281 |
-
|
282 |
-
for _, row in sorted_pairs.iterrows():
|
283 |
-
start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
|
284 |
-
|
285 |
-
# If this pair has already been consumed by a previous sequence, skip it
|
286 |
-
if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
|
287 |
-
continue
|
288 |
-
|
289 |
-
# This is a new sequence, start expanding it
|
290 |
-
current_sequence = [(start_idx1, start_idx2)]
|
291 |
-
k = 1
|
292 |
-
while True:
|
293 |
-
next_idx1 = start_idx1 + k
|
294 |
-
next_idx2 = start_idx2 + k
|
295 |
-
|
296 |
-
# Check if the next pair in the sequence is a valid match
|
297 |
-
if (next_idx1, next_idx2) in valid_pairs_set and \
|
298 |
-
next_idx1 not in consumed_indices_1 and \
|
299 |
-
next_idx2 not in consumed_indices_2:
|
300 |
-
current_sequence.append((next_idx1, next_idx2))
|
301 |
-
k += 1
|
302 |
-
else:
|
303 |
-
# The sequence has ended
|
304 |
-
break
|
305 |
-
|
306 |
-
# Record the found sequence and mark all its pages as consumed
|
307 |
-
sequence_indices_1 = [p[0] for p in current_sequence]
|
308 |
-
sequence_indices_2 = [p[1] for p in current_sequence]
|
309 |
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
return pd.DataFrame(), [], df_combined
|
|
|
|
|
321 |
|
322 |
-
|
323 |
-
# We can add back the average similarity if needed, but it requires more lookups.
|
324 |
-
# For now, we'll omit it for simplicity in the greedy approach.
|
325 |
-
# ... (The rest is metadata mapping, same as the subdocument case)
|
326 |
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
similarity_df
|
|
|
|
|
|
|
332 |
is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
|
|
|
|
|
|
|
333 |
block_id = is_consecutive.eq(False).cumsum()
|
|
|
|
|
334 |
grouped = similarity_df.groupby(block_id)
|
|
|
|
|
335 |
agg_results = grouped.agg(
|
336 |
-
Page1_Start_Index=('Page1_Index', 'first'),
|
337 |
-
|
338 |
-
|
|
|
|
|
|
|
339 |
).reset_index(drop=True)
|
340 |
-
subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
|
341 |
-
if subdocument_df.empty: return pd.DataFrame(), [], df_combined
|
342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
else:
|
344 |
-
|
345 |
-
|
346 |
final_df = map_metadata_single_page(base_similarity_df, df_filtered)
|
347 |
-
#
|
348 |
-
pass # The final_df is already prepared
|
349 |
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
progress(0.8, desc="Saving output files")
|
356 |
|
357 |
output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
|
358 |
|
359 |
-
|
360 |
-
|
361 |
-
# ==============================================================================
|
362 |
-
# GRADIO HELPER FUNCTIONS
|
363 |
-
# ==============================================================================
|
364 |
|
365 |
-
|
|
|
366 |
def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
|
367 |
"""
|
368 |
This single function handles a user selecting a row. It:
|
@@ -413,18 +902,16 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
413 |
Wrapper function updated to include the 'greedy_match' boolean.
|
414 |
"""
|
415 |
if not files:
|
416 |
-
|
417 |
-
return None, None, None
|
418 |
|
419 |
progress(0, desc="Combining input files...")
|
420 |
-
df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)
|
421 |
|
422 |
if df_combined.empty:
|
423 |
-
|
424 |
-
return None, None, None
|
425 |
|
426 |
# Call the main analysis function with the new parameter
|
427 |
-
results_df, output_paths, full_df =
|
428 |
df_combined=df_combined,
|
429 |
similarity_threshold=threshold,
|
430 |
min_word_count=min_words,
|
@@ -436,7 +923,6 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
436 |
|
437 |
# Clip text to first 200 characters
|
438 |
full_df['text'] = full_df['text'].str[:preview_length]
|
439 |
-
|
440 |
# Preprocess full_data (without preview text) for fast access (run once)
|
441 |
full_data_by_file = {
|
442 |
file: df.sort_values('page').set_index('page')
|
@@ -446,7 +932,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
446 |
if results_df.empty:
|
447 |
gr.Info(f"No duplicate pages found, no results returned.")
|
448 |
|
449 |
-
return results_df, output_paths, full_data_by_file
|
450 |
|
451 |
def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
|
452 |
"""
|
@@ -531,14 +1017,31 @@ def add_new_annotations_to_existing_page_annotations(
|
|
531 |
|
532 |
return all_annotations, newly_added_annotation_group
|
533 |
|
534 |
-
def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=
|
535 |
'''
|
536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
537 |
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
all_annotations = all_existing_annotations.copy()
|
539 |
|
540 |
if not pymupdf_doc:
|
541 |
-
message = "No document file currently under review
|
542 |
print(f"Warning: {message}")
|
543 |
raise Warning(message)
|
544 |
|
@@ -667,131 +1170,27 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
|
|
667 |
review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
|
668 |
review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
|
669 |
|
670 |
-
out_message = "Successfully created
|
671 |
print(out_message)
|
672 |
gr.Info(out_message)
|
673 |
|
674 |
return review_file_out, all_annotations
|
675 |
|
676 |
-
|
677 |
-
# --- 1. Helper Function to Parse the Combined Page/Line ID ---
|
678 |
def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
|
679 |
-
"""
|
680 |
-
|
681 |
-
|
682 |
-
|
683 |
-
|
684 |
-
Example: 100027 -> (1, 27)
|
685 |
-
200005 -> (2, 5)
|
686 |
-
"""
|
687 |
-
# zfill ensures the string is padded with leading zeros to 10 characters
|
688 |
-
s_id = str(combined_id).zfill(10)
|
689 |
-
page = int(s_id[:5])
|
690 |
-
line = int(s_id[5:])
|
691 |
-
return page, line
|
692 |
-
|
693 |
-
# def create_annotations_from_ocr_outputs(ocr_results_df_lines_to_annotate:pd.DataFrame):
|
694 |
-
# '''
|
695 |
-
# Create a set of annotation boxes based on selected ocr_results_df lines.
|
696 |
-
# '''
|
697 |
-
# annotations_by_page = []
|
698 |
-
|
699 |
-
# # --- Build Annotation Boxes for each selected line ---
|
700 |
-
# for _, line_row in ocr_results_df_lines_to_annotate.iterrows():
|
701 |
-
# # The coordinates are relative, so xmax = left + width and ymax = top + height
|
702 |
-
# box = {
|
703 |
-
# "label": "Similar Text", # Or any other label you prefer
|
704 |
-
# "xmin": line_row['left'],
|
705 |
-
# "ymin": line_row['top'] + line_row['height'],
|
706 |
-
# "xmax": line_row['left'] + line_row['width'],
|
707 |
-
# "ymax": line_row['top'] ,
|
708 |
-
# "text": line_row['text']
|
709 |
-
# }
|
710 |
-
# # --- 6. Group the box by its page number ---
|
711 |
-
# page_number = line_row['page']
|
712 |
-
# annotations_by_page[page_number].append(box)
|
713 |
-
|
714 |
-
# return annotations_by_page
|
715 |
-
|
716 |
-
# def create_annotation_objects_from_duplicates(
|
717 |
-
# duplicates_df: pd.DataFrame,
|
718 |
-
# ocr_results_df: pd.DataFrame,
|
719 |
-
# combine_pages:bool=False
|
720 |
-
# ) -> List[Dict]:
|
721 |
-
# """
|
722 |
-
# Creates structured annotation objects from selected ocr outputs.
|
723 |
-
|
724 |
-
# Args:
|
725 |
-
# duplicates_df (pd.DataFrame): DataFrame containing duplicate ranges with
|
726 |
-
# columns like 'Page2_Start_Page' and 'Page2_End_Page'.
|
727 |
-
# ocr_results_df (pd.DataFrame): DataFrame with OCR results, including columns
|
728 |
-
# 'page', 'text', 'left', 'top', 'width', 'height'.
|
729 |
-
|
730 |
-
# Returns:
|
731 |
-
# List[Dict]: A list of dictionaries, where each dict represents a page and its
|
732 |
-
# list of annotation boxes, in the format:
|
733 |
-
# [{"page": 1, "boxes": [...]}, {"page": 2, "boxes": [...]}]
|
734 |
-
# """
|
735 |
-
# annotations_by_page = []
|
736 |
-
|
737 |
-
# if combine_pages == False:
|
738 |
-
|
739 |
-
# # --- 2. Prepare OCR Data: Add a line number column if it doesn't exist ---
|
740 |
-
# if 'line_number_by_page' not in ocr_results_df.columns:
|
741 |
-
# print("Generating 'line_number_by_page' for ocr_results_df...")
|
742 |
-
# # Sort by page and original position to ensure correct line numbering
|
743 |
-
# ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
|
744 |
-
# ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
|
745 |
-
|
746 |
-
# # Use defaultdict to easily append to lists for each page
|
747 |
-
# annotations_by_page = defaultdict(list)
|
748 |
-
|
749 |
-
# # --- 3. Iterate through each duplicate range ---
|
750 |
-
# for _, row in duplicates_df.iterrows():
|
751 |
-
# # Parse the start and end page/line numbers from the duplicate row
|
752 |
-
# start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
|
753 |
-
# end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
|
754 |
-
|
755 |
-
# # --- 4. Select OCR Lines based on the range ---
|
756 |
-
# # This logic correctly handles ranges within a single page and across multiple pages
|
757 |
-
# if start_page == end_page:
|
758 |
-
# # Simple case: the range is on a single page
|
759 |
-
# condition = (
|
760 |
-
# (ocr_results_df['page'] == start_page) &
|
761 |
-
# (ocr_results_df['line_number_by_page'].between(start_line, end_line))
|
762 |
-
# )
|
763 |
-
# else:
|
764 |
-
# # Complex case: the range spans multiple pages
|
765 |
-
# # Condition for the first page in the range
|
766 |
-
# cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
|
767 |
-
# # Condition for all pages between the start and end
|
768 |
-
# cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
|
769 |
-
# # Condition for the last page in the range
|
770 |
-
# cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
|
771 |
-
|
772 |
-
# condition = cond_start | cond_middle | cond_end
|
773 |
-
|
774 |
-
# lines_to_annotate = ocr_results_df[condition]
|
775 |
-
|
776 |
-
# annotations_by_page = create_annotations_from_ocr_outputs(lines_to_annotate)
|
777 |
-
|
778 |
-
# # --- Format the final output list ---
|
779 |
-
# final_output = []
|
780 |
-
# # Sort by page number for a predictable order
|
781 |
-
# for page, boxes in sorted(annotations_by_page.items()):
|
782 |
-
# final_output.append({
|
783 |
-
# "page": page,
|
784 |
-
# "boxes": boxes
|
785 |
-
# })
|
786 |
|
787 |
-
|
|
|
|
|
788 |
|
789 |
def create_annotation_objects_from_duplicates(
|
790 |
duplicates_df: pd.DataFrame,
|
791 |
ocr_results_df: pd.DataFrame,
|
792 |
page_sizes: List[Dict],
|
793 |
-
combine_pages:bool=False
|
794 |
-
) -> List[Dict]:
|
795 |
"""
|
796 |
Creates structured annotation objects from duplicate line ranges, mapping
|
797 |
page numbers to image paths.
|
@@ -807,8 +1206,12 @@ def create_annotation_objects_from_duplicates(
|
|
807 |
"""
|
808 |
final_output = []
|
809 |
|
|
|
|
|
|
|
|
|
|
|
810 |
if combine_pages == False:
|
811 |
-
# --- NEW: Create an efficient lookup map from page number to image path ---
|
812 |
page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
|
813 |
|
814 |
# Prepare OCR Data: Add a line number column if it doesn't exist
|
@@ -850,11 +1253,8 @@ def create_annotation_objects_from_duplicates(
|
|
850 |
"id": "" # to be filled in after
|
851 |
}
|
852 |
page_number = line_row['page']
|
853 |
-
|
854 |
|
855 |
annotations_by_page[page_number].append(box)
|
856 |
-
|
857 |
-
print("annotations_by_page:", annotations_by_page)
|
858 |
|
859 |
# --- Format the final output list using the page-to-image map ---
|
860 |
final_output = []
|
@@ -878,39 +1278,5 @@ def create_annotation_objects_from_duplicates(
|
|
878 |
# Handle cases where a page might not have a corresponding image path
|
879 |
print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
|
880 |
f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
|
881 |
-
|
882 |
-
print("final_output:", final_output)
|
883 |
|
884 |
-
return final_output
|
885 |
-
|
886 |
-
# --- Example Usage ---
|
887 |
-
|
888 |
-
# 1. Create your example DataFrames
|
889 |
-
# duplicates_data = {
|
890 |
-
# 'Page1_File': ['doc_a.csv'],
|
891 |
-
# 'Page1_Start_Page': [100009],
|
892 |
-
# 'Page1_End_Page': [100021],
|
893 |
-
# 'Page2_File': ['doc_a.csv'],
|
894 |
-
# 'Page2_Start_Page': [100027], # Page 1, Line 27
|
895 |
-
# 'Page2_End_Page': [200005], # Page 2, Line 5
|
896 |
-
# }
|
897 |
-
# duplicates_df = pd.DataFrame(duplicates_data)
|
898 |
-
|
899 |
-
# ocr_data = {
|
900 |
-
# 'page': [1]*30 + [2]*10, # 30 lines on page 1, 10 on page 2
|
901 |
-
# 'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p==1 else 11))],
|
902 |
-
# # Example coordinates (using small, consistent values for demonstration)
|
903 |
-
# 'left': [0.1] * 40,
|
904 |
-
# 'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
|
905 |
-
# 'width': [0.8] * 40,
|
906 |
-
# 'height': [0.015] * 40,
|
907 |
-
# }
|
908 |
-
# ocr_results_df = pd.DataFrame(ocr_data)
|
909 |
-
|
910 |
-
|
911 |
-
# # 2. Run the function
|
912 |
-
# generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df)
|
913 |
-
|
914 |
-
# # 3. Print the result
|
915 |
-
# import json
|
916 |
-
# print(json.dumps(generated_annotations, indent=2))
|
|
|
1 |
import pandas as pd
|
2 |
import os
|
3 |
import re
|
4 |
+
import itertools
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
+
from typing import List, Tuple, Optional, Dict, Union
|
10 |
from collections import defaultdict
|
11 |
import gradio as gr
|
12 |
from gradio import Progress
|
13 |
from pathlib import Path
|
14 |
+
from typing import List
|
15 |
+
from tools.helper_functions import OUTPUT_FOLDER
|
16 |
from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
|
17 |
+
from tools.load_spacy_model_custom_recognisers import nlp
|
|
|
|
|
18 |
|
19 |
similarity_threshold = 0.95
|
20 |
+
number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
|
21 |
+
ID_MULTIPLIER = 100000
|
22 |
|
23 |
+
def split_text_with_punctuation(text: str) -> List[str]:
    """
    Tokenise *text* into words and trailing punctuation runs.

    The string is first separated on whitespace; each whitespace-delimited
    word is then broken into alternating runs of sentence punctuation
    (.,?!:;) and non-punctuation characters, mirroring the splitting used
    when word-level OCR outputs are produced.
    """
    # Matches either a run of punctuation marks, or a run of characters
    # that are neither punctuation nor whitespace.
    token_pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")

    return [
        token
        for word in text.split()
        for token in token_pattern.findall(word)
    ]
|
40 |
+
|
41 |
+
def extract_indices_from_page_ranges(
    results_df: pd.DataFrame,
    start_col: str = 'Page2_Start_Page',
    end_col: str = 'Page2_End_Page',
    modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index,  # Retained for backward compatibility; decoding now goes through _parse_page_line_id
    converted_index: bool = False  # Has the index been converted to the page_no + zeros + line number format that needs decoding back?
) -> List[int]:
    """
    Collect every index covered by the inclusive ranges in results_df.

    Each row of results_df describes an inclusive range [start_col, end_col].
    When converted_index is True, the range values are combined page/line IDs
    and only the line component (decoded via _parse_page_line_id) is kept;
    otherwise the raw values are used as-is.

    Args:
        results_df (pd.DataFrame): DataFrame holding one range per row.
        start_col (str): Column with the range start values.
        end_col (str): Column with the range end values.
        modulo_divisor_number_of_zeros (int): Unused; kept so existing callers
            passing it keep working.
        converted_index (bool): Whether range values are encoded page/line IDs.

    Returns:
        List[int]: Sorted, de-duplicated indices.
    """
    all_indices = set()

    for _, row in results_df.iterrows():
        start_page = row[start_col]
        end_page = row[end_col]
        for encoded_page_id in range(start_page, end_page + 1):
            if converted_index:
                # Only the line component is needed; the page part is discarded.
                _, original_index = _parse_page_line_id(encoded_page_id)
            else:
                original_index = encoded_page_id

            all_indices.add(original_index)
    return sorted(all_indices)
|
62 |
+
|
63 |
+
def punctuation_at_word_text_end(word_level_df_orig: pd.DataFrame) -> bool:
    """
    Check the first 1000 rows of word_level_df_orig to see if any of the strings
    in 'word_text' end with a full stop '.', exclamation mark '!', or question
    mark '?', ignoring strings that consist of one of those characters alone.

    Args:
        word_level_df_orig (pd.DataFrame): DataFrame containing word-level OCR
            data with a 'word_text' column.

    Returns:
        bool: True if any qualifying string ends with a punctuation mark,
            False otherwise (including when 'word_text' is missing).
    """
    # Get the first 1000 rows, or all rows if there are fewer than 1000
    sample_df = word_level_df_orig.head(1000)

    if 'word_text' not in sample_df.columns:
        return False

    punctuation_marks = ('.', '!', '?')

    for word_text in sample_df['word_text']:
        if pd.isna(word_text) or not isinstance(word_text, str):
            continue

        # A string that is only a single punctuation mark is not evidence
        # that words keep their trailing punctuation attached.
        if word_text.strip() in punctuation_marks:
            continue

        # str.endswith accepts a tuple of suffixes, so one call covers all marks
        if word_text.rstrip().endswith(punctuation_marks):
            return True

    return False
|
99 |
+
|
100 |
+
def run_full_search_and_analysis(
    search_query_text: str,
    word_level_df_orig: pd.DataFrame,
    similarity_threshold: float = 1,
    combine_pages: bool = False,
    min_word_count: int = 1,
    min_consecutive_pages: int = 1,
    greedy_match: bool = True,
    remake_index: bool = False,
    progress=gr.Progress(track_tqdm=True)
):
    """
    Orchestrates the full pipeline for finding text in OCR data from a user's search query.

    1. Converts the user's search query into a DataFrame format suitable for analysis.
    2. Prepares the main word-level OCR data by converting it into the required format.
    3. Combines the search query DataFrame with the prepared OCR data DataFrame.
    4. Runs the similarity analysis on the combined data using the supplied
       similarity threshold, minimum word count, minimum consecutive pages,
       and greedy match strategy.

    Parameters:
    - search_query_text (str): The text entered by the user to search for in the OCR data.
    - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
      NOTE(review): an 'index' column is added to this DataFrame in place.
    - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
    - combine_pages (bool, optional): Whether to combine text from the same page number within a file. Defaults to False.
    - min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
    - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Overwritten by the query word count. Defaults to 1.
    - greedy_match (bool, optional): Whether to use a greedy strategy for matching consecutive pages. Defaults to True.
    - remake_index (bool, optional): Whether to remake the index of the DataFrame during processing. Defaults to False.
    - progress (gr.Progress, optional): Progress object to track the operation.

    Returns:
    - Tuple of (rows of word_level_df_orig that matched the query, duplicate
      file outputs, full analysis data) — the last two as returned by
      identify_similar_text_sequences.
    """

    # Guard against queries too short or too long to search meaningfully
    if len(search_query_text) < 3:
        raise Warning("Please use a search query with at least three letters.")
    if len(search_query_text) > 100:
        raise Warning("Please use a search query with less than 100 characters.")

    # Split punctuation off the query only when the reference data does NOT
    # keep punctuation attached to word text, so query tokens align with it.
    do_punctuation_split = not punctuation_at_word_text_end(word_level_df_orig)

    # Step 1: Process the user's search query string
    search_query_data, query_word_length = create_dataframe_from_string(
        search_query_text,
        file_name="user_search_query",
        split_words=True,
        split_punctuation=do_punctuation_split,
    )
    if not search_query_data:
        # Handle case where user submits an empty search string
        raise Warning("Could not convert search string to required format")

    if query_word_length > 25:
        # Very long queries are rejected rather than partially matched
        raise Warning("Please use a query with less than 25 words")

    # Overwrite min_consecutive_pages so a match must span the whole query
    min_consecutive_pages = query_word_length

    # Create word index from reference table (mutates the caller's DataFrame)
    word_level_df_orig["index"] = word_level_df_orig.index
    word_level_df = word_level_df_orig.copy()

    # Step 2: Process the main word-level OCR DataFrame
    word_level_data = convert_word_level_df(word_level_df, file_name="source_document")

    # Step 3: Combine both data sources into one list
    all_data_to_process = search_query_data + word_level_data
    if not all_data_to_process:
        raise gr.Error("No data to process. Please check your inputs.")

    # Step 4: Run the combination logic
    combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
        input_data=all_data_to_process,
        combine_pages=combine_pages,
        output_folder=None,  # No need to save this intermediate file
        remake_index=remake_index
    )

    # Step 5: Run the final similarity analysis on the combined data
    results_df, duplicate_files, full_data = identify_similar_text_sequences(
        df_combined=combined_df,
        similarity_threshold=similarity_threshold,
        min_word_count=min_word_count,
        min_consecutive_pages=min_consecutive_pages,
        greedy_match=greedy_match,
        combine_pages=combine_pages,
        inter_file_only=True,
        do_text_clean=False,
        file1_name="user_search_query",
        file2_name="source_document",
        progress=progress
    )

    print("Finished text search")

    # Map the results back to the reference data file; converted_index mirrors
    # whether the combined page/line index format was used above.
    results_df_index_list = extract_indices_from_page_ranges(
        results_df, converted_index=remake_index
    )

    word_level_df_out = word_level_df_orig.loc[word_level_df_orig["index"].isin(results_df_index_list)]

    return word_level_df_out, duplicate_files, full_data
|
198 |
+
|
199 |
+
def create_all_data_to_process(converted_data:pd.DataFrame, other_data_list:List[Tuple]):
    """
    Concatenate the converted data with the other prepared data items.

    NOTE(review): the annotation says DataFrame, but callers appear to pass
    lists of (name, DataFrame) tuples — confirm against call sites.
    """
    return converted_data + other_data_list
|
202 |
+
|
203 |
+
def convert_word_level_df(
    word_level_df: pd.DataFrame,
    file_name: str = "converted_dataframe"
) -> List[Tuple[str, pd.DataFrame]]:
    """
    Converts a word-level OCR DataFrame to the format required by
    combine_ocr_dataframes: a simple renaming and selection of the
    relevant columns.

    Args:
        word_level_df (pd.DataFrame):
            A DataFrame containing detailed OCR output. Must include at least
            the columns: 'page', 'line', and 'word_text'.
        file_name (str, optional):
            A unique identifier or "dummy" filename to assign to the resulting
            data. Defaults to "converted_dataframe".

    Returns:
        List[Tuple[str, pd.DataFrame]]:
            A list containing a single tuple of (file_name, DataFrame), ready
            to be used as input for the combine_ocr_dataframes function. The
            DataFrame will have 'page' and 'text' columns.

    Raises:
        ValueError: If any required column is missing.
    """
    # --- 1. Validate Input ---
    required_columns = ['page', 'line', 'word_text']
    if not all(col in word_level_df.columns for col in required_columns):
        raise ValueError(f"Input DataFrame must contain all of the following columns: {required_columns}")

    df = word_level_df.copy()

    # --- 2. Process the DataFrame ---
    # Ensure word_text is a string so downstream text handling is safe
    df['word_text'] = df['word_text'].astype(str)

    # Word-level search keeps one word per row, so no per-line joining is
    # needed; simply rename 'word_text' to the required 'text' column.
    df = df.rename(columns={'word_text': 'text'})

    # --- 3. Finalise the structure ---
    final_df = df[['page', 'text']]

    # --- 4. Package for output ---
    # Return in the required List[Tuple[str, DataFrame]] format
    return [(file_name, final_df)]
|
255 |
+
|
256 |
+
def create_dataframe_from_string(
    text_string: str,
    file_name: str = "user_search_query",
    page_number: int = 1,
    split_words: bool = False,
    split_punctuation: bool = True,
) -> Tuple[List[Tuple[str, pd.DataFrame]], int]:
    """
    Converts a string into a DataFrame compatible with combine_ocr_dataframes.

    Can operate in two modes:
    1. As a single-line document (default).
    2. As a multi-line document where each word from the string is a separate line.

    Args:
        text_string (str): The input text to be placed in the DataFrame.
        file_name (str, optional): A dummy filename to assign to this text.
            Defaults to "user_search_query".
        page_number (int, optional): A dummy page number to assign. Defaults to 1.
        split_words (bool, optional): If True, splits the input string by
            whitespace and creates a row for each word. If False (default),
            the entire string is treated as a single text entry.
        split_punctuation (bool, optional): If True, splits 'end of sentence'
            punctuation off the end of the search query to match the
            reference data.

    Returns:
        Tuple[List[Tuple[str, pd.DataFrame]], int]:
            A list containing a single tuple (file_name, DataFrame) — the
            DataFrame has 'page', 'text', and 'line' columns — and the number
            of words/tokens in the search string. Returns ([], 0) if the
            input string is empty or whitespace.
    """
    # Handle empty input gracefully; this works for both modes.
    if not text_string or not text_string.strip():
        print("Warning: Input string is empty. Returning an empty list.")
        return [], 0

    if split_words:
        # One token per row, split with the same punctuation technique used
        # to create ocr_results_with_words objects when requested.
        if split_punctuation:
            words = split_text_with_punctuation(text_string)
        else:
            words = text_string.split()

        len_words = len(words)
        data = {
            'page': [page_number] * len_words,  # Assign the same page number to every word
            'text': words  # The list of words becomes the text column
        }
    else:
        # --- Entire string in one row ---
        len_words = 1
        data = {
            'page': [page_number],
            'text': [text_string]
        }

    # Create the DataFrame from the prepared data
    df = pd.DataFrame(data)

    # 1-based line number per row
    df["line"] = df.index + 1

    # Return it in the required format: a list containing one (name, df) tuple
    return [(file_name, df)], len_words
|
321 |
+
|
322 |
+
def combine_ocr_dataframes(
    input_data: List[Tuple[str, pd.DataFrame]],
    combine_pages: bool = True,
    output_folder: str = OUTPUT_FOLDER,
    output_filename: str = "combined_ocr_output.csv",
    number_of_added_zeros: int = number_of_zeros_to_add_to_index,
    remake_index: bool = True
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """
    Combines text from multiple pandas DataFrames containing page and text columns.

    Takes a list of (name, DataFrame) tuples, processes each DataFrame by
    grouping and concatenating text, and combines them into a single DataFrame.

    Args:
        input_data (List[Tuple[str, pd.DataFrame]]):
            A list of tuples, where each tuple contains a unique identifier
            (like a filename) and a pandas DataFrame. Each DataFrame must have
            'page' and 'text' columns.
        combine_pages (bool, optional):
            If True, text from the same page number within a file is joined
            into a single row. If False, each line of text gets its own row
            with a unique page identifier. Defaults to True.
        output_folder (str, optional):
            The folder where the combined CSV file will be saved. Pass a falsy
            value to skip saving. Defaults to OUTPUT_FOLDER.
        output_filename (str, optional):
            The name of the output CSV file. Defaults to "combined_ocr_output.csv".
        number_of_added_zeros (int, optional):
            Retained for backward compatibility; the combined ID is now built
            with ID_MULTIPLIER arithmetic rather than string padding.
        remake_index (bool, optional):
            When True (and combine_pages is False), builds a unique combined
            page/line ID; when False, reuses or derives an 'index' column.

    Returns:
        Tuple[pd.DataFrame, List[str], pd.DataFrame]:
            - The combined DataFrame trimmed to the standard columns
              ('file', 'page', 'text', plus 'original_page' when created).
            - A list with the path to the saved CSV (empty if nothing saved).
            - The full combined DataFrame before column trimming.

    Raises:
        ValueError: If no input DataFrame had the required columns.
    """
    all_data = []

    for file_identifier, df_initial in input_data:
        df = df_initial.copy()  # Work on a copy to avoid side effects

        # --- Validation ---
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'.")
            continue

        # --- Processing ---
        df['text'] = df['text'].fillna('').astype(str)

        if combine_pages:
            # Group by page and concatenate text into a single string
            processed_df = df.groupby('page')['text'].apply(' '.join).reset_index()
        else:
            if remake_index:
                # Build a unique, sortable ID per line using arithmetic:
                # page * ID_MULTIPLIER + line. ID_MULTIPLIER reflects the
                # maximum lines expected per page.
                df['line_number_by_page'] = df.groupby('page').cumcount() + 1
                df['original_page'] = df['page']
                df['page'] = (df['original_page'] * ID_MULTIPLIER) + df['line_number_by_page']
            else:
                # Reuse the existing row index as the unique page ID
                if 'index' not in df.columns:
                    df['index'] = df.index
                df['page'] = df['index']

            processed_df = df

        # Add the file identifier column
        processed_df['file'] = file_identifier
        all_data.append(processed_df)

    if not all_data:
        raise ValueError("No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns.")

    # --- Final Combination ---
    combined_df = pd.concat(all_data, ignore_index=True)

    # Reorder columns to a standard format, dropping intermediate columns
    final_columns = ['file', 'page', 'text']
    if 'original_page' in combined_df.columns:
        final_columns.append('original_page')  # Keep for context if created

    # Ensure all final columns exist before trying to select them
    existing_final_columns = [col for col in final_columns if col in combined_df.columns]

    # Keep the untrimmed frame for callers that need every column
    full_out_ocr_df = combined_df
    combined_df = combined_df.copy()[existing_final_columns]

    # --- Save Output ---
    output_files = []
    if output_folder and output_filename:
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, output_filename)
        combined_df.to_csv(output_path, index=False)
        output_files.append(output_path)
        print(f"Successfully combined data and saved to: {output_path}")

    return combined_df, output_files, full_out_ocr_df
|
429 |
+
|
430 |
+
def combine_ocr_output_text(
    input_files: Union[str, List[str]],
    combine_pages: bool = True,
    remake_index: bool = True,
    output_folder: str = OUTPUT_FOLDER
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """
    Reads multiple OCR CSV files, combines them, and saves the result.

    This function serves as a wrapper that reads CSV files from paths and then
    uses the `combine_ocr_dataframes` function to perform the combination logic.

    Args:
        input_files (Union[str, List[str]]): A single file path or a list of file paths.
        combine_pages (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
        remake_index (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
        output_folder (str, optional): See `combine_ocr_dataframes`. Defaults to OUTPUT_FOLDER.

    Returns:
        Tuple[pd.DataFrame, List[str], pd.DataFrame]: The combined DataFrame,
        the list of output file paths, and the full (pre-column-selection) OCR
        DataFrame, exactly as returned by `combine_ocr_dataframes`.
        (The previous annotation/docstring advertised a 2-tuple, but callers
        unpack three values.)

    Raises:
        ValueError: If none of the provided paths could be read as a CSV.
    """
    # Normalise the input so the loop below always iterates over a list.
    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    data_to_process = []
    for file_path in file_paths_list:
        try:
            df = pd.read_csv(file_path)
            # Use the base filename as the identifier
            file_identifier = os.path.basename(file_path)
            data_to_process.append((file_identifier, df))
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {file_path}")
        except Exception as e:
            print(f"Warning: Failed to read or process {file_path}. Error: {e}")

    if not data_to_process:
        raise ValueError("No valid CSV files could be read or processed.")

    # Call the core function with the loaded data
    return combine_ocr_dataframes(
        input_data=data_to_process,
        combine_pages=combine_pages,
        output_folder=output_folder,
        output_filename="combined_ocr_from_files.csv",  # Specific name for this path
        remake_index=remake_index
    )
|
478 |
+
|
479 |
+
def clean_and_stem_text_series(df:pd.DataFrame, column:str):
|
480 |
'''
|
481 |
Clean and stem text columns in a data frame
|
482 |
'''
|
|
|
567 |
|
568 |
# 1. Save the main results DataFrame
|
569 |
similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
|
570 |
+
final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
|
571 |
|
572 |
output_paths.append(str(similarity_file_output_path))
|
573 |
print(f"Main results saved to {similarity_file_output_path}")
|
|
|
604 |
|
605 |
return output_paths
|
606 |
|
607 |
+
# Define the set of punctuation characters for efficient lookup
|
608 |
+
PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
|
609 |
+
|
610 |
+
def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
|
611 |
+
"""
|
612 |
+
Helper function to compare two sequences of tokens with punctuation flexibility.
|
613 |
+
|
614 |
+
Returns True if the sequences match according to the rules:
|
615 |
+
1. An exact match is a match.
|
616 |
+
2. A reference token also matches a query token if it is the query token
|
617 |
+
followed by a single character from PUNCTUATION_TO_STRIP. This rule does not
|
618 |
+
apply if the reference token consists only of punctuation.
|
619 |
+
"""
|
620 |
+
if len(query_seq) != len(ref_seq):
|
621 |
+
return False
|
622 |
+
|
623 |
+
for query_token, ref_token in zip(query_seq, ref_seq):
|
624 |
+
# Rule 1: Check for a direct, exact match first (most common case)
|
625 |
+
if query_token == ref_token:
|
626 |
+
continue
|
627 |
+
|
628 |
+
# Rule 2: Check for the flexible punctuation match
|
629 |
+
# - The reference token must be longer than 1 character
|
630 |
+
# - Its last character must be in our punctuation set
|
631 |
+
# - The token without its last character must match the query token
|
632 |
+
if (
|
633 |
+
len(ref_token) > 1 and
|
634 |
+
ref_token[-1] in PUNCTUATION_TO_STRIP and
|
635 |
+
ref_token[:-1] == query_token
|
636 |
+
):
|
637 |
+
continue
|
638 |
+
|
639 |
+
# If neither rule applies, the tokens don't match, so the sequence doesn't match.
|
640 |
+
return False
|
641 |
+
|
642 |
+
# If the loop completes, every token has matched.
|
643 |
+
return True
|
644 |
+
|
645 |
+
|
646 |
+
def find_consecutive_sequence_matches(
    df_filtered: pd.DataFrame,
    search_file_name: str,
    reference_file_name: str
) -> pd.DataFrame:
    """
    Finds all occurrences of a consecutive sequence of tokens from a search file
    within a larger reference file.

    This function is designed for order-dependent matching, not "bag-of-words" similarity.

    Args:
        df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
        search_file_name: The name of the file containing the search query sequence.
        reference_file_name: The name of the file to search within.

    Returns:
        A DataFrame with columns ('Page1_Index', 'Page2_Index', 'Similarity_Score')
        mapping each matched token pair, or an empty DataFrame with the same
        columns if no match is found.
    """
    print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")

    # One consistent schema for every return path. (Previously the error branch
    # returned only two columns while the other branches returned three.)
    result_columns = ['Page1_Index', 'Page2_Index', 'Similarity_Score']

    # Step 1: Isolate the data for each file
    search_df = df_filtered[df_filtered['file'] == search_file_name]
    reference_df = df_filtered[df_filtered['file'] == reference_file_name]

    if search_df.empty or reference_df.empty:
        print("Error: One or both files not found or are empty.")
        return pd.DataFrame(columns=result_columns)

    # Step 2: Convert the token data into lists for easy comparison.
    # We need both the text tokens and their original global indices.
    query_tokens = search_df['text_clean'].tolist()
    query_indices = search_df.index.tolist()

    reference_tokens = reference_df['text_clean'].tolist()
    reference_indices = reference_df.index.tolist()

    query_len = len(query_tokens)
    all_found_matches = []

    print(f"Searching for a sequence of {query_len} tokens...")

    # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
    for i in range(len(reference_tokens) - query_len + 1):
        # The "window" is a slice of the reference list that is the same size as the query
        window = reference_tokens[i : i + query_len]

        # Step 4: If the window matches the query with or without punctuation on end
        if _sequences_match(query_tokens, window):
            print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")

            # Get the global indices for this entire matching block
            matching_reference_indices = reference_indices[i : i + query_len]

            # Create the mapping between query indices and the found reference indices
            # (score 1 marks an exact consecutive-sequence match).
            for j in range(query_len):
                all_found_matches.append(
                    (query_indices[j], matching_reference_indices[j], 1)
                )

            # If you only want the *first* match, you can uncomment the next line:
            # break

    if not all_found_matches:
        print("No matches found")
        gr.Info("No matches found")
        return pd.DataFrame(columns=result_columns)

    # Step 5: Create the final DataFrame in the desired format
    result_df = pd.DataFrame(all_found_matches, columns=result_columns)
    return result_df
|
718 |
+
|
719 |
+
def identify_similar_text_sequences(
    df_combined: pd.DataFrame,
    similarity_threshold: float = 1,
    min_word_count: int = 1,
    min_consecutive_pages: int = 1,
    greedy_match: bool = True,
    combine_pages: bool = False,
    inter_file_only: bool = False,
    do_text_clean:bool = True,
    file1_name: str = '',
    file2_name: str = '',
    output_folder: str = "output/",
    progress=Progress(track_tqdm=True)
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """
    Identifies similar pages. Uses a highly optimized path for inter_file_only=True.

    Args:
        df_combined: Combined OCR output; must contain a 'text' column (and the
            columns required by the downstream metadata mappers).
        similarity_threshold: Minimum cosine similarity for a pair to count as a
            match. Only used on the all-to-all TF-IDF path (inter_file_only=False).
        min_word_count: Rows whose 'text_clean' has fewer words are dropped
            before any comparison. Coerced to int; falls back to 0 on bad input.
        min_consecutive_pages: Minimum length of a consecutive run of matches
            for it to be kept when aggregating.
        greedy_match: If True (and min_consecutive_pages <= 1), keep every
            consecutive run regardless of length.
        combine_pages: Passed through to save_results_and_redaction_lists.
        inter_file_only: If True, compare file1_name against file2_name only,
            using exact consecutive token-sequence matching instead of TF-IDF.
        do_text_clean: If True, clean/stem via clean_and_stem_text_series;
            otherwise only lower-case the text.
        file1_name: Search file name (inter_file_only path only).
        file2_name: Reference file name (inter_file_only path only).
        output_folder: Folder the result files are written to.
        progress: Gradio progress tracker.

    Returns:
        Tuple of (final match DataFrame, list of saved output paths, the
        original df_combined). When nothing matches, returns
        (empty DataFrame, [], df_combined).
    """
    progress(0.1, desc="Processing and filtering text")

    if do_text_clean:
        df = clean_and_stem_text_series(df_combined, 'text') # Will produce the column 'text_clean'
    else:
        df = df_combined.copy()
        df['text_clean'] = df['text'].str.lower()#.str.replace(r'[^\w\s]', '', regex=True)


    df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
    #df['word_count'] = pd.to_numeric(df['word_count'], errors='coerce').fillna(0).astype('int64')

    # ensure min_word_count is an int (e.g., from Gradio/text input)
    try:
        min_word_count = int(min_word_count)
    except (TypeError, ValueError):
        min_word_count = 0 # or raise/log, depending on your preference

    original_row_count = len(df)
    df_filtered = df[df['word_count'] >= min_word_count].copy()
    df_filtered.reset_index(drop=True, inplace=True)

    print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
    # Fewer than two rows left means there is nothing to compare.
    if len(df_filtered) < 2:
        return pd.DataFrame(), [], df_combined

    # Similarity calculated differently if comparing between files only (inter_file_only==True), or within the same file
    if inter_file_only:

        progress(0.2, desc="Finding direct text matches...")

        #base_similarity_df = _debug_similarity_between_two_files(df_filtered, vectorizer, similarity_threshold, file1_name, file2_name)
        base_similarity_df = find_consecutive_sequence_matches(df_filtered, file1_name, file2_name)
        if base_similarity_df.empty:
            return pd.DataFrame(), [], df_combined

    else:
        # Use the original, simpler path for all-to-all comparisons (including intra-file).
        vectorizer = TfidfVectorizer()
        print("Standard Path: Calculating all-to-all similarity.")
        progress(0.2, desc="Vectorizing text...")
        tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])

        progress(0.3, desc="Calculating similarity matrix...")
        similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
        coo_matrix = similarity_matrix.tocoo()

        # Keep only the upper triangle (r < c) so each pair appears once and
        # self-matches are excluded.
        similar_pages = [
            (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
            if r < c and v >= similarity_threshold
        ]

        if not similar_pages:
            return pd.DataFrame(), [], df_combined

        base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])

    progress(0.7, desc="Aggregating results based on matching strategy")

    if greedy_match or min_consecutive_pages > 1:
        print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)

        # Sort the dataframe to ensure consecutive pages are adjacent
        similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()

        # A new sequence starts if the difference from the previous row is not (1, 1)
        # is_consecutive will be True if a row continues the sequence, False if it's a new one.
        is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)

        # Use cumsum() on the inverted boolean series to create a unique ID for each block.
        # Every time a 'False' appears (a new block starts), the sum increases.
        block_id = is_consecutive.eq(False).cumsum()

        # Group by this block ID
        grouped = similarity_df.groupby(block_id)

        # Aggregate each group to get the start, end, and length of the match
        agg_results = grouped.agg(
            Page1_Start_Index=('Page1_Index', 'first'),
            Page2_Start_Index=('Page2_Index', 'first'),
            Page1_End_Index=('Page1_Index', 'last'),
            Page2_End_Index=('Page2_Index', 'last'),
            Match_Length=('Page1_Index', 'size'),
            Avg_Similarity=('Similarity_Score', 'mean')
        ).reset_index(drop=True)

        # If greedy_match=True, we keep all matches. If min_consecutive_pages > 1, we filter.
        if greedy_match and min_consecutive_pages <= 1:
            subdocument_df = agg_results
        else:
            # This handles the case for min_consecutive_pages > 1
            subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()

        if subdocument_df.empty:
            gr.Info("No matches found")
            return pd.DataFrame(), [], df_combined

        final_df = map_metadata_subdocument(subdocument_df, df_filtered)
    else:
        print(f"Finding single page matches, not greedy (min_consecutive_pages=1)")
        # This part of your code would handle the non-sequential case
        final_df = map_metadata_single_page(base_similarity_df, df_filtered)
        #subdocument_df = final_df # To align variable names for saving

    if final_df.empty:
        gr.Info("No matches found")
        return pd.DataFrame(), [], df_combined

    progress(0.9, desc="Saving output files")

    output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)

    gr.Info(f"Found {final_df.shape[0]} match(es)")
    print(f"Found {final_df.shape[0]} match(es)")

    return final_df, output_paths, df_combined
|
854 |
+
|
855 |
def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
|
856 |
"""
|
857 |
This single function handles a user selecting a row. It:
|
|
|
902 |
Wrapper function updated to include the 'greedy_match' boolean.
|
903 |
"""
|
904 |
if not files:
|
905 |
+
raise Warning("Please upload files to analyse.")
|
|
|
906 |
|
907 |
progress(0, desc="Combining input files...")
|
908 |
+
df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
|
909 |
|
910 |
if df_combined.empty:
|
911 |
+
raise Warning("No data found in the uploaded files.")
|
|
|
912 |
|
913 |
# Call the main analysis function with the new parameter
|
914 |
+
results_df, output_paths, full_df = identify_similar_text_sequences(
|
915 |
df_combined=df_combined,
|
916 |
similarity_threshold=threshold,
|
917 |
min_word_count=min_words,
|
|
|
923 |
|
924 |
# Clip text to first 200 characters
|
925 |
full_df['text'] = full_df['text'].str[:preview_length]
|
|
|
926 |
# Preprocess full_data (without preview text) for fast access (run once)
|
927 |
full_data_by_file = {
|
928 |
file: df.sort_values('page').set_index('page')
|
|
|
932 |
if results_df.empty:
|
933 |
gr.Info(f"No duplicate pages found, no results returned.")
|
934 |
|
935 |
+
return results_df, output_paths, full_data_by_file
|
936 |
|
937 |
def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
|
938 |
"""
|
|
|
1017 |
|
1018 |
return all_annotations, newly_added_annotation_group
|
1019 |
|
1020 |
+
def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=list()):
|
1021 |
'''
|
1022 |
+
This function applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes of operation: combining pages and not combining pages. When combining pages is enabled, it attempts to identify duplicate pages across different files and applies redactions accordingly. If combining pages is disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function utilises a PyMuPDF document object to manipulate the PDF file, and it also considers the sizes of pages to ensure accurate redaction application.
|
1023 |
+
|
1024 |
+
Args:
|
1025 |
+
duplicate_page_numbers_df (pd.DataFrame): A DataFrame containing page numbers identified as duplicates.
|
1026 |
+
doc_file_name_with_extension_textbox (str): The name of the document file with its extension.
|
1027 |
+
review_file_state (pd.DataFrame): The current state of the review file.
|
1028 |
+
duplicate_output_paths (list[str]): A list of paths to files containing duplicate page information.
|
1029 |
+
pymupdf_doc (object): A PyMuPDF document object representing the PDF file.
|
1030 |
+
page_sizes (list[dict]): A list of dictionaries containing page size information.
|
1031 |
+
all_existing_annotations (list[dict]): A list of all existing annotations in the document.
|
1032 |
+
combine_pages (bool, optional): A flag indicating whether to combine pages for redaction. Defaults to True.
|
1033 |
+
new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
|
1034 |
'''
|
1035 |
+
if all_existing_annotations is None:
|
1036 |
+
all_existing_annotations = []
|
1037 |
+
|
1038 |
+
if new_annotations_with_bounding_boxes is None:
|
1039 |
+
new_annotations_with_bounding_boxes = []
|
1040 |
+
|
1041 |
all_annotations = all_existing_annotations.copy()
|
1042 |
|
1043 |
if not pymupdf_doc:
|
1044 |
+
message = "No document file currently under review"
|
1045 |
print(f"Warning: {message}")
|
1046 |
raise Warning(message)
|
1047 |
|
|
|
1170 |
review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
|
1171 |
review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
|
1172 |
|
1173 |
+
out_message = "Successfully created duplicate text redactions."
|
1174 |
print(out_message)
|
1175 |
gr.Info(out_message)
|
1176 |
|
1177 |
return review_file_out, all_annotations
|
1178 |
|
|
|
|
|
1179 |
def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
    """
    Parses a combined ID into (page, line) using modular arithmetic.

    Combined IDs are constructed as page * ID_MULTIPLIER + line_number, so the
    inverse is integer division / modulo by ID_MULTIPLIER.

    Args:
        combined_id: The combined page/line identifier (anything coercible to int).

    Returns:
        Tuple[int, int]: (page, line). Page is 0 when the ID is below ID_MULTIPLIER.
    """
    # Coerce once up front. The previous version only coerced inside the
    # comparison, so a string input would pass the first check but crash on
    # // and %, and the small-ID branch could return a non-int line value.
    combined_id = int(combined_id)

    if combined_id < ID_MULTIPLIER:
        # Handle cases where page is 0 (or just an edge case)
        return 0, combined_id

    page = combined_id // ID_MULTIPLIER
    line = combined_id % ID_MULTIPLIER
    return page, line
|
1188 |
|
1189 |
def create_annotation_objects_from_duplicates(
|
1190 |
duplicates_df: pd.DataFrame,
|
1191 |
ocr_results_df: pd.DataFrame,
|
1192 |
page_sizes: List[Dict],
|
1193 |
+
combine_pages:bool=False) -> List[Dict]:
|
|
|
1194 |
"""
|
1195 |
Creates structured annotation objects from duplicate line ranges, mapping
|
1196 |
page numbers to image paths.
|
|
|
1206 |
"""
|
1207 |
final_output = []
|
1208 |
|
1209 |
+
if duplicates_df.empty:
|
1210 |
+
raise Warning("No duplicates found")
|
1211 |
+
if ocr_results_df.empty:
|
1212 |
+
raise Warning("No OCR results found for file under review. Please upload relevant OCR_output file and original PDF document on the review tab.")
|
1213 |
+
|
1214 |
if combine_pages == False:
|
|
|
1215 |
page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
|
1216 |
|
1217 |
# Prepare OCR Data: Add a line number column if it doesn't exist
|
|
|
1253 |
"id": "" # to be filled in after
|
1254 |
}
|
1255 |
page_number = line_row['page']
|
|
|
1256 |
|
1257 |
annotations_by_page[page_number].append(box)
|
|
|
|
|
1258 |
|
1259 |
# --- Format the final output list using the page-to-image map ---
|
1260 |
final_output = []
|
|
|
1278 |
# Handle cases where a page might not have a corresponding image path
|
1279 |
print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
|
1280 |
f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
|
|
|
|
|
1281 |
|
1282 |
+
return final_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/helper_functions.py
CHANGED
@@ -9,7 +9,24 @@ import unicodedata
|
|
9 |
from typing import List
|
10 |
from math import ceil
|
11 |
from gradio_image_annotation import image_annotator
|
12 |
-
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
def reset_state_vars():
|
15 |
return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
|
@@ -22,7 +39,7 @@ def reset_state_vars():
|
|
22 |
show_share_button=False,
|
23 |
show_remove_button=False,
|
24 |
interactive=False
|
25 |
-
), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
|
26 |
|
27 |
def reset_ocr_results_state():
|
28 |
return pd.DataFrame(), pd.DataFrame(), []
|
@@ -85,8 +102,7 @@ def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost
|
|
85 |
return
|
86 |
|
87 |
def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
|
88 |
-
cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :
|
89 |
-
]
|
90 |
return cost_code_df
|
91 |
|
92 |
def ensure_folder_exists(output_folder:str):
|
@@ -114,7 +130,7 @@ def get_file_name_without_type(file_path):
|
|
114 |
|
115 |
return filename_without_extension
|
116 |
|
117 |
-
def detect_file_type(filename):
|
118 |
"""Detect the file type based on its extension."""
|
119 |
if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
|
120 |
return 'csv'
|
@@ -132,10 +148,12 @@ def detect_file_type(filename):
|
|
132 |
return 'png'
|
133 |
elif filename.endswith('.xfdf'):
|
134 |
return 'xfdf'
|
|
|
|
|
135 |
else:
|
136 |
raise ValueError("Unsupported file type.")
|
137 |
|
138 |
-
def read_file(filename):
|
139 |
"""Read the file based on its detected type."""
|
140 |
file_type = detect_file_type(filename)
|
141 |
|
@@ -156,13 +174,7 @@ def ensure_output_folder_exists(output_folder:str):
|
|
156 |
else:
|
157 |
print(f"The {output_folder} folder already exists.")
|
158 |
|
159 |
-
|
160 |
-
"""Parses a comma-separated environment variable into a list of strings."""
|
161 |
-
value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
|
162 |
-
if not value:
|
163 |
-
return []
|
164 |
-
# Split by comma and filter out any empty strings that might result from extra commas
|
165 |
-
return [s.strip() for s in value.split(',') if s.strip()]
|
166 |
|
167 |
def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
|
168 |
'''
|
@@ -188,7 +200,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
|
|
188 |
print(output_text)
|
189 |
else:
|
190 |
output_text = "No file provided."
|
191 |
-
print(output_text)
|
192 |
return output_text, custom_regex_df
|
193 |
|
194 |
return output_text, custom_regex_df
|
@@ -204,7 +216,7 @@ def put_columns_in_df(in_file:List[str]):
|
|
204 |
file_type = detect_file_type(file_name)
|
205 |
print("File type is:", file_type)
|
206 |
|
207 |
-
if file_type == 'xlsx':
|
208 |
number_of_excel_files += 1
|
209 |
new_choices = []
|
210 |
print("Running through all xlsx sheets")
|
@@ -220,10 +232,13 @@ def put_columns_in_df(in_file:List[str]):
|
|
220 |
|
221 |
all_sheet_names.extend(new_sheet_names)
|
222 |
|
223 |
-
|
224 |
df = read_file(file_name)
|
225 |
new_choices = list(df.columns)
|
226 |
|
|
|
|
|
|
|
227 |
concat_choices.extend(new_choices)
|
228 |
|
229 |
# Drop duplicate columns
|
@@ -262,7 +277,6 @@ def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:
|
|
262 |
else:
|
263 |
return False
|
264 |
|
265 |
-
#
|
266 |
def add_folder_to_path(folder_path: str):
|
267 |
'''
|
268 |
Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
|
@@ -325,10 +339,8 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
|
|
325 |
merged_csv_path = output_folder + file_out_name + "_merged.csv"
|
326 |
|
327 |
# Save the merged DataFrame to a CSV file
|
328 |
-
|
329 |
-
merged_df.to_csv(merged_csv_path, index=False)
|
330 |
output_files.append(merged_csv_path)
|
331 |
-
#merged_csv.seek(0) # Move to the beginning of the StringIO object
|
332 |
|
333 |
return output_files
|
334 |
|
@@ -575,5 +587,39 @@ def reset_base_dataframe(df:pd.DataFrame):
|
|
575 |
return df
|
576 |
|
577 |
def reset_ocr_base_dataframe(df:pd.DataFrame):
|
578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
579 |
|
|
|
9 |
from typing import List
|
10 |
from math import ceil
|
11 |
from gradio_image_annotation import image_annotator
|
12 |
+
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
|
13 |
+
# from tools.load_spacy_model_custom_recognisers import nlp_analyser
|
14 |
+
|
15 |
+
def _get_env_list(env_var_name: str) -> List[str]:
|
16 |
+
"""Parses a comma-separated environment variable into a list of strings."""
|
17 |
+
value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
|
18 |
+
if not value:
|
19 |
+
return []
|
20 |
+
# Split by comma and filter out any empty strings that might result from extra commas
|
21 |
+
return [s.strip() for s in value.split(',') if s.strip()]
|
22 |
+
|
23 |
+
# Language configuration: each *_choices value arrives from tools.config as a
# comma-separated string; _get_env_list converts non-empty values into lists.
if textract_language_choices: textract_language_choices = _get_env_list(textract_language_choices)
if aws_comprehend_language_choices: aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)

if MAPPED_LANGUAGE_CHOICES: MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
if LANGUAGE_CHOICES: LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)

# Maps full language names to their short codes by position; assumes the two
# lists are parallel and equally long — TODO confirm against tools.config.
LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
|
30 |
|
31 |
def reset_state_vars():
|
32 |
return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
|
|
|
39 |
show_share_button=False,
|
40 |
show_remove_button=False,
|
41 |
interactive=False
|
42 |
+
), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0, []
|
43 |
|
44 |
def reset_ocr_results_state():
|
45 |
return pd.DataFrame(), pd.DataFrame(), []
|
|
|
102 |
return
|
103 |
|
104 |
def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
    """Return the rows of the cost-code table whose first column equals the dropdown selection."""
    selection_mask = cost_code_df.iloc[:, 0] == cost_dropdown_selection
    return cost_code_df.loc[selection_mask, :]
|
107 |
|
108 |
def ensure_folder_exists(output_folder:str):
|
|
|
130 |
|
131 |
return filename_without_extension
|
132 |
|
133 |
+
def detect_file_type(filename:str):
|
134 |
"""Detect the file type based on its extension."""
|
135 |
if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
|
136 |
return 'csv'
|
|
|
148 |
return 'png'
|
149 |
elif filename.endswith('.xfdf'):
|
150 |
return 'xfdf'
|
151 |
+
elif filename.endswith('.docx'):
|
152 |
+
return 'docx'
|
153 |
else:
|
154 |
raise ValueError("Unsupported file type.")
|
155 |
|
156 |
+
def read_file(filename:str):
|
157 |
"""Read the file based on its detected type."""
|
158 |
file_type = detect_file_type(filename)
|
159 |
|
|
|
174 |
else:
|
175 |
print(f"The {output_folder} folder already exists.")
|
176 |
|
177 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
|
180 |
'''
|
|
|
200 |
print(output_text)
|
201 |
else:
|
202 |
output_text = "No file provided."
|
203 |
+
#print(output_text)
|
204 |
return output_text, custom_regex_df
|
205 |
|
206 |
return output_text, custom_regex_df
|
|
|
216 |
file_type = detect_file_type(file_name)
|
217 |
print("File type is:", file_type)
|
218 |
|
219 |
+
if (file_type == 'xlsx') | (file_type == 'xls'):
|
220 |
number_of_excel_files += 1
|
221 |
new_choices = []
|
222 |
print("Running through all xlsx sheets")
|
|
|
232 |
|
233 |
all_sheet_names.extend(new_sheet_names)
|
234 |
|
235 |
+
elif (file_type == "csv") | (file_type == "parquet"):
|
236 |
df = read_file(file_name)
|
237 |
new_choices = list(df.columns)
|
238 |
|
239 |
+
else:
|
240 |
+
new_choices = []
|
241 |
+
|
242 |
concat_choices.extend(new_choices)
|
243 |
|
244 |
# Drop duplicate columns
|
|
|
277 |
else:
|
278 |
return False
|
279 |
|
|
|
280 |
def add_folder_to_path(folder_path: str):
|
281 |
'''
|
282 |
Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
|
|
|
339 |
merged_csv_path = output_folder + file_out_name + "_merged.csv"
|
340 |
|
341 |
# Save the merged DataFrame to a CSV file
|
342 |
+
merged_df.to_csv(merged_csv_path, index=False, encoding="utf-8-sig")
|
|
|
343 |
output_files.append(merged_csv_path)
|
|
|
344 |
|
345 |
return output_files
|
346 |
|
|
|
587 |
return df
|
588 |
|
589 |
def reset_ocr_base_dataframe(df:pd.DataFrame):
    """Reduce an OCR dataframe to its page/line/text columns, preserving the schema when empty."""
    ocr_columns = ["page", "line", "text"]
    if df.empty:
        # Return an empty frame that still carries the expected columns.
        return pd.DataFrame(columns=ocr_columns)
    return df.loc[:, ocr_columns]
|
594 |
+
|
595 |
+
def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_redaction_value:str):
|
596 |
+
|
597 |
+
df["index"] = df.index
|
598 |
+
output_df = df.copy()
|
599 |
+
|
600 |
+
df["page"]=df["page"].astype(str)
|
601 |
+
|
602 |
+
output_df_filtered = df.loc[df["page"]==str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
|
603 |
+
return output_df_filtered, output_df
|
604 |
+
|
605 |
+
def update_language_dropdown(chosen_language_full_name_drop, textract_language_choices=textract_language_choices, aws_comprehend_language_choices=aws_comprehend_language_choices, LANGUAGE_MAP=LANGUAGE_MAP):
|
606 |
+
|
607 |
+
try:
|
608 |
+
full_language_name = chosen_language_full_name_drop.lower()
|
609 |
+
matched_language = LANGUAGE_MAP[full_language_name]
|
610 |
+
|
611 |
+
chosen_language_drop = gr.Dropdown(value = matched_language, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
|
612 |
+
|
613 |
+
if matched_language not in aws_comprehend_language_choices and matched_language not in textract_language_choices:
|
614 |
+
gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract")
|
615 |
+
elif matched_language not in aws_comprehend_language_choices:
|
616 |
+
gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend")
|
617 |
+
elif matched_language not in textract_language_choices:
|
618 |
+
gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
|
619 |
+
except Exception as e:
|
620 |
+
print(e)
|
621 |
+
gr.Info("Could not find language in list")
|
622 |
+
chosen_language_drop = gr.Dropdown(value = DEFAULT_LANGUAGE, choices = LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False)
|
623 |
+
|
624 |
+
return chosen_language_drop
|
625 |
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -1,31 +1,255 @@
|
|
1 |
from typing import List
|
2 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
3 |
-
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
|
4 |
import spacy
|
5 |
-
from spacy.matcher import Matcher
|
6 |
from spaczz.matcher import FuzzyMatcher
|
7 |
spacy.prefer_gpu()
|
8 |
from spacy.cli.download import download
|
9 |
import Levenshtein
|
10 |
import re
|
|
|
|
|
11 |
import gradio as gr
|
|
|
12 |
|
13 |
-
model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
|
14 |
score_threshold = 0.001
|
15 |
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
|
16 |
|
17 |
-
#
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
nlp = spacy.load(model_name)
|
26 |
-
print("Successfully downloaded and imported spaCy model", model_name)
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
30 |
# Create regex pattern, handling quotes carefully
|
31 |
|
@@ -172,6 +396,118 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
|
|
172 |
|
173 |
return start_positions, end_positions
|
174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
|
176 |
''' Conduct fuzzy match on a list of text data.'''
|
177 |
|
@@ -189,9 +525,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
189 |
|
190 |
for string_query in custom_query_list:
|
191 |
|
192 |
-
#print("text:", text)
|
193 |
-
#print("string_query:", string_query)
|
194 |
-
|
195 |
query = nlp(string_query)
|
196 |
|
197 |
if search_whole_phrase == False:
|
@@ -200,8 +533,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
200 |
|
201 |
spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
|
202 |
|
203 |
-
#print("token_query:", token_query)
|
204 |
-
|
205 |
if len(token_query) > 1:
|
206 |
#pattern_lemma = [{"LEMMA": {"IN": query}}]
|
207 |
pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
|
@@ -215,7 +546,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
215 |
|
216 |
else:
|
217 |
# If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
|
218 |
-
#tokenised_query = [string_query.lower()]
|
219 |
# If you want to match the whole phrase, use phrase matcher
|
220 |
matcher = FuzzyMatcher(nlp.vocab)
|
221 |
patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
|
@@ -236,9 +566,7 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
236 |
for match_id, start, end in matches:
|
237 |
span = str(doc[start:end]).strip()
|
238 |
query_search = str(query).strip()
|
239 |
-
|
240 |
-
#print("span:", span)
|
241 |
-
#print("query_search:", query_search)
|
242 |
|
243 |
# Convert word positions to character positions
|
244 |
start_char = doc[start].idx # Start character position
|
@@ -253,9 +581,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
253 |
for match_id, start, end, ratio, pattern in matches:
|
254 |
span = str(doc[start:end]).strip()
|
255 |
query_search = str(query).strip()
|
256 |
-
#print("doc:", doc)
|
257 |
-
#print("span:", span)
|
258 |
-
#print("query_search:", query_search)
|
259 |
|
260 |
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
261 |
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
@@ -269,9 +594,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
269 |
start_char = doc[start].idx # Start character position
|
270 |
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
271 |
|
272 |
-
#print("start_char:", start_char)
|
273 |
-
#print("end_char:", end_char)
|
274 |
-
|
275 |
all_matches.append(match_count)
|
276 |
all_start_positions.append(start_char)
|
277 |
all_end_positions.append(end_char)
|
@@ -281,59 +603,4 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
|
|
281 |
return all_start_positions, all_end_positions
|
282 |
|
283 |
|
284 |
-
class CustomWordFuzzyRecognizer(EntityRecognizer):
|
285 |
-
def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
|
286 |
-
super().__init__(supported_entities=supported_entities)
|
287 |
-
self.custom_list = custom_list # Store the custom_list as an instance attribute
|
288 |
-
self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
|
289 |
-
self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
|
290 |
-
|
291 |
-
def load(self) -> None:
|
292 |
-
"""No loading is required."""
|
293 |
-
pass
|
294 |
-
|
295 |
-
def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
|
296 |
-
"""
|
297 |
-
Logic for detecting a specific PII
|
298 |
-
"""
|
299 |
-
start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
|
300 |
-
|
301 |
-
results = []
|
302 |
-
|
303 |
-
for i in range(0, len(start_pos)):
|
304 |
-
result = RecognizerResult(
|
305 |
-
entity_type="CUSTOM_FUZZY",
|
306 |
-
start=start_pos[i],
|
307 |
-
end=end_pos[i],
|
308 |
-
score=1
|
309 |
-
)
|
310 |
-
results.append(result)
|
311 |
-
|
312 |
-
return results
|
313 |
-
|
314 |
-
custom_list_default = []
|
315 |
-
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
|
316 |
-
|
317 |
-
# Create a class inheriting from SpacyNlpEngine
|
318 |
-
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
319 |
-
def __init__(self, loaded_spacy_model):
|
320 |
-
super().__init__()
|
321 |
-
self.nlp = {"en": loaded_spacy_model}
|
322 |
-
|
323 |
-
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
324 |
-
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
325 |
-
|
326 |
-
|
327 |
-
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
328 |
-
default_score_threshold=score_threshold,
|
329 |
-
supported_languages=["en"],
|
330 |
-
log_decision_process=False,
|
331 |
-
)
|
332 |
-
|
333 |
-
# Add custom recognisers to nlp_analyser
|
334 |
-
nlp_analyser.registry.add_recognizer(street_recogniser)
|
335 |
-
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
336 |
-
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
337 |
-
nlp_analyser.registry.add_recognizer(custom_recogniser)
|
338 |
-
nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
|
339 |
|
|
|
1 |
from typing import List
|
2 |
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
|
3 |
+
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
|
4 |
import spacy
|
5 |
+
from spacy.matcher import Matcher
|
6 |
from spaczz.matcher import FuzzyMatcher
|
7 |
spacy.prefer_gpu()
|
8 |
from spacy.cli.download import download
|
9 |
import Levenshtein
|
10 |
import re
|
11 |
+
import os
|
12 |
+
import requests
|
13 |
import gradio as gr
|
14 |
+
from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER
|
15 |
|
|
|
16 |
score_threshold = 0.001
|
17 |
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
|
18 |
|
19 |
+
# Create a class inheriting from SpacyNlpEngine
|
20 |
+
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
21 |
+
def __init__(self, loaded_spacy_model, language_code: str):
|
22 |
+
super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
|
23 |
+
self.nlp = {language_code: loaded_spacy_model}
|
24 |
+
|
25 |
+
def _base_language_code(language: str) -> str:
|
26 |
+
lang = _normalize_language_input(language)
|
27 |
+
if "_" in lang:
|
28 |
+
return lang.split("_")[0]
|
29 |
+
return lang
|
30 |
+
|
31 |
+
def load_spacy_model(language: str = DEFAULT_LANGUAGE):
|
32 |
+
"""
|
33 |
+
Load a spaCy model for the requested language and return it as `nlp`.
|
34 |
+
|
35 |
+
Accepts common inputs like: "en", "en_lg", "en_sm", "de", "fr", "es", "it", "nl", "pt", "zh", "ja", "xx".
|
36 |
+
Falls back through sensible candidates and will download if missing.
|
37 |
+
"""
|
38 |
+
|
39 |
+
synonyms = {
|
40 |
+
"english": "en",
|
41 |
+
"catalan": "ca",
|
42 |
+
"danish": "da",
|
43 |
+
"german": "de",
|
44 |
+
"french": "fr",
|
45 |
+
"greek": "el",
|
46 |
+
"finnish": "fi",
|
47 |
+
"croatian": "hr",
|
48 |
+
"lithuanian": "lt",
|
49 |
+
"macedonian": "mk",
|
50 |
+
"norwegian_bokmaal": "nb",
|
51 |
+
"polish": "pl",
|
52 |
+
"russian": "ru",
|
53 |
+
"slovenian": "sl",
|
54 |
+
"swedish": "sv",
|
55 |
+
"dutch": "nl",
|
56 |
+
"portuguese": "pt",
|
57 |
+
"chinese": "zh",
|
58 |
+
"japanese": "ja",
|
59 |
+
"multilingual": "xx",
|
60 |
+
}
|
61 |
+
|
62 |
+
lang_norm = _normalize_language_input(language)
|
63 |
+
lang_norm = synonyms.get(lang_norm, lang_norm)
|
64 |
+
base_lang = _base_language_code(lang_norm)
|
65 |
+
|
66 |
+
candidates_by_lang = {
|
67 |
+
# English
|
68 |
+
"en": [
|
69 |
+
"en_core_web_lg",
|
70 |
+
"en_core_web_trf",
|
71 |
+
"en_core_web_md",
|
72 |
+
"en_core_web_sm",
|
73 |
+
],
|
74 |
+
"en_lg": ["en_core_web_lg"],
|
75 |
+
"en_trf": ["en_core_web_trf"],
|
76 |
+
"en_md": ["en_core_web_md"],
|
77 |
+
"en_sm": ["en_core_web_sm"],
|
78 |
+
|
79 |
+
# Major languages (news pipelines)
|
80 |
+
"ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
|
81 |
+
"da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
|
82 |
+
"de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
|
83 |
+
"el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
|
84 |
+
"es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
|
85 |
+
"fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
|
86 |
+
"fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
|
87 |
+
"hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
|
88 |
+
"it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
|
89 |
+
"ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
|
90 |
+
"ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
|
91 |
+
"lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
|
92 |
+
"mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
|
93 |
+
"nb": ["nb_core_news_lg", "nb_core_news_md", "nb_core_news_sm"], # Norwegian Bokmål
|
94 |
+
"nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
|
95 |
+
"pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
|
96 |
+
"pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
|
97 |
+
"ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
|
98 |
+
"ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
|
99 |
+
"sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
|
100 |
+
"sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
|
101 |
+
"uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
|
102 |
+
"zh": ["zh_core_web_lg", "zh_core_web_mod", "zh_core_web_sm", "zh_core_web_trf"], # Chinese
|
103 |
+
|
104 |
+
# Multilingual NER
|
105 |
+
"xx": ["xx_ent_wiki_sm"],
|
106 |
+
}
|
107 |
+
|
108 |
+
if lang_norm in candidates_by_lang:
|
109 |
+
candidates = candidates_by_lang[lang_norm]
|
110 |
+
elif base_lang in candidates_by_lang:
|
111 |
+
candidates = candidates_by_lang[base_lang]
|
112 |
+
else:
|
113 |
+
# Fallback to multilingual if unknown
|
114 |
+
candidates = candidates_by_lang["xx"]
|
115 |
+
|
116 |
+
last_error = None
|
117 |
+
for candidate in candidates:
|
118 |
+
# Try importable package first (fast-path when installed as a package)
|
119 |
+
try:
|
120 |
+
module = __import__(candidate)
|
121 |
+
print(f"Successfully imported spaCy model: {candidate}")
|
122 |
+
return module.load()
|
123 |
+
except Exception as e:
|
124 |
+
last_error = e
|
125 |
+
|
126 |
+
# Try spacy.load if package is linked/installed
|
127 |
+
try:
|
128 |
+
nlp = spacy.load(candidate)
|
129 |
+
print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
|
130 |
+
return nlp
|
131 |
+
except Exception as e:
|
132 |
+
last_error = e
|
133 |
+
|
134 |
+
# Check if model is already downloaded before attempting to download
|
135 |
+
try:
|
136 |
+
# Try to load the model to see if it's already available
|
137 |
+
nlp = spacy.load(candidate)
|
138 |
+
print(f"Model {candidate} is already available, skipping download")
|
139 |
+
return nlp
|
140 |
+
except OSError:
|
141 |
+
# Model not found, proceed with download
|
142 |
+
pass
|
143 |
+
except Exception as e:
|
144 |
+
last_error = e
|
145 |
+
continue
|
146 |
+
|
147 |
+
# Attempt to download then load
|
148 |
+
try:
|
149 |
+
print(f"Downloading spaCy model: {candidate}")
|
150 |
+
download(candidate)
|
151 |
+
nlp = spacy.load(candidate)
|
152 |
+
print(f"Successfully downloaded and loaded spaCy model: {candidate}")
|
153 |
+
return nlp
|
154 |
+
except Exception as e:
|
155 |
+
last_error = e
|
156 |
+
continue
|
157 |
+
|
158 |
+
raise RuntimeError(f"Failed to load spaCy model for language '{language}'. Last error: {last_error}")
|
159 |
+
|
160 |
+
# Language-aware spaCy model loader
|
161 |
+
def _normalize_language_input(language: str) -> str:
|
162 |
+
return language.strip().lower().replace("-", "_")
|
163 |
+
|
164 |
+
# Update the global variables to use the new function
|
165 |
+
ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
|
166 |
+
nlp = None # Placeholder, will be loaded in the create_nlp_analyser function below #load_spacy_model(DEFAULT_LANGUAGE)
|
167 |
+
|
168 |
+
def get_tesseract_lang_code(short_code:str):
|
169 |
+
"""
|
170 |
+
Maps a two-letter language code to the corresponding Tesseract OCR code.
|
171 |
+
|
172 |
+
Args:
|
173 |
+
short_code (str): The two-letter language code (e.g., "en", "de").
|
174 |
+
|
175 |
+
Returns:
|
176 |
+
str or None: The Tesseract language code (e.g., "eng", "deu"),
|
177 |
+
or None if no mapping is found.
|
178 |
+
"""
|
179 |
+
# Mapping from 2-letter codes to Tesseract 3-letter codes
|
180 |
+
# Based on ISO 639-2/T codes.
|
181 |
+
lang_map = {
|
182 |
+
"en": "eng",
|
183 |
+
"de": "deu",
|
184 |
+
"fr": "fra",
|
185 |
+
"es": "spa",
|
186 |
+
"it": "ita",
|
187 |
+
"nl": "nld",
|
188 |
+
"pt": "por",
|
189 |
+
"zh": "chi_sim", # Mapping to Simplified Chinese by default
|
190 |
+
"ja": "jpn",
|
191 |
+
"ko": "kor",
|
192 |
+
"lt": "lit",
|
193 |
+
"mk": "mkd",
|
194 |
+
"nb": "nor",
|
195 |
+
"pl": "pol",
|
196 |
+
"ro": "ron",
|
197 |
+
"ru": "rus",
|
198 |
+
"sl": "slv",
|
199 |
+
"sv": "swe",
|
200 |
+
"uk": "ukr"
|
201 |
+
}
|
202 |
+
|
203 |
+
return lang_map.get(short_code)
|
204 |
+
|
205 |
+
def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_DATA_FOLDER):
|
206 |
+
"""
|
207 |
+
Downloads a Tesseract language pack to a local directory.
|
208 |
+
|
209 |
+
Args:
|
210 |
+
lang_code (str): The short code for the language (e.g., "eng", "fra").
|
211 |
+
tessdata_dir (str, optional): The directory to save the language pack.
|
212 |
+
Defaults to "tessdata".
|
213 |
+
"""
|
214 |
+
|
215 |
+
# Create the directory if it doesn't exist
|
216 |
+
if not os.path.exists(tessdata_dir):
|
217 |
+
os.makedirs(tessdata_dir)
|
218 |
|
219 |
+
# Get the Tesseract language code
|
220 |
+
lang_code = get_tesseract_lang_code(short_lang_code)
|
|
|
|
|
221 |
|
222 |
+
if lang_code is None:
|
223 |
+
raise ValueError(f"Language code {short_lang_code} not found in Tesseract language map")
|
224 |
+
|
225 |
+
# Set the local file path
|
226 |
+
file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
|
227 |
+
|
228 |
+
# Check if the file already exists
|
229 |
+
if os.path.exists(file_path):
|
230 |
+
print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
|
231 |
+
return file_path
|
232 |
+
|
233 |
+
# Construct the URL for the language pack
|
234 |
+
url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
|
235 |
+
|
236 |
+
# Download the file
|
237 |
+
try:
|
238 |
+
response = requests.get(url, stream=True)
|
239 |
+
response.raise_for_status() # Raise an exception for bad status codes
|
240 |
+
|
241 |
+
with open(file_path, "wb") as f:
|
242 |
+
for chunk in response.iter_content(chunk_size=8192):
|
243 |
+
f.write(chunk)
|
244 |
+
|
245 |
+
print(f"Successfully downloaded {lang_code}.traineddata to {file_path}")
|
246 |
+
return file_path
|
247 |
+
|
248 |
+
except requests.exceptions.RequestException as e:
|
249 |
+
print(f"Error downloading {lang_code}.traineddata: {e}")
|
250 |
+
return None
|
251 |
+
|
252 |
+
#### Custom recognisers
|
253 |
def custom_word_list_recogniser(custom_list:List[str]=[]):
|
254 |
# Create regex pattern, handling quotes carefully
|
255 |
|
|
|
396 |
|
397 |
return start_positions, end_positions
|
398 |
|
399 |
+
|
400 |
+
class CustomWordFuzzyRecognizer(EntityRecognizer):
|
401 |
+
def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
|
402 |
+
super().__init__(supported_entities=supported_entities)
|
403 |
+
self.custom_list = custom_list # Store the custom_list as an instance attribute
|
404 |
+
self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
|
405 |
+
self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
|
406 |
+
|
407 |
+
def load(self) -> None:
|
408 |
+
"""No loading is required."""
|
409 |
+
pass
|
410 |
+
|
411 |
+
def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
|
412 |
+
"""
|
413 |
+
Logic for detecting a specific PII
|
414 |
+
"""
|
415 |
+
start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
|
416 |
+
|
417 |
+
results = []
|
418 |
+
|
419 |
+
for i in range(0, len(start_pos)):
|
420 |
+
result = RecognizerResult(
|
421 |
+
entity_type="CUSTOM_FUZZY",
|
422 |
+
start=start_pos[i],
|
423 |
+
end=end_pos[i],
|
424 |
+
score=1
|
425 |
+
)
|
426 |
+
results.append(result)
|
427 |
+
|
428 |
+
return results
|
429 |
+
|
430 |
+
custom_list_default = []
|
431 |
+
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
|
432 |
+
|
433 |
+
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
434 |
+
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
|
435 |
+
|
436 |
+
def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
|
437 |
+
spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
|
438 |
+
"""
|
439 |
+
Create an nlp_analyser object based on the specified language input.
|
440 |
+
|
441 |
+
Args:
|
442 |
+
language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
|
443 |
+
custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
|
444 |
+
spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
|
445 |
+
search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
|
446 |
+
existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
|
447 |
+
return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
|
448 |
+
|
449 |
+
Returns:
|
450 |
+
AnalyzerEngine: Configured nlp_analyser object with custom recognizers
|
451 |
+
"""
|
452 |
+
|
453 |
+
if existing_nlp_analyser is None:
|
454 |
+
pass
|
455 |
+
else:
|
456 |
+
if existing_nlp_analyser.supported_languages[0] == language:
|
457 |
+
nlp_analyser = existing_nlp_analyser
|
458 |
+
print(f"Using existing nlp_analyser for {language}")
|
459 |
+
return nlp_analyser
|
460 |
+
|
461 |
+
# Load spaCy model for the specified language
|
462 |
+
nlp_model = load_spacy_model(language)
|
463 |
+
|
464 |
+
# Get base language code
|
465 |
+
base_lang_code = _base_language_code(language)
|
466 |
+
|
467 |
+
# Create custom recognizers
|
468 |
+
if custom_list is None:
|
469 |
+
custom_list = []
|
470 |
+
|
471 |
+
custom_recogniser = custom_word_list_recogniser(custom_list)
|
472 |
+
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
|
473 |
+
supported_entities=["CUSTOM_FUZZY"],
|
474 |
+
custom_list=custom_list,
|
475 |
+
spelling_mistakes_max=spelling_mistakes_max,
|
476 |
+
search_whole_phrase=search_whole_phrase
|
477 |
+
)
|
478 |
+
|
479 |
+
# Create NLP engine with loaded model
|
480 |
+
loaded_nlp_engine = LoadedSpacyNlpEngine(
|
481 |
+
loaded_spacy_model=nlp_model,
|
482 |
+
language_code=base_lang_code
|
483 |
+
)
|
484 |
+
|
485 |
+
# Create analyzer engine
|
486 |
+
nlp_analyser = AnalyzerEngine(
|
487 |
+
nlp_engine=loaded_nlp_engine,
|
488 |
+
default_score_threshold=score_threshold,
|
489 |
+
supported_languages=[base_lang_code],
|
490 |
+
log_decision_process=False,
|
491 |
+
)
|
492 |
+
|
493 |
+
# Add custom recognizers to nlp_analyser
|
494 |
+
nlp_analyser.registry.add_recognizer(custom_recogniser)
|
495 |
+
nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
|
496 |
+
|
497 |
+
# Add language-specific recognizers for English
|
498 |
+
if base_lang_code == "en":
|
499 |
+
nlp_analyser.registry.add_recognizer(street_recogniser)
|
500 |
+
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
501 |
+
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
502 |
+
|
503 |
+
if return_also_model:
|
504 |
+
return nlp_analyser, nlp_model
|
505 |
+
|
506 |
+
return nlp_analyser
|
507 |
+
|
508 |
+
# Create the default nlp_analyser using the new function
|
509 |
+
nlp_analyser, nlp = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
|
510 |
+
|
511 |
def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
|
512 |
''' Conduct fuzzy match on a list of text data.'''
|
513 |
|
|
|
525 |
|
526 |
for string_query in custom_query_list:
|
527 |
|
|
|
|
|
|
|
528 |
query = nlp(string_query)
|
529 |
|
530 |
if search_whole_phrase == False:
|
|
|
533 |
|
534 |
spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
|
535 |
|
|
|
|
|
536 |
if len(token_query) > 1:
|
537 |
#pattern_lemma = [{"LEMMA": {"IN": query}}]
|
538 |
pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
|
|
|
546 |
|
547 |
else:
|
548 |
# If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
|
|
|
549 |
# If you want to match the whole phrase, use phrase matcher
|
550 |
matcher = FuzzyMatcher(nlp.vocab)
|
551 |
patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
|
|
|
566 |
for match_id, start, end in matches:
|
567 |
span = str(doc[start:end]).strip()
|
568 |
query_search = str(query).strip()
|
569 |
+
|
|
|
|
|
570 |
|
571 |
# Convert word positions to character positions
|
572 |
start_char = doc[start].idx # Start character position
|
|
|
581 |
for match_id, start, end, ratio, pattern in matches:
|
582 |
span = str(doc[start:end]).strip()
|
583 |
query_search = str(query).strip()
|
|
|
|
|
|
|
584 |
|
585 |
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
586 |
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
|
|
594 |
start_char = doc[start].idx # Start character position
|
595 |
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
596 |
|
|
|
|
|
|
|
597 |
all_matches.append(match_count)
|
598 |
all_start_positions.append(start_char)
|
599 |
all_end_positions.append(end_char)
|
|
|
603 |
return all_start_positions, all_end_positions
|
604 |
|
605 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
606 |
|
tools/redaction_review.py
CHANGED
@@ -1,49 +1,62 @@
|
|
1 |
import os
|
2 |
import re
|
3 |
-
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
|
|
|
|
|
|
6 |
from xml.etree.ElementTree import Element, SubElement, tostring, parse
|
7 |
from xml.dom import minidom
|
8 |
import uuid
|
9 |
-
from typing import List, Tuple
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
from pymupdf import Document, Rect
|
13 |
import pymupdf
|
14 |
from PIL import ImageDraw, Image
|
15 |
from datetime import datetime, timezone, timedelta
|
|
|
|
|
16 |
|
17 |
from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
|
18 |
-
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
|
19 |
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
20 |
from tools.file_redaction import redact_page_with_pymupdf
|
21 |
|
22 |
if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
|
23 |
|
24 |
-
def decrease_page(number:int):
|
25 |
'''
|
26 |
Decrease page number for review redactions page.
|
27 |
'''
|
|
|
|
|
|
|
28 |
if number > 1:
|
29 |
return number - 1, number - 1
|
|
|
|
|
|
|
30 |
else:
|
31 |
-
|
32 |
|
33 |
-
def increase_page(number:int,
|
34 |
'''
|
35 |
Increase page number for review redactions page.
|
36 |
'''
|
37 |
|
38 |
-
if not
|
39 |
-
|
|
|
40 |
|
41 |
-
max_pages = len(
|
42 |
|
43 |
if number < max_pages:
|
44 |
return number + 1, number + 1
|
|
|
|
|
45 |
else:
|
46 |
-
|
47 |
|
48 |
def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True):
|
49 |
if decrease == False:
|
@@ -86,8 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
86 |
recogniser_dropdown_value:str,
|
87 |
text_dropdown_value:str,
|
88 |
page_dropdown_value:str,
|
89 |
-
review_df:pd.DataFrame=
|
90 |
-
page_sizes:List[str]=
|
91 |
'''
|
92 |
Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
|
93 |
'''
|
@@ -134,7 +147,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
134 |
|
135 |
return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
|
136 |
|
137 |
-
def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=
|
138 |
'''
|
139 |
Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
|
140 |
'''
|
@@ -166,7 +179,11 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
|
|
166 |
return recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, text_entities_drop, page_entities_drop
|
167 |
|
168 |
def undo_last_removal(backup_review_state:pd.DataFrame, backup_image_annotations_state:list[dict], backup_recogniser_entity_dataframe_base:pd.DataFrame):
|
169 |
-
|
|
|
|
|
|
|
|
|
170 |
|
171 |
def update_annotator_page_from_review_df(
|
172 |
review_df: pd.DataFrame,
|
@@ -188,23 +205,24 @@ def update_annotator_page_from_review_df(
|
|
188 |
|
189 |
# Get the target page number from the selected row
|
190 |
# Safely access the page number, handling potential errors or empty DataFrame
|
191 |
-
gradio_annotator_current_page_number: int =
|
192 |
annotate_previous_page: int = 0 # Renaming for clarity if needed, matches original output
|
|
|
193 |
if not selected_recogniser_entity_df_row.empty and 'page' in selected_recogniser_entity_df_row.columns:
|
194 |
try:
|
195 |
-
|
196 |
-
gradio_annotator_current_page_number = int(
|
197 |
annotate_previous_page = gradio_annotator_current_page_number # Store original page number
|
198 |
except (IndexError, ValueError, TypeError):
|
199 |
-
print("Warning: Could not extract valid page number from selected_recogniser_entity_df_row. Defaulting to page
|
200 |
gradio_annotator_current_page_number = 1 # Or 0 depending on 1-based vs 0-based indexing elsewhere
|
201 |
|
202 |
# Ensure page number is valid and 1-based for external display/logic
|
203 |
-
if gradio_annotator_current_page_number <= 0:
|
204 |
-
gradio_annotator_current_page_number = 1
|
205 |
|
206 |
-
page_max_reported = len(out_image_annotations_state)
|
207 |
if gradio_annotator_current_page_number > page_max_reported:
|
|
|
208 |
gradio_annotator_current_page_number = page_max_reported # Cap at max pages
|
209 |
|
210 |
page_num_reported_zero_indexed = gradio_annotator_current_page_number - 1
|
@@ -247,7 +265,7 @@ def update_annotator_page_from_review_df(
|
|
247 |
if not current_page_review_df.empty:
|
248 |
# Convert the current page's review data to annotation list format for *this page*
|
249 |
|
250 |
-
current_page_annotations_list =
|
251 |
# Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
|
252 |
# Assuming review_df has compatible columns
|
253 |
expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed
|
@@ -267,9 +285,6 @@ def update_annotator_page_from_review_df(
|
|
267 |
current_page_annotations_list = current_page_annotations_list_raw
|
268 |
|
269 |
# Update the annotations state for the current page
|
270 |
-
# Each entry in out_image_annotations_state seems to be a dict containing keys like 'image', 'page', 'annotations' (List[dict])
|
271 |
-
# Need to update the 'annotations' list for the specific page.
|
272 |
-
# Find the entry for the current page in the state
|
273 |
page_state_entry_found = False
|
274 |
for i, page_state_entry in enumerate(out_image_annotations_state):
|
275 |
# Assuming page_state_entry has a 'page' key (1-based)
|
@@ -291,16 +306,10 @@ def update_annotator_page_from_review_df(
|
|
291 |
break
|
292 |
|
293 |
if not page_state_entry_found:
|
294 |
-
# This scenario might happen if the current_image_annotations_state didn't initially contain
|
295 |
-
# an entry for this page number. Depending on the application logic, you might need to
|
296 |
-
# add a new entry here, but based on the original code's structure, it seems
|
297 |
-
# out_image_annotations_state is pre-populated for all pages.
|
298 |
print(f"Warning: Entry for page {gradio_annotator_current_page_number} not found in current_image_annotations_state. Cannot update page annotations.")
|
299 |
|
300 |
-
|
301 |
-
# --- Image Path and Page Size Handling (already seems focused on current page, keep similar logic) ---
|
302 |
# Get the image path for the current page from the updated state
|
303 |
-
# Ensure the entry exists before accessing
|
304 |
current_image_path = None
|
305 |
if len(out_image_annotations_state) > page_num_reported_zero_indexed and 'image' in out_image_annotations_state[page_num_reported_zero_indexed]:
|
306 |
current_image_path = out_image_annotations_state[page_num_reported_zero_indexed]['image']
|
@@ -331,13 +340,9 @@ def update_annotator_page_from_review_df(
|
|
331 |
if not page_sizes_df.empty:
|
332 |
page_sizes = page_sizes_df.to_dict(orient='records')
|
333 |
else:
|
334 |
-
page_sizes =
|
335 |
|
336 |
# --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
|
337 |
-
# The original code multiplied coordinates for the *entire* document and removed duplicates
|
338 |
-
# across the *entire* document *after* converting the full review_df to state.
|
339 |
-
# With the optimized approach, we updated only one page's annotations in the state.
|
340 |
-
|
341 |
# Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:
|
342 |
try:
|
343 |
out_image_annotations_state = remove_duplicate_images_with_blank_boxes(out_image_annotations_state)
|
@@ -352,9 +357,7 @@ def update_annotator_page_from_review_df(
|
|
352 |
print(f"Warning: Cannot select current page annotator object for index {page_num_reported_zero_indexed}.")
|
353 |
out_current_page_annotator = {} # Or None, depending on expected output type
|
354 |
|
355 |
-
|
356 |
-
# The original code returns gradio_annotator_current_page_number as the 3rd value,
|
357 |
-
# which was potentially updated by bounding checks. Keep this.
|
358 |
final_page_number_returned = gradio_annotator_current_page_number
|
359 |
|
360 |
return (out_current_page_annotator,
|
@@ -364,6 +367,277 @@ def update_annotator_page_from_review_df(
|
|
364 |
review_df, # review_df might have its 'page' column type changed, keep it as is or revert if necessary
|
365 |
annotate_previous_page) # The original page number from selected_recogniser_entity_df_row
|
366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
368 |
selected_rows_df: pd.DataFrame,
|
369 |
image_file_paths:List[str],
|
@@ -437,6 +711,7 @@ def replace_annotator_object_img_np_array_with_page_sizes_image_path(
|
|
437 |
|
438 |
def replace_placeholder_image_with_real_image(doc_full_file_name_textbox:str, current_image_path:str, page_sizes_df:pd.DataFrame, page_num_reported:int, input_folder:str):
|
439 |
''' If image path is still not valid, load in a new image an overwrite it. Then replace all items in the image annotation object for all pages based on the updated information.'''
|
|
|
440 |
page_num_reported_zero_indexed = page_num_reported - 1
|
441 |
|
442 |
if not os.path.exists(current_image_path):
|
@@ -471,11 +746,12 @@ def update_annotator_object_and_filter_df(
|
|
471 |
gradio_annotator_current_page_number:int,
|
472 |
recogniser_entities_dropdown_value:str="ALL",
|
473 |
page_dropdown_value:str="ALL",
|
|
|
474 |
text_dropdown_value:str="ALL",
|
475 |
-
recogniser_dataframe_base:
|
476 |
zoom:int=100,
|
477 |
review_df:pd.DataFrame=None, # Use None for default empty DataFrame
|
478 |
-
page_sizes:List[dict]=
|
479 |
doc_full_file_name_textbox:str='',
|
480 |
input_folder:str=INPUT_FOLDER
|
481 |
) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:
|
@@ -483,6 +759,7 @@ def update_annotator_object_and_filter_df(
|
|
483 |
Update a gradio_image_annotation object with new annotation data for the current page
|
484 |
and update filter dataframes, optimizing by processing only the current page's data for display.
|
485 |
'''
|
|
|
486 |
zoom_str = str(zoom) + '%'
|
487 |
|
488 |
# Handle default empty review_df and recogniser_dataframe_base
|
@@ -496,8 +773,9 @@ def update_annotator_object_and_filter_df(
|
|
496 |
if not all_image_annotations:
|
497 |
print("No all_image_annotation object found")
|
498 |
# Return blank/default outputs
|
499 |
-
|
500 |
-
|
|
|
501 |
show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
|
502 |
box_selected_thickness=2, handle_size=4, sources=None,
|
503 |
show_clear_button=False, show_share_button=False, show_remove_button=False,
|
@@ -508,7 +786,7 @@ def update_annotator_object_and_filter_df(
|
|
508 |
|
509 |
return (blank_annotator, gr.Number(value=1), gr.Number(value=1), 1,
|
510 |
recogniser_entities_dropdown_value, blank_df_out_gr, blank_df_modified,
|
511 |
-
[], [], [], []) # Return empty lists/defaults for other outputs
|
512 |
|
513 |
# Validate and bound the current page number (1-based logic)
|
514 |
page_num_reported = max(1, gradio_annotator_current_page_number) # Minimum page is 1
|
@@ -519,6 +797,10 @@ def update_annotator_object_and_filter_df(
|
|
519 |
page_num_reported_zero_indexed = page_num_reported - 1
|
520 |
annotate_previous_page = page_num_reported # Store the determined page number
|
521 |
|
|
|
|
|
|
|
|
|
522 |
# --- Process page sizes DataFrame ---
|
523 |
page_sizes_df = pd.DataFrame(page_sizes)
|
524 |
if not page_sizes_df.empty:
|
@@ -530,29 +812,17 @@ def update_annotator_object_and_filter_df(
|
|
530 |
print("Warning: Page sizes DataFrame became empty after processing.")
|
531 |
|
532 |
# --- Handle Image Path Replacement for the Current Page ---
|
533 |
-
|
534 |
-
# Assuming replace_annotator_object_img_np_array_with_page_sizes_image_path
|
535 |
-
# correctly updates the image path within the list element.
|
536 |
if len(all_image_annotations) > page_num_reported_zero_indexed:
|
537 |
-
|
538 |
-
# to avoid modifying the input list unexpectedly if it's used elsewhere.
|
539 |
-
# However, the original code modified the list in place, so we'll stick to that
|
540 |
-
# pattern but acknowledge it.
|
541 |
page_object_to_update = all_image_annotations[page_num_reported_zero_indexed]
|
542 |
|
543 |
# Use the helper function to replace the image path within the page object
|
544 |
-
# Note: This helper returns the potentially modified page_object and the full state.
|
545 |
-
# The full state return seems redundant if only page_object_to_update is modified.
|
546 |
-
# Let's call it and assume it correctly updates the item in the list.
|
547 |
updated_page_object, all_image_annotations_after_img_replace = replace_annotator_object_img_np_array_with_page_sizes_image_path(
|
548 |
all_image_annotations, page_object_to_update, page_sizes, page_num_reported)
|
549 |
|
550 |
-
# The original code immediately re-assigns all_image_annotations.
|
551 |
-
# We'll rely on the function modifying the list element in place or returning the updated list.
|
552 |
-
# Assuming it returns the updated list for robustness:
|
553 |
all_image_annotations = all_image_annotations_after_img_replace
|
554 |
|
555 |
-
|
556 |
# Now handle the actual image file path replacement using replace_placeholder_image_with_real_image
|
557 |
current_image_path = updated_page_object.get('image') # Get potentially updated image path
|
558 |
|
@@ -585,7 +855,7 @@ def update_annotator_object_and_filter_df(
|
|
585 |
if not page_sizes_df.empty:
|
586 |
page_sizes = page_sizes_df.to_dict(orient='records')
|
587 |
else:
|
588 |
-
page_sizes =
|
589 |
|
590 |
# --- OPTIMIZATION: Prepare data *only* for the current page for display ---
|
591 |
current_page_image_annotator_object = None
|
@@ -596,7 +866,6 @@ def update_annotator_object_and_filter_df(
|
|
596 |
# Assuming coordinate multiplication IS needed for display if state stores relative coords
|
597 |
current_page_annotations_df = convert_annotation_data_to_dataframe([page_data_for_display])
|
598 |
|
599 |
-
|
600 |
if not current_page_annotations_df.empty and not page_sizes_df.empty:
|
601 |
# Multiply coordinates *only* for this page's DataFrame
|
602 |
try:
|
@@ -642,18 +911,19 @@ def update_annotator_object_and_filter_df(
|
|
642 |
|
643 |
except Exception as e:
|
644 |
print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
|
645 |
-
recogniser_entities_list =
|
646 |
-
recogniser_colour_list =
|
647 |
recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
|
648 |
recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
|
649 |
-
text_entities_drop =
|
650 |
-
page_entities_drop =
|
651 |
|
652 |
|
653 |
# --- Final Output Components ---
|
654 |
-
|
655 |
-
|
656 |
-
|
|
|
657 |
|
658 |
### Present image_annotator outputs
|
659 |
# Handle the case where current_page_image_annotator_object couldn't be prepared
|
@@ -683,9 +953,12 @@ def update_annotator_object_and_filter_df(
|
|
683 |
interactive=True # Keep interactive if data is present
|
684 |
)
|
685 |
|
686 |
-
|
687 |
-
|
688 |
-
|
|
|
|
|
|
|
689 |
return (out_image_annotator,
|
690 |
page_number_reported_gradio_comp,
|
691 |
page_number_reported_gradio_comp, # Redundant, but matches original return signature
|
@@ -695,6 +968,7 @@ def update_annotator_object_and_filter_df(
|
|
695 |
recogniser_dataframe_modified,
|
696 |
text_entities_drop, # List of text entities for dropdown
|
697 |
page_entities_drop, # List of page numbers for dropdown
|
|
|
698 |
page_sizes, # Updated page_sizes list
|
699 |
all_image_annotations) # Return the updated full state
|
700 |
|
@@ -703,13 +977,19 @@ def update_all_page_annotation_object_based_on_previous_page(
|
|
703 |
current_page:int,
|
704 |
previous_page:int,
|
705 |
all_image_annotations:List[AnnotatedImageData],
|
706 |
-
page_sizes:List[dict]=
|
707 |
clear_all:bool=False
|
708 |
):
|
709 |
'''
|
710 |
Overwrite image annotations on the page we are moving from with modifications.
|
711 |
'''
|
712 |
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
previous_page_zero_index = previous_page -1
|
714 |
|
715 |
if not current_page: current_page = 1
|
@@ -718,7 +998,7 @@ def update_all_page_annotation_object_based_on_previous_page(
|
|
718 |
page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
|
719 |
|
720 |
if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
|
721 |
-
else: all_image_annotations[previous_page_zero_index]["boxes"] =
|
722 |
|
723 |
return all_image_annotations, current_page, current_page
|
724 |
|
@@ -730,16 +1010,16 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
730 |
review_file_state:pd.DataFrame,
|
731 |
output_folder:str = OUTPUT_FOLDER,
|
732 |
save_pdf:bool=True,
|
733 |
-
page_sizes:List[dict]=
|
734 |
COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
|
735 |
progress=gr.Progress(track_tqdm=True)):
|
736 |
'''
|
737 |
Apply modified redactions to a pymupdf and export review files.
|
738 |
'''
|
739 |
|
740 |
-
output_files =
|
741 |
-
output_log_files =
|
742 |
-
pdf_doc =
|
743 |
review_df = review_file_state
|
744 |
|
745 |
page_image_annotator_object = all_image_annotations[current_page - 1]
|
@@ -805,7 +1085,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
805 |
doc = [image]
|
806 |
|
807 |
elif file_extension in '.csv':
|
808 |
-
pdf_doc =
|
809 |
|
810 |
# If working with pdfs
|
811 |
elif is_pdf(file_path) == True:
|
@@ -815,7 +1095,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
815 |
output_files.append(orig_pdf_file_path)
|
816 |
|
817 |
number_of_pages = pdf_doc.page_count
|
818 |
-
original_cropboxes =
|
819 |
|
820 |
page_sizes_df = pd.DataFrame(page_sizes)
|
821 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
@@ -961,10 +1241,16 @@ def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:st
|
|
961 |
'''
|
962 |
if isinstance(choice, str):
|
963 |
choice = [choice]
|
|
|
|
|
964 |
if isinstance(label_dropdown_value, str):
|
965 |
label_dropdown_value = [label_dropdown_value]
|
|
|
|
|
966 |
if isinstance(text_dropdown_value, str):
|
967 |
text_dropdown_value = [text_dropdown_value]
|
|
|
|
|
968 |
|
969 |
filtered_df = df.copy()
|
970 |
|
@@ -989,6 +1275,29 @@ def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:st
|
|
989 |
|
990 |
return filtered_df, recogniser_entities_drop, text_entities_drop
|
991 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
992 |
def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
|
993 |
'''
|
994 |
Update the rows in a dataframe depending on the user choice from a dropdown
|
@@ -1042,6 +1351,24 @@ def reset_dropdowns(df:pd.DataFrame):
|
|
1042 |
def increase_bottom_page_count_based_on_top(page_number:int):
|
1043 |
return int(page_number)
|
1044 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1045 |
def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
|
1046 |
|
1047 |
row_value_page = int(evt.row_value[0]) # This is the page number value
|
@@ -1096,9 +1423,22 @@ def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
|
|
1096 |
'''
|
1097 |
if text:
|
1098 |
# Get all rows with the same text as the selected row
|
1099 |
-
return df[df["text"] == text]
|
1100 |
else:
|
1101 |
return pd.DataFrame(columns=["page", "label", "text", "id"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1102 |
|
1103 |
def update_selected_review_df_row_colour(
|
1104 |
redaction_row_selection: pd.DataFrame,
|
@@ -1286,7 +1626,7 @@ def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
|
|
1286 |
|
1287 |
return x1, adobe_y1, x2, adobe_y2
|
1288 |
|
1289 |
-
def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=
|
1290 |
'''
|
1291 |
Create an xfdf file from a review csv file and a pdf
|
1292 |
'''
|
@@ -1378,11 +1718,11 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
|
|
1378 |
reparsed = minidom.parseString(rough_string)
|
1379 |
return reparsed.toxml() #.toprettyxml(indent=" ")
|
1380 |
|
1381 |
-
def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=
|
1382 |
'''
|
1383 |
Load in files to convert a review file into an Adobe comment file format
|
1384 |
'''
|
1385 |
-
output_paths =
|
1386 |
pdf_name = ""
|
1387 |
file_path_name = ""
|
1388 |
|
@@ -1481,7 +1821,7 @@ def parse_xfdf(xfdf_path:str):
|
|
1481 |
# Define the namespace
|
1482 |
namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
|
1483 |
|
1484 |
-
redactions =
|
1485 |
|
1486 |
# Find all redact elements using the namespace
|
1487 |
for redact in root.findall('.//xfdf:redact', namespaces=namespace):
|
@@ -1513,8 +1853,8 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
|
|
1513 |
Returns:
|
1514 |
- DataFrame containing redaction information
|
1515 |
'''
|
1516 |
-
output_paths =
|
1517 |
-
xfdf_paths =
|
1518 |
df = pd.DataFrame()
|
1519 |
|
1520 |
# Sort the file paths so that the pdfs come first
|
|
|
1 |
import os
|
2 |
import re
|
|
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import string
|
7 |
+
import random
|
8 |
from xml.etree.ElementTree import Element, SubElement, tostring, parse
|
9 |
from xml.dom import minidom
|
10 |
import uuid
|
11 |
+
from typing import List, Tuple, Dict, Set
|
12 |
from gradio_image_annotation import image_annotator
|
13 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
14 |
from pymupdf import Document, Rect
|
15 |
import pymupdf
|
16 |
from PIL import ImageDraw, Image
|
17 |
from datetime import datetime, timezone, timedelta
|
18 |
+
from collections import defaultdict
|
19 |
+
import gradio as gr
|
20 |
|
21 |
from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
|
22 |
+
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
|
23 |
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
24 |
from tools.file_redaction import redact_page_with_pymupdf
|
25 |
|
26 |
if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
|
27 |
|
28 |
+
def decrease_page(number:int, all_annotations:dict):
|
29 |
'''
|
30 |
Decrease page number for review redactions page.
|
31 |
'''
|
32 |
+
if not all_annotations:
|
33 |
+
raise Warning("No annotator object loaded")
|
34 |
+
|
35 |
if number > 1:
|
36 |
return number - 1, number - 1
|
37 |
+
elif number <= 1:
|
38 |
+
#return 1, 1
|
39 |
+
raise Warning("At first page")
|
40 |
else:
|
41 |
+
raise Warning("At first page")
|
42 |
|
43 |
+
def increase_page(number:int, all_annotations:dict):
|
44 |
'''
|
45 |
Increase page number for review redactions page.
|
46 |
'''
|
47 |
|
48 |
+
if not all_annotations:
|
49 |
+
raise Warning("No annotator object loaded")
|
50 |
+
#return 1, 1
|
51 |
|
52 |
+
max_pages = len(all_annotations)
|
53 |
|
54 |
if number < max_pages:
|
55 |
return number + 1, number + 1
|
56 |
+
#elif number == max_pages:
|
57 |
+
# return max_pages, max_pages
|
58 |
else:
|
59 |
+
raise Warning("At last page")
|
60 |
|
61 |
def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True):
|
62 |
if decrease == False:
|
|
|
99 |
recogniser_dropdown_value:str,
|
100 |
text_dropdown_value:str,
|
101 |
page_dropdown_value:str,
|
102 |
+
review_df:pd.DataFrame=list(),
|
103 |
+
page_sizes:List[str]=list()):
|
104 |
'''
|
105 |
Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
|
106 |
'''
|
|
|
147 |
|
148 |
return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
|
149 |
|
150 |
+
def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=list(), page_sizes:list[str]=list()):
|
151 |
'''
|
152 |
Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
|
153 |
'''
|
|
|
179 |
return recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, text_entities_drop, page_entities_drop
|
180 |
|
181 |
def undo_last_removal(backup_review_state:pd.DataFrame, backup_image_annotations_state:list[dict], backup_recogniser_entity_dataframe_base:pd.DataFrame):
|
182 |
+
|
183 |
+
if backup_image_annotations_state:
|
184 |
+
return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
185 |
+
else:
|
186 |
+
raise Warning("No actions have been taken to undo")
|
187 |
|
188 |
def update_annotator_page_from_review_df(
|
189 |
review_df: pd.DataFrame,
|
|
|
205 |
|
206 |
# Get the target page number from the selected row
|
207 |
# Safely access the page number, handling potential errors or empty DataFrame
|
208 |
+
gradio_annotator_current_page_number: int = 1
|
209 |
annotate_previous_page: int = 0 # Renaming for clarity if needed, matches original output
|
210 |
+
|
211 |
if not selected_recogniser_entity_df_row.empty and 'page' in selected_recogniser_entity_df_row.columns:
|
212 |
try:
|
213 |
+
selected_page= selected_recogniser_entity_df_row['page'].iloc[0]
|
214 |
+
gradio_annotator_current_page_number = int(selected_page)
|
215 |
annotate_previous_page = gradio_annotator_current_page_number # Store original page number
|
216 |
except (IndexError, ValueError, TypeError):
|
217 |
+
print("Warning: Could not extract valid page number from selected_recogniser_entity_df_row. Defaulting to page 1.")
|
218 |
gradio_annotator_current_page_number = 1 # Or 0 depending on 1-based vs 0-based indexing elsewhere
|
219 |
|
220 |
# Ensure page number is valid and 1-based for external display/logic
|
221 |
+
if gradio_annotator_current_page_number <= 0: gradio_annotator_current_page_number = 1
|
|
|
222 |
|
223 |
+
page_max_reported = len(page_sizes) #len(out_image_annotations_state)
|
224 |
if gradio_annotator_current_page_number > page_max_reported:
|
225 |
+
print("current page is greater than highest page:", page_max_reported)
|
226 |
gradio_annotator_current_page_number = page_max_reported # Cap at max pages
|
227 |
|
228 |
page_num_reported_zero_indexed = gradio_annotator_current_page_number - 1
|
|
|
265 |
if not current_page_review_df.empty:
|
266 |
# Convert the current page's review data to annotation list format for *this page*
|
267 |
|
268 |
+
current_page_annotations_list = list()
|
269 |
# Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
|
270 |
# Assuming review_df has compatible columns
|
271 |
expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed
|
|
|
285 |
current_page_annotations_list = current_page_annotations_list_raw
|
286 |
|
287 |
# Update the annotations state for the current page
|
|
|
|
|
|
|
288 |
page_state_entry_found = False
|
289 |
for i, page_state_entry in enumerate(out_image_annotations_state):
|
290 |
# Assuming page_state_entry has a 'page' key (1-based)
|
|
|
306 |
break
|
307 |
|
308 |
if not page_state_entry_found:
|
|
|
|
|
|
|
|
|
309 |
print(f"Warning: Entry for page {gradio_annotator_current_page_number} not found in current_image_annotations_state. Cannot update page annotations.")
|
310 |
|
311 |
+
# --- Image Path and Page Size Handling ---
|
|
|
312 |
# Get the image path for the current page from the updated state
|
|
|
313 |
current_image_path = None
|
314 |
if len(out_image_annotations_state) > page_num_reported_zero_indexed and 'image' in out_image_annotations_state[page_num_reported_zero_indexed]:
|
315 |
current_image_path = out_image_annotations_state[page_num_reported_zero_indexed]['image']
|
|
|
340 |
if not page_sizes_df.empty:
|
341 |
page_sizes = page_sizes_df.to_dict(orient='records')
|
342 |
else:
|
343 |
+
page_sizes = list() # Ensure page_sizes is a list if df is empty
|
344 |
|
345 |
# --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
|
|
|
|
|
|
|
|
|
346 |
# Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:
|
347 |
try:
|
348 |
out_image_annotations_state = remove_duplicate_images_with_blank_boxes(out_image_annotations_state)
|
|
|
357 |
print(f"Warning: Cannot select current page annotator object for index {page_num_reported_zero_indexed}.")
|
358 |
out_current_page_annotator = {} # Or None, depending on expected output type
|
359 |
|
360 |
+
# Return final page number
|
|
|
|
|
361 |
final_page_number_returned = gradio_annotator_current_page_number
|
362 |
|
363 |
return (out_current_page_annotator,
|
|
|
367 |
review_df, # review_df might have its 'page' column type changed, keep it as is or revert if necessary
|
368 |
annotate_previous_page) # The original page number from selected_recogniser_entity_df_row
|
369 |
|
370 |
+
# --- Helper Function for ID Generation ---
|
371 |
+
# This function encapsulates your ID logic in a performant, batch-oriented way.
|
372 |
+
def _generate_unique_ids(
|
373 |
+
num_ids_to_generate: int,
|
374 |
+
existing_ids_set: Set[str]
|
375 |
+
) -> List[str]:
|
376 |
+
"""
|
377 |
+
Generates a specified number of unique, 12-character alphanumeric IDs.
|
378 |
+
|
379 |
+
This is a batch-oriented, performant version of the original
|
380 |
+
`fill_missing_ids_in_list` logic, designed to work efficiently
|
381 |
+
with DataFrames.
|
382 |
+
|
383 |
+
Args:
|
384 |
+
num_ids_to_generate (int): The number of unique IDs to create.
|
385 |
+
existing_ids_set (Set[str]): A set of IDs that are already in use and
|
386 |
+
should be avoided.
|
387 |
+
|
388 |
+
Returns:
|
389 |
+
List[str]: A list of newly generated unique IDs.
|
390 |
+
"""
|
391 |
+
id_length = 12
|
392 |
+
character_set = string.ascii_letters + string.digits
|
393 |
+
|
394 |
+
newly_generated_ids = set()
|
395 |
+
|
396 |
+
# The while loop ensures we generate exactly the number of IDs required,
|
397 |
+
# automatically handling the astronomically rare case of a collision.
|
398 |
+
while len(newly_generated_ids) < num_ids_to_generate:
|
399 |
+
candidate_id = ''.join(random.choices(character_set, k=id_length))
|
400 |
+
|
401 |
+
# Check against both pre-existing IDs and IDs generated in this batch
|
402 |
+
if candidate_id not in existing_ids_set and candidate_id not in newly_generated_ids:
|
403 |
+
newly_generated_ids.add(candidate_id)
|
404 |
+
|
405 |
+
return list(newly_generated_ids)
|
406 |
+
|
407 |
+
def _merge_horizontally_adjacent_boxes(
|
408 |
+
df: pd.DataFrame,
|
409 |
+
x_merge_threshold: int = 0.02
|
410 |
+
) -> pd.DataFrame:
|
411 |
+
"""
|
412 |
+
Merges horizontally adjacent bounding boxes within the same line.
|
413 |
+
|
414 |
+
Args:
|
415 |
+
df (pd.DataFrame): DataFrame containing annotation boxes with columns
|
416 |
+
like 'page', 'line', 'xmin', 'xmax', etc.
|
417 |
+
x_merge_threshold (int): The maximum pixel gap on the x-axis to
|
418 |
+
consider two boxes as adjacent.
|
419 |
+
|
420 |
+
Returns:
|
421 |
+
pd.DataFrame: A new DataFrame with adjacent boxes merged.
|
422 |
+
"""
|
423 |
+
if df.empty:
|
424 |
+
return df
|
425 |
+
|
426 |
+
# 1. Sort values to ensure we are comparing adjacent boxes
|
427 |
+
df_sorted = df.sort_values(by=['page', 'line', 'xmin']).copy()
|
428 |
+
|
429 |
+
# 2. Identify groups of boxes to merge using shift() and cumsum()
|
430 |
+
# Get properties of the 'previous' box in the sorted list
|
431 |
+
prev_xmax = df_sorted['xmax'].shift(1)
|
432 |
+
prev_page = df_sorted['page'].shift(1)
|
433 |
+
prev_line = df_sorted['line'].shift(1)
|
434 |
+
|
435 |
+
# A box should be merged with the previous one if it's on the same page/line
|
436 |
+
# and the horizontal gap is within the threshold.
|
437 |
+
is_adjacent = (
|
438 |
+
(df_sorted['page'] == prev_page) &
|
439 |
+
(df_sorted['line'] == prev_line) &
|
440 |
+
(df_sorted['xmin'] - prev_xmax <= x_merge_threshold)
|
441 |
+
)
|
442 |
+
|
443 |
+
# A new group starts wherever a box is NOT adjacent to the previous one.
|
444 |
+
# cumsum() on this boolean series creates a unique ID for each group.
|
445 |
+
df_sorted['merge_group'] = (~is_adjacent).cumsum()
|
446 |
+
|
447 |
+
# 3. Aggregate each group into a single bounding box
|
448 |
+
# Define how to aggregate each column
|
449 |
+
agg_funcs = {
|
450 |
+
'xmin': 'min',
|
451 |
+
'ymin': 'min', # To get the highest point of the combined box
|
452 |
+
'xmax': 'max',
|
453 |
+
'ymax': 'max', # To get the lowest point of the combined box
|
454 |
+
'text': lambda s: ' '.join(s.astype(str)), # Join the text
|
455 |
+
# Carry over the first value for columns that are constant within a group
|
456 |
+
'page': 'first',
|
457 |
+
'line': 'first',
|
458 |
+
'image': 'first',
|
459 |
+
'label': 'first',
|
460 |
+
'color': 'first',
|
461 |
+
}
|
462 |
+
|
463 |
+
merged_df = df_sorted.groupby('merge_group').agg(agg_funcs).reset_index(drop=True)
|
464 |
+
|
465 |
+
print(f"Merged {len(df)} annotations into {len(merged_df)}.")
|
466 |
+
|
467 |
+
return merged_df
|
468 |
+
|
469 |
+
def create_annotation_objects_from_filtered_ocr_results_with_words(
    filtered_ocr_results_with_words_df: pd.DataFrame,
    ocr_results_with_words_df_base: pd.DataFrame,
    page_sizes: List[Dict],
    existing_annotations_df: pd.DataFrame,
    existing_annotations_list: List[Dict],
    existing_recogniser_entity_df: pd.DataFrame,
    redaction_label:str = "Redaction",
    colour_label:str = '(0, 0, 0)',
    progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.

    Args:
        filtered_ocr_results_with_words_df (pd.DataFrame): A DataFrame containing filtered OCR results with words. Must carry an "index" column mapping rows back to ocr_results_with_words_df_base.
        ocr_results_with_words_df_base (pd.DataFrame): The base DataFrame of OCR results with words.
        page_sizes (List[Dict]): A list of dictionaries containing page sizes. Each dict is expected to have at least 'page' and 'image_path' keys.
        existing_annotations_df (pd.DataFrame): A DataFrame of existing annotations.
        existing_annotations_list (List[Dict]): A list of dictionaries representing existing annotations.
        existing_recogniser_entity_df (pd.DataFrame): A DataFrame of existing recogniser entities.
        redaction_label (str, optional): Label applied to every newly created annotation box. Defaults to "Redaction".
        colour_label (str, optional): Box colour as an '(r, g, b)' string or a 3-tuple/list of ints in [0, 255]; invalid values fall back to black. Defaults to '(0, 0, 0)'.
        progress (gr.Progress, optional): A progress tracker. Defaults to gr.Progress(track_tqdm=True).

    Returns:
        Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
    """

    # Validate colour_label: must be a 3-number tuple with each value in [0, 255]
    # If invalid, fallback to '(0, 0, 0,)' as requested
    # NOTE: the fallback string deliberately carries a trailing comma (matches
    # the tuple-repr format produced for tuple/list inputs below).
    fallback_colour = '(0, 0, 0,)'
    try:
        valid = False
        if isinstance(colour_label, str):
            label_str = colour_label.strip()
            # Accept '(r, g, b)' with optional trailing comma and flexible spacing
            match = re.match(r"^\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,?\s*\)$", label_str)
            if match:
                r_val, g_val, b_val = (int(match.group(1)), int(match.group(2)), int(match.group(3)))
                if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
                    valid = True
        elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
            r_val, g_val, b_val = colour_label
            if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
                # Normalise tuple/list input into the string form used elsewhere
                colour_label = f'({r_val}, {g_val}, {b_val},)'
                valid = True
        if not valid:
            colour_label = fallback_colour
    except Exception:
        # Any unexpected shape/type problem: fall back to black rather than fail
        colour_label = fallback_colour

    progress(0.2, desc="Identifying new redactions to add")
    print("Identifying new redactions to add")
    if filtered_ocr_results_with_words_df.empty:
        print("No new annotations to add.")
        updated_annotations_df = existing_annotations_df.copy()
    else:
        # Assuming index relationship holds for fast lookup
        # NOTE(review): this re-indexes the *caller's* filtered dataframe in
        # place using its "index" column — relies on that column referencing
        # rows of ocr_results_with_words_df_base.
        filtered_ocr_results_with_words_df.index = filtered_ocr_results_with_words_df["index"]
        new_annotations_df = ocr_results_with_words_df_base.loc[filtered_ocr_results_with_words_df.index].copy()

        if new_annotations_df.empty:
            print("No new annotations to add.")
            updated_annotations_df = existing_annotations_df.copy()
        else:
            page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

            # Prepare the initial new annotations DataFrame
            # (map page -> image path, stamp label/colour, and rename word
            # coordinate columns to the annotation schema)
            new_annotations_df = new_annotations_df.assign(
                image=lambda df: df['page'].map(page_to_image_map),
                label= redaction_label,
                color= colour_label
            ).rename(columns={
                'word_x0': 'xmin',
                'word_y0': 'ymin',
                'word_x1': 'xmax',
                'word_y1': 'ymax',
                'word_text': 'text'
            })

            progress(0.3, desc="Checking for adjacent annotations to merge...")
            print("Checking for adjacent annotations to merge...")
            # Combine horizontally adjacent word boxes into single redactions
            new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)

            progress(0.4, desc="Creating new redaction IDs...")
            print("Creating new redaction IDs...")
            # Generate IDs that do not collide with any existing annotation ID
            existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
            num_new_ids = len(new_annotations_df)
            new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
            new_annotations_df['id'] = new_id_list

            annotation_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
            new_annotations_df = new_annotations_df[annotation_cols]

            # Columns that define a "duplicate" redaction (colour/image excluded)
            key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']

            progress(0.5, desc="Checking for duplicate redactions")

            if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
                unique_new_df = new_annotations_df
            else:
                # Do not add duplicate redactions: anti-join the new boxes
                # against existing ones on the key columns
                merged = pd.merge(
                    new_annotations_df,
                    existing_annotations_df[key_cols].drop_duplicates(),
                    on=key_cols,
                    how='left',
                    indicator=True
                )
                unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
                #unique_new_df = new_annotations_df

            print(f"Found {len(unique_new_df)} new unique annotations to add.")
            gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
            updated_annotations_df = pd.concat([existing_annotations_df, unique_new_df], ignore_index=True)

    # --- Part 4: Convert final DataFrame to list-of-dicts ---
    updated_recogniser_entity_df = pd.DataFrame()
    if not updated_annotations_df.empty:
        updated_recogniser_entity_df = updated_annotations_df[["page", "label", "text", "id"]]

    if not page_sizes:
        print("Warning: page_sizes is empty. No pages to process.")
        return [], existing_annotations_list, pd.DataFrame(), existing_annotations_df, pd.DataFrame(), existing_recogniser_entity_df

    all_pages_df = pd.DataFrame(page_sizes).rename(columns={'image_path': 'image'})

    if not updated_annotations_df.empty:
        # Re-map image paths (annotations carried over from existing_annotations_df
        # may reference stale paths), then left-join so every page appears at
        # least once even with no annotations
        page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
        updated_annotations_df['image'] = updated_annotations_df['page'].map(page_to_image_map)
        merged_df = pd.merge(all_pages_df[['image']], updated_annotations_df, on='image', how='left')
    else:
        merged_df = all_pages_df[['image']]

    # 1. Get the list of image paths in the exact order they appear in page_sizes.
    #    all_pages_df was created from page_sizes, so it preserves this order.
    image_order = all_pages_df['image'].tolist()

    # 2. Convert the 'image' column to a special 'Categorical' type.
    #    This tells pandas that this column has a custom, non-alphabetical order.
    merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)

    # 3. Sort the DataFrame based on this new custom order.
    merged_df = merged_df.sort_values('image')


    final_annotations_list = list()
    box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']

    # Now, when we group, we use `sort=False`. This tells groupby to respect the
    # DataFrame's current order, which we have just manually set. This is slightly
    # more efficient than letting it sort again.
    for image_path, group in merged_df.groupby('image', sort=False, observed=False):
        # The progress.tqdm wrapper can be added back around the groupby object as you had it.
        # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):

        # Check if the group has actual annotations. iloc[0] is safe because even pages
        # without annotations will have one row with NaN values from the merge.
        if pd.isna(group.iloc[0].get('id')):
            boxes = list()
        else:
            valid_box_cols = [col for col in box_cols if col in group.columns]
            # We should also sort the boxes within a page for consistency (e.g., left-to-right)
            sorted_group = group.sort_values(by=['ymin', 'xmin'])
            boxes = sorted_group[valid_box_cols].to_dict('records')

        final_annotations_list.append({
            "image": image_path,
            "boxes": boxes
        })

    progress(1.0, desc="Completed annotation processing")

    return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
|
640 |
+
|
641 |
def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
642 |
selected_rows_df: pd.DataFrame,
|
643 |
image_file_paths:List[str],
|
|
|
711 |
|
712 |
def replace_placeholder_image_with_real_image(doc_full_file_name_textbox:str, current_image_path:str, page_sizes_df:pd.DataFrame, page_num_reported:int, input_folder:str):
|
713 |
''' If image path is still not valid, load in a new image an overwrite it. Then replace all items in the image annotation object for all pages based on the updated information.'''
|
714 |
+
|
715 |
page_num_reported_zero_indexed = page_num_reported - 1
|
716 |
|
717 |
if not os.path.exists(current_image_path):
|
|
|
746 |
gradio_annotator_current_page_number:int,
|
747 |
recogniser_entities_dropdown_value:str="ALL",
|
748 |
page_dropdown_value:str="ALL",
|
749 |
+
page_dropdown_redaction_value:str="1",
|
750 |
text_dropdown_value:str="ALL",
|
751 |
+
recogniser_dataframe_base:pd.DataFrame=None, # Simplified default
|
752 |
zoom:int=100,
|
753 |
review_df:pd.DataFrame=None, # Use None for default empty DataFrame
|
754 |
+
page_sizes:List[dict]=list(),
|
755 |
doc_full_file_name_textbox:str='',
|
756 |
input_folder:str=INPUT_FOLDER
|
757 |
) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:
|
|
|
759 |
Update a gradio_image_annotation object with new annotation data for the current page
|
760 |
and update filter dataframes, optimizing by processing only the current page's data for display.
|
761 |
'''
|
762 |
+
|
763 |
zoom_str = str(zoom) + '%'
|
764 |
|
765 |
# Handle default empty review_df and recogniser_dataframe_base
|
|
|
773 |
if not all_image_annotations:
|
774 |
print("No all_image_annotation object found")
|
775 |
# Return blank/default outputs
|
776 |
+
|
777 |
+
blank_annotator = image_annotator(
|
778 |
+
value = None, boxes_alpha=0.1, box_thickness=1, label_list=list(), label_colors=list(),
|
779 |
show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
|
780 |
box_selected_thickness=2, handle_size=4, sources=None,
|
781 |
show_clear_button=False, show_share_button=False, show_remove_button=False,
|
|
|
786 |
|
787 |
return (blank_annotator, gr.Number(value=1), gr.Number(value=1), 1,
|
788 |
recogniser_entities_dropdown_value, blank_df_out_gr, blank_df_modified,
|
789 |
+
[], [], [], [], []) # Return empty lists/defaults for other outputs
|
790 |
|
791 |
# Validate and bound the current page number (1-based logic)
|
792 |
page_num_reported = max(1, gradio_annotator_current_page_number) # Minimum page is 1
|
|
|
797 |
page_num_reported_zero_indexed = page_num_reported - 1
|
798 |
annotate_previous_page = page_num_reported # Store the determined page number
|
799 |
|
800 |
+
if not page_sizes:
|
801 |
+
page_num_reported = 0
|
802 |
+
annotate_previous_page = 0
|
803 |
+
|
804 |
# --- Process page sizes DataFrame ---
|
805 |
page_sizes_df = pd.DataFrame(page_sizes)
|
806 |
if not page_sizes_df.empty:
|
|
|
812 |
print("Warning: Page sizes DataFrame became empty after processing.")
|
813 |
|
814 |
# --- Handle Image Path Replacement for the Current Page ---
|
815 |
+
|
|
|
|
|
816 |
if len(all_image_annotations) > page_num_reported_zero_indexed:
|
817 |
+
|
|
|
|
|
|
|
818 |
page_object_to_update = all_image_annotations[page_num_reported_zero_indexed]
|
819 |
|
820 |
# Use the helper function to replace the image path within the page object
|
|
|
|
|
|
|
821 |
updated_page_object, all_image_annotations_after_img_replace = replace_annotator_object_img_np_array_with_page_sizes_image_path(
|
822 |
all_image_annotations, page_object_to_update, page_sizes, page_num_reported)
|
823 |
|
|
|
|
|
|
|
824 |
all_image_annotations = all_image_annotations_after_img_replace
|
825 |
|
|
|
826 |
# Now handle the actual image file path replacement using replace_placeholder_image_with_real_image
|
827 |
current_image_path = updated_page_object.get('image') # Get potentially updated image path
|
828 |
|
|
|
855 |
if not page_sizes_df.empty:
|
856 |
page_sizes = page_sizes_df.to_dict(orient='records')
|
857 |
else:
|
858 |
+
page_sizes = list() # Ensure page_sizes is a list if df is empty
|
859 |
|
860 |
# --- OPTIMIZATION: Prepare data *only* for the current page for display ---
|
861 |
current_page_image_annotator_object = None
|
|
|
866 |
# Assuming coordinate multiplication IS needed for display if state stores relative coords
|
867 |
current_page_annotations_df = convert_annotation_data_to_dataframe([page_data_for_display])
|
868 |
|
|
|
869 |
if not current_page_annotations_df.empty and not page_sizes_df.empty:
|
870 |
# Multiply coordinates *only* for this page's DataFrame
|
871 |
try:
|
|
|
911 |
|
912 |
except Exception as e:
|
913 |
print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
|
914 |
+
recogniser_entities_list = list()
|
915 |
+
recogniser_colour_list = list()
|
916 |
recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
|
917 |
recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
|
918 |
+
text_entities_drop = list()
|
919 |
+
page_entities_drop = list()
|
920 |
|
921 |
|
922 |
# --- Final Output Components ---
|
923 |
+
if page_sizes:
|
924 |
+
page_number_reported_gradio_comp = gr.Number(label = "Current page", value=page_num_reported, precision=0, maximum=len(page_sizes), minimum=1)
|
925 |
+
else:
|
926 |
+
page_number_reported_gradio_comp = gr.Number(label = "Current page", value=0, precision=0, maximum=9999, minimum=0)
|
927 |
|
928 |
### Present image_annotator outputs
|
929 |
# Handle the case where current_page_image_annotator_object couldn't be prepared
|
|
|
953 |
interactive=True # Keep interactive if data is present
|
954 |
)
|
955 |
|
956 |
+
page_entities_drop_redaction_list = list()
|
957 |
+
all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
|
958 |
+
page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
|
959 |
+
|
960 |
+
page_entities_drop_redaction = gr.Dropdown(value = page_dropdown_redaction_value, choices=page_entities_drop_redaction_list, label="Page", allow_custom_value=True)
|
961 |
+
|
962 |
return (out_image_annotator,
|
963 |
page_number_reported_gradio_comp,
|
964 |
page_number_reported_gradio_comp, # Redundant, but matches original return signature
|
|
|
968 |
recogniser_dataframe_modified,
|
969 |
text_entities_drop, # List of text entities for dropdown
|
970 |
page_entities_drop, # List of page numbers for dropdown
|
971 |
+
page_entities_drop_redaction,
|
972 |
page_sizes, # Updated page_sizes list
|
973 |
all_image_annotations) # Return the updated full state
|
974 |
|
|
|
977 |
current_page:int,
|
978 |
previous_page:int,
|
979 |
all_image_annotations:List[AnnotatedImageData],
|
980 |
+
page_sizes:List[dict]=list(),
|
981 |
clear_all:bool=False
|
982 |
):
|
983 |
'''
|
984 |
Overwrite image annotations on the page we are moving from with modifications.
|
985 |
'''
|
986 |
|
987 |
+
if current_page > len(page_sizes):
|
988 |
+
raise Warning("Selected page is higher than last page number")
|
989 |
+
elif current_page <= 0:
|
990 |
+
raise Warning("Selected page is lower than first page")
|
991 |
+
|
992 |
+
|
993 |
previous_page_zero_index = previous_page -1
|
994 |
|
995 |
if not current_page: current_page = 1
|
|
|
998 |
page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
|
999 |
|
1000 |
if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
|
1001 |
+
else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
|
1002 |
|
1003 |
return all_image_annotations, current_page, current_page
|
1004 |
|
|
|
1010 |
review_file_state:pd.DataFrame,
|
1011 |
output_folder:str = OUTPUT_FOLDER,
|
1012 |
save_pdf:bool=True,
|
1013 |
+
page_sizes:List[dict]=list(),
|
1014 |
COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
|
1015 |
progress=gr.Progress(track_tqdm=True)):
|
1016 |
'''
|
1017 |
Apply modified redactions to a pymupdf and export review files.
|
1018 |
'''
|
1019 |
|
1020 |
+
output_files = list()
|
1021 |
+
output_log_files = list()
|
1022 |
+
pdf_doc = list()
|
1023 |
review_df = review_file_state
|
1024 |
|
1025 |
page_image_annotator_object = all_image_annotations[current_page - 1]
|
|
|
1085 |
doc = [image]
|
1086 |
|
1087 |
elif file_extension in '.csv':
|
1088 |
+
pdf_doc = list()
|
1089 |
|
1090 |
# If working with pdfs
|
1091 |
elif is_pdf(file_path) == True:
|
|
|
1095 |
output_files.append(orig_pdf_file_path)
|
1096 |
|
1097 |
number_of_pages = pdf_doc.page_count
|
1098 |
+
original_cropboxes = list()
|
1099 |
|
1100 |
page_sizes_df = pd.DataFrame(page_sizes)
|
1101 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
|
|
1241 |
'''
|
1242 |
if isinstance(choice, str):
|
1243 |
choice = [choice]
|
1244 |
+
elif not isinstance(choice, list):
|
1245 |
+
choice = [str(choice)]
|
1246 |
if isinstance(label_dropdown_value, str):
|
1247 |
label_dropdown_value = [label_dropdown_value]
|
1248 |
+
elif not isinstance(label_dropdown_value, list):
|
1249 |
+
label_dropdown_value = [str(label_dropdown_value)]
|
1250 |
if isinstance(text_dropdown_value, str):
|
1251 |
text_dropdown_value = [text_dropdown_value]
|
1252 |
+
elif not isinstance(text_dropdown_value, list):
|
1253 |
+
text_dropdown_value = [str(text_dropdown_value)]
|
1254 |
|
1255 |
filtered_df = df.copy()
|
1256 |
|
|
|
1275 |
|
1276 |
return filtered_df, recogniser_entities_drop, text_entities_drop
|
1277 |
|
1278 |
+
def update_redact_choice_df_from_page_dropdown(choice:str, df:pd.DataFrame):
    '''
    Filter the word-level OCR results dataframe to the page(s) chosen in the
    page dropdown.

    Args:
        choice (str): The selected page value(s). A bare string or scalar is
            normalised to a one-element list; the sentinel "ALL" keeps all pages.
        df (pd.DataFrame): OCR results with word-level bounding boxes. If it has
            no "index" column one is added in place so later steps can map
            filtered rows back to the original dataframe.

    Returns:
        pd.DataFrame: Filtered copy with columns page, line, word_text,
        word_x0, word_y0, word_x1, word_y1 and index.
    '''
    # Normalise the dropdown value into a list of strings
    if isinstance(choice, str):
        choice = [choice]
    elif not isinstance(choice, list):
        choice = [str(choice)]

    # Ensure an "index" column exists for mapping back to the base dataframe
    if "index" not in df.columns:
        df["index"] = df.index

    filtered_df = df[["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]].copy()

    # Apply filtering based on dropdown selections; "ALL" means no page filter
    if "ALL" not in choice:
        filtered_df = filtered_df.loc[filtered_df["page"].astype(str).isin(choice)]

    # NOTE: a gr.Dropdown used to be constructed here but was never returned or
    # rendered; that dead code has been removed.
    return filtered_df
|
1300 |
+
|
1301 |
def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
|
1302 |
'''
|
1303 |
Update the rows in a dataframe depending on the user choice from a dropdown
|
|
|
1351 |
def increase_bottom_page_count_based_on_top(page_number:int):
    '''
    Mirror the top page-number control onto the bottom one: return the same
    page number coerced to an int.
    '''
    synced_page_number = int(page_number)
    return synced_page_number
|
1353 |
|
1354 |
+
def df_select_callback_dataframe_row_ocr_with_words(df: pd.DataFrame, evt: gr.SelectData):
    '''
    Build a one-row dataframe describing the OCR word row the user selected.

    The selected row is read positionally from evt.row_value in fixed column
    order: page, line, word text, x0, y0, x1, y1, index. The df argument is
    present to match the Gradio select-callback signature but is not used.
    Returns the one-row dataframe and the selected word text.
    '''
    (page_value, line_value, text_value,
     x0_value, y0_value, x1_value, y1_value, index_value) = evt.row_value[:8]

    selected_row_df = pd.DataFrame(data={
        "page": [int(page_value)],       # page number
        "line": [int(line_value)],       # line number on the page
        "word_text": [text_value],       # the selected word's text
        "word_x0": [x0_value],           # bounding-box coordinates
        "word_y0": [y0_value],
        "word_x1": [x1_value],
        "word_y1": [y1_value],
        "index": index_value,            # row index back into the base dataframe
    })

    return selected_row_df, text_value
|
1371 |
+
|
1372 |
def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
|
1373 |
|
1374 |
row_value_page = int(evt.row_value[0]) # This is the page number value
|
|
|
1423 |
'''
|
1424 |
if text:
|
1425 |
# Get all rows with the same text as the selected row
|
1426 |
+
return df.loc[df["text"] == text]
|
1427 |
else:
|
1428 |
return pd.DataFrame(columns=["page", "label", "text", "id"])
|
1429 |
+
|
1430 |
+
def get_all_rows_with_same_text_redact(df: pd.DataFrame, text: str):
    '''
    Get all rows with the same text as the selected row for redaction tasks.

    Ensures df carries an "index" column (added in place when missing) so
    matches can be traced back to the original rows. When text is empty/falsy
    or df has no rows, an empty dataframe with the expected schema is returned.
    '''
    # Guarantee an "index" column exists; note this mutates the caller's frame.
    if "index" not in df.columns:
        df["index"] = df.index

    empty_schema = ["page", "line", "label", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]
    if not text or df.empty:
        return pd.DataFrame(columns=empty_schema)

    # Select every row whose word text matches the selected text exactly
    same_text_mask = df["word_text"] == text
    return df.loc[same_text_mask]
|
1442 |
|
1443 |
def update_selected_review_df_row_colour(
|
1444 |
redaction_row_selection: pd.DataFrame,
|
|
|
1626 |
|
1627 |
return x1, adobe_y1, x2, adobe_y2
|
1628 |
|
1629 |
+
def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=list(), document_cropboxes:List=list(), page_sizes:List[dict]=list()):
|
1630 |
'''
|
1631 |
Create an xfdf file from a review csv file and a pdf
|
1632 |
'''
|
|
|
1718 |
reparsed = minidom.parseString(rough_string)
|
1719 |
return reparsed.toxml() #.toprettyxml(indent=" ")
|
1720 |
|
1721 |
+
def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=list(), page_sizes:List[dict]=list()):
|
1722 |
'''
|
1723 |
Load in files to convert a review file into an Adobe comment file format
|
1724 |
'''
|
1725 |
+
output_paths = list()
|
1726 |
pdf_name = ""
|
1727 |
file_path_name = ""
|
1728 |
|
|
|
1821 |
# Define the namespace
|
1822 |
namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
|
1823 |
|
1824 |
+
redactions = list()
|
1825 |
|
1826 |
# Find all redact elements using the namespace
|
1827 |
for redact in root.findall('.//xfdf:redact', namespaces=namespace):
|
|
|
1853 |
Returns:
|
1854 |
- DataFrame containing redaction information
|
1855 |
'''
|
1856 |
+
output_paths = list()
|
1857 |
+
xfdf_paths = list()
|
1858 |
df = pd.DataFrame()
|
1859 |
|
1860 |
# Sort the file paths so that the pdfs come first
|