Sean Pedrick-Case committed on
Commit 58b064b · unverified · 2 parent(s): f2f92b5 72f39c9

Merge pull request #54 from seanpedrick-case/dev

Added search-text and easy-redact features, multi-language support, and support for PaddleOCR. Various minor fixes and package updates.

Dockerfile CHANGED
@@ -17,7 +17,7 @@ WORKDIR /src
 
 COPY requirements.txt .
 
-RUN pip install --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
 
 # Add lambda entrypoint and script
 COPY lambda_entrypoint.py .
@@ -54,7 +54,8 @@ ENV GRADIO_TEMP_DIR=/tmp/gradio_tmp/ \
     ACCESS_LOGS_FOLDER=$APP_HOME/app/logs/ \
     USAGE_LOGS_FOLDER=$APP_HOME/app/usage/ \
     CONFIG_FOLDER=$APP_HOME/app/config/ \
-    XDG_CACHE_HOME=/tmp/xdg_cache/user_1000
+    XDG_CACHE_HOME=/tmp/xdg_cache/user_1000 \
+    TESSERACT_DATA_FOLDER=/usr/share/tessdata
 
 # Create the base application directory and set its ownership
 RUN mkdir -p ${APP_HOME}/app && chown user:user ${APP_HOME}/app
@@ -81,13 +82,22 @@ RUN mkdir -p \
     ${APP_HOME}/app/logs \
     ${APP_HOME}/app/usage \
     ${APP_HOME}/app/feedback \
-    ${APP_HOME}/app/config
+    ${APP_HOME}/app/config
 
-# Now handle the /tmp and /var/tmp directories and their subdirectories
+# Now handle the /tmp and /var/tmp directories and their subdirectories, paddle, spacy, tessdata
 RUN mkdir -p /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache /tmp /var/tmp ${XDG_CACHE_HOME} \
     && chown user:user /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache ${XDG_CACHE_HOME} \
     && chmod 1777 /tmp /var/tmp /tmp/gradio_tmp /tmp/tld /tmp/matplotlib_cache \
-    && chmod 700 ${XDG_CACHE_HOME}
+    && chmod 700 ${XDG_CACHE_HOME} \
+    && mkdir -p ${APP_HOME}/.paddlex \
+    && chown user:user ${APP_HOME}/.paddlex \
+    && chmod 755 ${APP_HOME}/.paddlex \
+    && mkdir -p ${APP_HOME}/.local/share/spacy/data \
+    && chown user:user ${APP_HOME}/.local/share/spacy/data \
+    && chmod 755 ${APP_HOME}/.local/share/spacy/data \
+    && mkdir -p /usr/share/tessdata \
+    && chown user:user /usr/share/tessdata \
+    && chmod 755 /usr/share/tessdata
 
 # Copy installed packages from builder stage
 COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
@@ -115,6 +125,9 @@ VOLUME ["/home/user/app/logs"]
 VOLUME ["/home/user/app/usage"]
 VOLUME ["/home/user/app/feedback"]
 VOLUME ["/home/user/app/config"]
+VOLUME ["/home/user/.paddlex"]
+VOLUME ["/home/user/.local/share/spacy/data"]
+VOLUME ["/usr/share/tessdata"]
 VOLUME ["/tmp"]
 VOLUME ["/var/tmp"]
 
@@ -127,7 +140,8 @@ ENV PATH=$APP_HOME/.local/bin:$PATH \
     GRADIO_NUM_PORTS=1 \
     GRADIO_SERVER_NAME=0.0.0.0 \
     GRADIO_SERVER_PORT=7860 \
-    GRADIO_ANALYTICS_ENABLED=False
+    GRADIO_ANALYTICS_ENABLED=False
+
 
 ENTRYPOINT ["/entrypoint.sh"]
README.md CHANGED
@@ -10,9 +10,9 @@ license: agpl-3.0
 ---
 # Document redaction
 
-version: 0.7.1
 
-Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
 To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
@@ -20,7 +20,191 @@ After redaction, review suggested redactions on the 'Review redactions' tab. The
 
 NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
 
-# USER GUIDE
 
 ## Table of contents
 
@@ -35,7 +219,7 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
 - [Redacting only specific pages](#redacting-only-specific-pages)
 - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
 - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
-- [Redacting tabular data files (CSV/XLSX) or copy and pasted text](#redacting-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
 
 See the [advanced user guide here](#advanced-user-guide):
 - [Merging redaction review files](#merging-redaction-review-files)
@@ -225,9 +409,11 @@ On the 'Review redactions' tab you have a visual interface that allows you to in
 
 ### Uploading documents for review
 
-The top area has a file upload area where you can upload original, unredacted PDFs, alongside the '..._review_file.csv' that is produced by the redaction process. Once you have uploaded these two files, click the '**Review redactions based on original PDF...**' button to load in the files for review. This will allow you to visualise and modify the suggested redactions using the interface below.
 
-Optionally, you can also upload one of the '..._ocr_output.csv' files here that comes out of a redaction task, so that you can navigate the extracted text from the document.
 
 ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
 
@@ -315,6 +501,77 @@ Once you have filtered the table, or selected a row from the table, you have a f
 
 If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
 ### Navigating through the document using the 'Search all extracted text'
 
 The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
 
@@ -327,11 +584,11 @@ You can search through the extracted text by using the search bar just above the
 
 ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
 
-## Redacting tabular data files (XLSX/CSV) or copy and pasted text
 
-### Tabular data files (XLSX/CSV)
 
-The app can be used to redact tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
 
 To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
 ---
 # Document redaction
 
+version: 1.0.0
 
+Redact personally identifiable information (PII) from documents (pdf, images), Word files (.docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
 
 To identify text in documents, the 'local' text/OCR image analysis uses spacy/tesseract, and works quite well for documents with typed text. If available, choose 'AWS Textract service' to redact more complex elements e.g. signatures or handwriting. Then, choose a method for PII identification. 'Local' is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
 
 NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
 
+---
+
+## 🚀 Quick Start - Installation and first run
+
+Follow these instructions to get the document redaction application running on your local machine.
+
+### 1. Prerequisites: System Dependencies
+
+This application relies on two external tools for OCR (Tesseract) and PDF processing (Poppler). Please install them on your system before proceeding.
+
+---
+
+#### **On Windows**
+
+Installation on Windows requires downloading installers and adding the programs to your system's PATH.
+
+1. **Install Tesseract OCR:**
+    * Download the installer from the official [Tesseract at UB Mannheim page](https://github.com/UB-Mannheim/tesseract/wiki) (e.g., `tesseract-ocr-w64-setup-v5.X.X...exe`).
+    * Run the installer.
+    * **IMPORTANT:** During installation, ensure you select the option to "Add Tesseract to system PATH for all users" or a similar option. This is crucial for the application to find the Tesseract executable.
+
+2. **Install Poppler:**
+    * Download the latest Poppler binary for Windows. A common source is the [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows) GitHub releases page. Download the `.zip` file (e.g., `poppler-24.02.0-win.zip`).
+    * Extract the contents of the zip file to a permanent location on your computer, for example, `C:\Program Files\poppler\`.
+    * You must add the `bin` folder from your Poppler installation to your system's PATH environment variable:
+        * Search for "Edit the system environment variables" in the Windows Start Menu and open it.
+        * Click the "Environment Variables..." button.
+        * In the "System variables" section, find and select the `Path` variable, then click "Edit...".
+        * Click "New" and add the full path to the `bin` directory inside your Poppler folder (e.g., `C:\Program Files\poppler\poppler-24.02.0\bin`).
+        * Click OK on all windows to save the changes.
+
+To verify, open a new Command Prompt and run `tesseract --version` and `pdftoppm -v`. If both return version information, you have successfully installed the prerequisites.
+
+---
+
+#### **On Linux (Debian/Ubuntu)**
+
+Open your terminal and run the following command to install Tesseract and Poppler:
+
+```bash
+sudo apt-get update && sudo apt-get install -y tesseract-ocr poppler-utils
+```
+
+#### **On Linux (Fedora/CentOS/RHEL)**
+
+Open your terminal and use the `dnf` or `yum` package manager:
+
+```bash
+sudo dnf install -y tesseract poppler-utils
+```
+
+---
+
+### 2. Installation: Code and Python Packages
+
+Once the system prerequisites are installed, you can set up the Python environment.
+
+#### Step 1: Clone the Repository
+
+Open your terminal or Git Bash and clone this repository:
+
+```bash
+git clone https://github.com/seanpedrick-case/doc_redaction.git
+cd doc_redaction
+```
+
+#### Step 2: Create and Activate a Virtual Environment (Recommended)
+
+It is highly recommended to use a virtual environment to isolate project dependencies and avoid conflicts with other Python projects.
+
+```bash
+# Create the virtual environment
+python -m venv venv
+
+# Activate it
+# On Windows:
+.\venv\Scripts\activate
+
+# On macOS/Linux:
+source venv/bin/activate
+```
+
+#### Step 3: Install Python Dependencies
+
+This project uses `pyproject.toml` to manage dependencies. You can install everything with a single pip command. This process will also download the required Spacy models and other packages directly from their URLs.
+
+```bash
+pip install .
+```
+
+Alternatively, you can use the `requirements.txt` file:
+
+```bash
+pip install -r requirements.txt
+```
+
+### 3. Run the Application
+
+With all dependencies installed, you can now start the Gradio application.
+
+```bash
+python app.py
+```
+
+After running the command, the application will start, and you will see a local URL in your terminal (usually `http://127.0.0.1:7860`).
+
+Open this URL in your web browser to use the document redaction tool.
+
+---
+
+### 4. ⚙️ Configuration (Optional)
+
+You can customise the application's behaviour by creating a configuration file. This allows you to change settings without modifying the source code, such as enabling AWS features, changing logging behaviour, or pointing to local Tesseract/Poppler installations. A full overview of all the settings you can modify in the `app_config.env` file can be found in `tools/config.py`, with explanations on the documentation website for [the GitHub repo](https://seanpedrick-case.github.io/doc_redaction/).
+
+To get started:
+
+1. Locate the `example_config.env` file in the root of the project.
+2. Create a new file named `app_config.env` inside the `config/` directory (i.e., `config/app_config.env`).
+3. Copy the contents from `example_config.env` into your new `config/app_config.env` file.
+4. Modify the values in `config/app_config.env` to suit your needs. The application will automatically load these settings on startup.
+
+If you do not create this file, the application will run with default settings.
+
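The four steps above amount to copying one file into place. A minimal sketch in a throwaway directory (the `printf` line is a stand-in for the real `example_config.env` contents):

```shell
# Simulate the repo layout in a scratch directory, then perform steps 2-3.
tmp=$(mktemp -d) && cd "$tmp"
printf 'SESSION_OUTPUT_FOLDER=False\n' > example_config.env  # stand-in for the repo's file
mkdir -p config
cp example_config.env config/app_config.env
cat config/app_config.env
```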
+#### Configuration Breakdown
+
+Here is an overview of the most important settings, separated by whether they are for local use or require AWS.
+
+---
+
+#### **Local & General Settings (No AWS Required)**
+
+These settings are useful for all users, regardless of whether you are using AWS.
+
+* `TESSERACT_FOLDER` / `POPPLER_FOLDER`
+    * Use these if you installed Tesseract or Poppler to a custom location on **Windows** and did not add them to the system PATH.
+    * Provide the path to the respective installation folders (for Poppler, point to the `bin` sub-directory).
+    * **Examples:** `POPPLER_FOLDER=C:/Program Files/poppler-24.02.0/bin/`, `TESSERACT_FOLDER=tesseract/`
+
+* `SHOW_LANGUAGE_SELECTION=True`
+    * Set to `True` to display a language selection dropdown in the UI for OCR processing.
+
+* `CHOSEN_LOCAL_OCR_MODEL=tesseract`
+    * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. `tesseract` is the default and is recommended. `hybrid` is a combination of the two: a first pass through the document is done with Tesseract, and then a second pass is done with PaddleOCR on words with low confidence. `paddle` will only return whole-line text extraction, and so will only work for OCR, not redaction.
+
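The `hybrid` mode described above can be pictured as a confidence filter between two engines. A minimal sketch of the selection step, with illustrative names and threshold (not the app's actual code):

```python
# Hypothetical sketch of the 'hybrid' idea: keep Tesseract's output, but collect
# low-confidence words for a second pass with another engine (PaddleOCR in the app).
def words_needing_second_pass(tesseract_words, min_confidence=50):
    """tesseract_words: list of (text, confidence) pairs, confidence in 0-100."""
    return [text for text, conf in tesseract_words if conf < min_confidence]

words = [("Invoice", 96), ("Sm1th", 31), ("2024", 88)]
print(words_needing_second_pass(words))  # → ['Sm1th']
```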
+* `SESSION_OUTPUT_FOLDER=False`
+    * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
+
+* `DISPLAY_FILE_NAMES_IN_LOGS=False`
+    * For privacy, file names are not recorded in usage logs by default. Set to `True` to include them.
+
+---
+
+#### **AWS-Specific Settings**
+
+These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
+
+* `RUN_AWS_FUNCTIONS=1`
+    * **This is the master switch.** You must set this to `1` to enable any AWS functionality. If it is `0`, all other AWS settings will be ignored.
+
+* **UI Options:**
+    * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
+    * `SHOW_AWS_PII_DETECTION_OPTIONS=True`: Adds "AWS Comprehend" as an option in the PII detection dropdown.
+
+* **Core AWS Configuration:**
+    * `AWS_REGION=example-region`: Set your AWS region (e.g., `us-east-1`).
+    * `DOCUMENT_REDACTION_BUCKET=example-bucket`: The name of the S3 bucket the application will use for temporary file storage and processing.
+
+* **AWS Logging:**
+    * `SAVE_LOGS_TO_DYNAMODB=True`: If enabled, usage and feedback logs will be saved to DynamoDB tables.
+    * `ACCESS_LOG_DYNAMODB_TABLE_NAME`, `USAGE_LOG_DYNAMODB_TABLE_NAME`, etc.: Specify the names of your DynamoDB tables for logging.
+
+* **Advanced AWS Textract Features:**
+    * `SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True`: Enables UI components for large-scale, asynchronous document processing via Textract.
+    * `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output`: A separate S3 bucket for the final output of asynchronous Textract jobs.
+    * `LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True`: If enabled, the app will try to load the status of previously submitted asynchronous jobs from S3.
+
+* **Cost Tracking (for internal accounting):**
+    * `SHOW_COSTS=True`: Displays an estimated cost for AWS operations. Can be enabled even if AWS functions are off.
+    * `GET_COST_CODES=True`: Enables a dropdown for users to select a cost code before running a job.
+    * `COST_CODES_PATH=config/cost_codes.csv`: The local path to a CSV file containing your cost codes.
+    * `ENFORCE_COST_CODES=True`: Makes selecting a cost code mandatory before starting a redaction.
+
+Now that you have the app installed, what follows is a guide on how to use it for basic and advanced redaction.
+
+# User Guide
 
 ## Table of contents
 
 - [Redacting only specific pages](#redacting-only-specific-pages)
 - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
 - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
+- [Redacting Word, tabular data files (CSV/XLSX) or copy and pasted text](#redacting-word-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
 
 See the [advanced user guide here](#advanced-user-guide):
 - [Merging redaction review files](#merging-redaction-review-files)
 
 
 ### Uploading documents for review
 
+The top area has a file upload area where you can upload files for review. In the left box, upload the original PDF file, then click '1. Upload original PDF'. In the right box, you can upload the '..._review_file.csv' that is produced by the redaction process.
 
+Optionally, you can upload a '..._ocr_result_with_words' file here, which will allow you to search through the text and easily [add new redactions based on word search](#searching-and-adding-custom-redactions). You can also upload one of the '..._ocr_output.csv' files here that comes out of a redaction task, so that you can navigate the extracted text from the document. Click the button '2. Upload Review or OCR csv files' to load in these files.
+
+Now you can review and modify the suggested redactions using the interface described below.
 
 ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
 
 
 
 If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
 
+### Searching and Adding Custom Redactions
+
+After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII (Personally Identifiable Information) detection might have missed. The **"Search text to make new redactions"** tab gives you the power to find and redact any text within your document manually.
+
+#### How to Use the Search and Redact Feature
+
+The workflow is designed to be simple: **Search → Select → Redact**.
+
+---
+
+#### **Step 1: Search for Text**
+
+1. Navigate to the **"Search text to make new redactions"** tab.
+2. The main table will initially be populated with all the text extracted from the document, broken down by word.
+3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
+4. Click the **"Search"** button or press Enter.
+5. The table below will update to show only the rows containing text that matches your search query.
+
+> **Tip:** You can also filter the results by page number using the **"Page"** dropdown. To clear all filters and see the full text again, click the **"Reset table to original state"** button.
+
+---
+
+#### **Step 2: Select and Review a Match**
+
+When you click on any row in the search results table:
+
+* The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
+* The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
+
+---
+
+#### **Step 3: Choose Your Redaction Method**
+
+You have several powerful options for redacting the text you've found:
+
+* **Redact a Single, Specific Instance:**
+    * Click on the exact row in the table you want to redact.
+    * Click the **`Redact specific text row`** button.
+    * Only that single instance will be redacted.
+
+* **Redact All Instances of a Word/Phrase:**
+    * Let's say you want to redact the project name "Project Alpha" everywhere it appears.
+    * Find and select one instance of "Project Alpha" in the table.
+    * Click the **`Redact all words with same text as selected row`** button.
+    * The application will find and redact every occurrence of "Project Alpha" throughout the entire document.
+
+* **Redact All Current Search Results:**
+    * Perform a search (e.g., for a specific person's name).
+    * If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
+    * This will apply a redaction to all currently visible items in the table in one go.
+
+---
+
+#### **Customising Your New Redactions**
+
+Before you click one of the redact buttons, you can customise the appearance and label of the new redactions under the **"Search options"** accordion:
+
+* **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
+* **Colour for labels:** Set a custom colour for the redaction box by providing an RGB value. The format must be three numbers (0-255) in parentheses, for example:
+    * `(255, 0, 0)` for Red
+    * `(0, 0, 0)` for Black
+    * `(255, 255, 0)` for Yellow
+
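The "(R, G, B)" label-colour format described above is easy to validate. A minimal sketch of such a parser (illustrative, not the app's code):

```python
import re

def parse_rgb(text):
    """Parse a colour string like '(255, 0, 0)' into an (r, g, b) tuple of ints 0-255."""
    m = re.fullmatch(r"\s*\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)\s*", text)
    if not m:
        raise ValueError(f"Expected '(R, G, B)', got {text!r}")
    r, g, b = (int(v) for v in m.groups())
    if not all(0 <= v <= 255 for v in (r, g, b)):
        raise ValueError("Each value must be between 0 and 255")
    return (r, g, b)

print(parse_rgb("(255, 0, 0)"))  # → (255, 0, 0)
```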
+#### **Undoing a Mistake**
+
+If you make a mistake, you can reverse the last redaction action you performed on this tab.
+
+* Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all instances of a certain text, or all search results).
+
+> **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
+
 ### Navigating through the document using the 'Search all extracted text'
 
 The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).
 
 ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
 
+## Redacting Word, tabular data files (XLSX/CSV) or copy and pasted text
 
+### Word or tabular data files (XLSX/CSV)
 
+The app can be used to redact Word (.docx) files, or tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
 
 To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
cdk/cdk_config.py CHANGED
@@ -213,9 +213,9 @@ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
 
 ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
 SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'True')
-ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-log".lower())
-FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback".lower())
-USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage".lower())
+ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-access-logs".lower())
+FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-feedback-logs".lower())
+USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', f"{CDK_PREFIX}dynamodb-usage-logs".lower())
 
 ###
 # REDACTION OPTIONS
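These defaults are built with a `get_or_create_env_var` helper that this hunk does not show. A minimal stdlib sketch of what such a helper plausibly does (an assumption for illustration, not the project's actual implementation):

```python
import os

def get_or_create_env_var(name, default):
    """Return the environment variable if set; otherwise store the default in the
    environment (so later lookups agree) and return it."""
    value = os.environ.get(name)
    if value is None:
        os.environ[name] = default
        return default
    return value

# With the variable unset, the default wins and is cached in the environment:
print(get_or_create_env_var("EXAMPLE_TABLE_NAME", "prefix-dynamodb-usage-logs".lower()))
```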
cdk/cdk_stack.py CHANGED
@@ -990,6 +990,21 @@ class CdkStack(Stack):
     "sourceVolume": epheremal_storage_volume_name,
     "containerPath": "/tmp/gradio_tmp",
     "readOnly": False
+},
+{
+    "sourceVolume": epheremal_storage_volume_name,
+    "containerPath": "/home/user/.paddlex",
+    "readOnly": False
+},
+{
+    "sourceVolume": epheremal_storage_volume_name,
+    "containerPath": "/home/user/.local/share/spacy/data",
+    "readOnly": False
+},
+{
+    "sourceVolume": epheremal_storage_volume_name,
+    "containerPath": "/usr/share/tessdata",
+    "readOnly": False
 }
 ],
 "readonlyRootFilesystem": read_only_file_system,
cdk/post_cdk_build_quickstart.py CHANGED
@@ -13,10 +13,10 @@ start_codebuild_build(PROJECT_NAME=CODEBUILD_PROJECT_NAME)
 # Upload config.env file to S3 bucket
 upload_file_to_s3(local_file_paths="config/config.env", s3_key="", s3_bucket=S3_LOG_CONFIG_BUCKET_NAME)
 
-total_seconds = 450 # 7.5 minutes
+total_seconds = 660 # 11 minutes
 update_interval = 1 # Update every second
 
-print("Waiting 7.5 minutes for the CodeBuild container to build.")
+print("Waiting 11 minutes for the CodeBuild container to build.")
 
 # tqdm iterates over a range, and you perform a small sleep in each iteration
 for i in tqdm(range(total_seconds), desc="Building container"):
cdk/requirements.txt CHANGED
@@ -1,5 +1,5 @@
-aws-cdk-lib==2.202.0
-boto3==1.38.41
-pandas==2.3.0
+aws-cdk-lib==2.212.0
+boto3==1.40.10
+pandas==2.3.1
 nodejs==0.1.1
 python-dotenv==1.0.1
example_config.env ADDED
@@ -0,0 +1,26 @@
+ TESSERACT_FOLDER=tesseract/
+ POPPLER_FOLDER=poppler/poppler-24.02.0/Library/bin/
+ SHOW_LANGUAGE_SELECTION=True
+ CHOSEN_LOCAL_OCR_MODEL=tesseract
+
+ SESSION_OUTPUT_FOLDER=False
+ DISPLAY_FILE_NAMES_IN_LOGS=False
+
+ RUN_AWS_FUNCTIONS=1 # Set to 0 if you don't want to run AWS functions
+ SAVE_LOGS_TO_DYNAMODB=True
+ S3_COST_CODES_PATH=cost_codes.csv
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
+ SHOW_AWS_PII_DETECTION_OPTIONS=True
+ AWS_REGION=example-region
+ DOCUMENT_REDACTION_BUCKET=example-bucket
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS=True
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET=example-bucket-output
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3=True
+ ACCESS_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-access-log
+ USAGE_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-usage
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME=example-dynamodb-feedback
+ SHOW_COSTS=True
+ GET_COST_CODES=True
+ COST_CODES_PATH=config/cost_codes.csv
+ ENFORCE_COST_CODES=True
+ DEFAULT_COST_CODE=example_cost_code
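The app reads files like this with `python-dotenv`. The sketch below is a minimal stdlib equivalent showing how `KEY=VALUE` lines and inline `#` comments (as on the `RUN_AWS_FUNCTIONS` line) are parsed; `load_env_file` is illustrative, not the app's actual loader, and its naive `#` split assumes values never contain a literal `#`.

```python
import tempfile

def load_env_file(path: str) -> dict:
    """Parse KEY=VALUE lines, skipping blanks, comment lines, and
    inline '#' comments (assumes values contain no literal '#')."""
    values = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.split("#", 1)[0].strip()  # drop inline comments
            if not line or "=" not in line:
                continue
            key, _, value = line.partition("=")
            values[key.strip()] = value.strip()
    return values

# Round-trip a couple of lines from the example file above
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as f:
    f.write("CHOSEN_LOCAL_OCR_MODEL=tesseract\n")
    f.write("RUN_AWS_FUNCTIONS=1 # Set to 0 if you don't want to run AWS functions\n")
    path = f.name

config = load_env_file(path)
```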
index.qmd CHANGED
@@ -2,7 +2,7 @@
  title: "Home"
  ---

- version: 0.7.1
+ version: 1.0.0

  Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
load_dynamo_logs.py CHANGED
@@ -1,6 +1,7 @@
  import boto3
  import csv
  from decimal import Decimal
+ import datetime
  from boto3.dynamodb.conditions import Key

  from tools.config import AWS_REGION, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, OUTPUT_FOLDER
@@ -16,11 +17,26 @@ table = dynamodb.Table(TABLE_NAME)

  # Helper function to convert Decimal to float or int
  def convert_types(item):
+     new_item = {}
      for key, value in item.items():
+         # Handle Decimals first
          if isinstance(value, Decimal):
-             # Convert to int if no decimal places, else float
-             item[key] = int(value) if value % 1 == 0 else float(value)
-     return item
+             new_item[key] = int(value) if value % 1 == 0 else float(value)
+         # Handle Strings that might be dates
+         elif isinstance(value, str):
+             try:
+                 # Attempt to parse a common ISO 8601 format.
+                 # The .replace() handles the 'Z' for Zulu/UTC time.
+                 dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
+                 # Now that we have a datetime object, format it as desired
+                 new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+             except (ValueError, TypeError):
+                 # If it fails to parse, it's just a regular string
+                 new_item[key] = value
+         # Handle all other types
+         else:
+             new_item[key] = value
+     return new_item

  # Paginated scan
  def scan_table():
@@ -35,22 +51,43 @@ def scan_table():
      return items

  # Export to CSV
- def export_to_csv(items, output_path):
+ # Export to CSV
+ def export_to_csv(items, output_path, fields_to_drop: list = None):
      if not items:
          print("No items found.")
          return

-     fieldnames = sorted(items[0].keys())
+     # Use a set for efficient lookup
+     drop_set = set(fields_to_drop or [])
+
+     # Get a comprehensive list of all possible headers from all items
+     all_keys = set()
+     for item in items:
+         all_keys.update(item.keys())
+
+     # Determine the final fieldnames by subtracting the ones to drop
+     fieldnames = sorted(list(all_keys - drop_set))
+
+     print("Final CSV columns will be:", fieldnames)

-     with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
-         writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+     with open(output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
+         # The key fix is here: extrasaction='ignore'
+         # restval='' is also good practice to handle rows that are missing a key
+         writer = csv.DictWriter(
+             csvfile,
+             fieldnames=fieldnames,
+             extrasaction='ignore',
+             restval=''
+         )
          writer.writeheader()

          for item in items:
+             # The convert_types function can now return the full dict,
+             # and the writer will simply ignore the extra fields.
              writer.writerow(convert_types(item))

      print(f"Exported {len(items)} items to {output_path}")

  # Run export
  items = scan_table()
- export_to_csv(items, CSV_OUTPUT)
+ export_to_csv(items, CSV_OUTPUT, fields_to_drop=[])
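The updated `convert_types` helper and `DictWriter` settings can be exercised in isolation, without DynamoDB. A minimal sketch writing to an in-memory buffer instead of a file (the sample items are invented for illustration):

```python
import csv
import datetime
import io
from decimal import Decimal

def convert_types(item):
    # Same logic as the committed helper: Decimals become int/float,
    # ISO 8601 strings are reformatted, everything else passes through.
    new_item = {}
    for key, value in item.items():
        if isinstance(value, Decimal):
            new_item[key] = int(value) if value % 1 == 0 else float(value)
        elif isinstance(value, str):
            try:
                dt_obj = datetime.datetime.fromisoformat(value.replace('Z', '+00:00'))
                new_item[key] = dt_obj.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            except (ValueError, TypeError):
                new_item[key] = value
        else:
            new_item[key] = value
    return new_item

items = [
    {"id": Decimal("1"), "score": Decimal("0.5"), "when": "2024-01-02T03:04:05Z"},
    {"id": Decimal("2"), "note": "hello"},  # missing 'score' and 'when'
]

# Union of keys across all items, as in the new export_to_csv
fieldnames = sorted({k for item in items for k in item})
buf = io.StringIO()
# extrasaction='ignore' skips unexpected keys; restval='' fills missing ones
writer = csv.DictWriter(buf, fieldnames=fieldnames, extrasaction='ignore', restval='')
writer.writeheader()
for item in items:
    writer.writerow(convert_types(item))
```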
pyproject.toml CHANGED
@@ -4,38 +4,40 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "doc_redaction"
- version = "0.7.2"
+ version = "1.0.0"
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
  readme = "README.md"
  requires-python = ">=3.10"

  dependencies = [
-     "pdfminer.six==20240706",
+     "pdfminer.six==20250506",
      "pdf2image==1.17.0",
-     "pymupdf==1.26.1",
-     "opencv-python==4.10.0.84",
-     "presidio_analyzer==2.2.358",
-     "presidio_anonymizer==2.2.358",
-     "presidio-image-redactor==0.0.56",
-     "pikepdf==9.5.2",
-     "pandas==2.3.0",
-     "scikit-learn==1.6.1",
+     "pymupdf==1.26.3",
+     "opencv-python==4.12.0.88",
+     "presidio_analyzer==2.2.359",
+     "presidio_anonymizer==2.2.359",
+     "presidio-image-redactor==0.0.57",
+     "pikepdf==9.10.2",
+     "pandas==2.3.1",
+     "scikit-learn==1.7.1",
      "spacy==3.8.7",
      # Direct URL dependency for spacy model
      "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-     "gradio==5.34.2",
-     "boto3==1.39.1",
-     "pyarrow==19.0.1",
+     "gradio==5.43.1",
+     "boto3==1.40.10",
+     "pyarrow==21.0.0",
      "openpyxl==3.1.5",
-     "Faker==36.1.1",
-     "python-levenshtein==0.26.1",
+     "Faker==37.5.3",
+     "python-levenshtein==0.27.1",
      "spaczz==0.6.1",
      # Direct URL dependency for gradio_image_annotator wheel
      "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
-     "rapidfuzz==3.12.1",
+     "rapidfuzz==3.13.0",
      "python-dotenv==1.0.1",
-     "numpy==1.26.4",
-     "awslambdaric==3.0.1"
+     "awslambdaric==3.1.1",
+     "python-docx==1.2.0",
+     "paddlepaddle==3.1.0",
+     "paddleocr==3.1.1"
  ]

  [project.urls]
requirements.txt CHANGED
@@ -1,28 +1,30 @@
- pdfminer.six==20240706
+ pdfminer.six==20250506
  pdf2image==1.17.0
- pymupdf==1.26.1
- opencv-python==4.10.0.84
- presidio_analyzer==2.2.358
- presidio_anonymizer==2.2.358
- presidio-image-redactor==0.0.56
- pikepdf==9.5.2
- pandas==2.3.0
- scikit-learn==1.6.1
+ pymupdf==1.26.3
+ opencv-python==4.12.0.88
+ presidio_analyzer==2.2.359
+ presidio_anonymizer==2.2.359
+ presidio-image-redactor==0.0.57
+ pikepdf==9.10.2
+ pandas==2.3.1
+ scikit-learn==1.7.1
  spacy==3.8.7
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
- gradio==5.34.2
- boto3==1.39.1
- pyarrow==19.0.1
+ gradio==5.43.1
+ boto3==1.40.10
+ pyarrow==21.0.0
  openpyxl==3.1.5
- Faker==36.1.1
- python-levenshtein==0.26.1
+ Faker==37.5.3
+ python-levenshtein==0.27.1
  spaczz==0.6.1
  # The following version
  https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
- rapidfuzz==3.12.1
+ rapidfuzz==3.13.0
  python-dotenv==1.0.1
- numpy==1.26.4
- awslambdaric==3.0.1
+ awslambdaric==3.1.1
+ python-docx==1.2.0
+ paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+ paddleocr==3.1.1
src/app_settings.qmd CHANGED
@@ -115,6 +115,16 @@ Configuration for input and output file handling.
  * **Default Value:** `'input/'`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

+ * **`GRADIO_TEMP_DIR`**
+ * **Description:** Defines the path for Gradio's temporary file storage.
+ * **Default Value:** `'tmp/gradio_tmp/'`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
+ * **`MPLCONFIGDIR`**
+ * **Description:** Specifies the cache directory for the Matplotlib library, which is used for plotting and image handling.
+ * **Default Value:** `'tmp/matplotlib_cache/'`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
  ## Logging Options

  Settings for configuring application logging, including log formats and storage locations.
@@ -161,7 +171,7 @@ Settings for configuring application logging, including log formats and storage

  * **`CSV_USAGE_LOG_HEADERS`**
  * **Description:** Defines custom headers for CSV usage logs.
- * **Default Value:** A predefined list of header names. Refer to `tools/config.py` for the complete list.
+ * **Default Value:** A predefined list of header names. Refer to `config.py` for the complete list.
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

  * **`SAVE_LOGS_TO_DYNAMODB`**
@@ -214,12 +224,17 @@ Settings for configuring application logging, including log formats and storage
  Configurations related to the text redaction process, including PII detection models and external tool paths.

  * **`TESSERACT_FOLDER`**
- * **Description:** Path to the local Tesseract OCR installation folder. Only required if Tesseract is not in path, or you are running a version of the app as an .exe installed with Pyinstaller. Gives the path to the local Tesseract OCR model for text extraction.
+ * **Description:** Path to the local Tesseract OCR installation folder. Only required if Tesseract is not in the system's PATH, or when running a packaged executable (e.g., via PyInstaller).
  * **Default Value:** `""` (empty string)
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

+ * **`TESSERACT_DATA_FOLDER`**
+ * **Description:** Path to the Tesseract trained data files (e.g., `tessdata`).
+ * **Default Value:** `"/usr/share/tessdata"`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
  * **`POPPLER_FOLDER`**
- * **Description:** Path to the local Poppler installation's `bin` folder. Only required if Tesseract is not in path, or you are running a version of the app as an .exe installed with Pyinstaller. Poppler is used for PDF processing.
+ * **Description:** Path to the local Poppler installation's `bin` folder. Poppler is used for PDF processing. Only required if Poppler is not in the system's PATH.
  * **Default Value:** `""` (empty string)
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
@@ -283,24 +298,34 @@ Configurations related to the text redaction process, including PII detection mo
  * **Default Value:** Value of `AWS_PII_OPTION` if `SHOW_AWS_PII_DETECTION_OPTIONS` is True, else value of `LOCAL_PII_OPTION`.
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. Provide one of the PII detection option display names.

+ * **`CHOSEN_LOCAL_OCR_MODEL`**
+ * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid"`. "paddle" is effective for line extraction but not word-level redaction. "hybrid" uses Tesseract first, then PaddleOCR for low-confidence words.
+ * **Default Value:** `"tesseract"`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
+ * **`PREPROCESS_LOCAL_OCR_IMAGES`**
+ * **Description:** If set to `"True"`, images will be preprocessed (e.g., deskewed, contrast adjusted) before being sent to the local OCR engine. This can sometimes yield worse results on clean scans.
+ * **Default Value:** `"False"`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
  * **`CHOSEN_COMPREHEND_ENTITIES`**
  * **Description:** A list of AWS Comprehend PII entity types to be redacted when using AWS Comprehend.
- * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
+ * **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.

  * **`FULL_COMPREHEND_ENTITY_LIST`**
  * **Description:** The complete list of PII entity types supported by AWS Comprehend that can be selected for redaction.
- * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
+ * **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
  * **Configuration:** This is typically an informational variable reflecting the capabilities of AWS Comprehend and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_COMPREHEND_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.

  * **`CHOSEN_REDACT_ENTITIES`**
  * **Description:** A list of local PII entity types to be redacted when using the local PII detection model.
- * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
+ * **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`. This should be a string representation of a Python list.

  * **`FULL_ENTITY_LIST`**
  * **Description:** The complete list of PII entity types supported by the local PII detection model that can be selected for redaction.
- * **Default Value:** A predefined list of entity types. Refer to `tools/config.py` for the complete list.
+ * **Default Value:** A predefined list of entity types. Refer to `config.py` for the complete list.
  * **Configuration:** This is typically an informational variable reflecting the capabilities of the local model and is not meant to be changed by users directly affecting redaction behavior (use `CHOSEN_REDACT_ENTITIES` for that). Set as an environment variable directly, or include in `config/app_config.env`.

  * **`PAGE_BREAK_VALUE`**
@@ -309,20 +334,15 @@ Configurations related to the text redaction process, including PII detection mo
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

  * **`MAX_TIME_VALUE`**
- * **Description:** Specifies the maximum time (in arbitrary units, likely seconds or milliseconds depending on implementation) for a process before it might be timed out.
+ * **Description:** Specifies a maximum time value for long-running processes.
  * **Default Value:** `'999999'`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

  * **`CUSTOM_BOX_COLOUR`**
- * **Description:** Allows specifying a custom color for the redaction boxes drawn on documents (e.g., "grey", "red", "#FF0000"). If empty, a default color is used.
+ * **Description:** Allows specifying a custom color for the redaction boxes drawn on documents. Only `"grey"` is currently supported as a custom value. If empty, a default color is used.
  * **Default Value:** `""` (empty string)
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

- * **`REDACTION_LANGUAGE`**
- * **Description:** Specifies the language for redaction processing. Currently, only "en" (English) is supported.
- * **Default Value:** `"en"`
- * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
-
  * **`RETURN_PDF_END_OF_REDACTION`**
  * **Description:** If set to `'True'`, the application will return a PDF document at the end of the redaction task.
  * **Default Value:** `"True"`
@@ -333,13 +353,42 @@
  * **Default Value:** `"False"`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

+ ## Language Options
+
+ Settings for multi-language support in OCR and PII detection.
+
+ * **`SHOW_LANGUAGE_SELECTION`**
+ * **Description:** If set to `"True"`, a dropdown menu for language selection will be visible in the user interface.
+ * **Default Value:** `"False"`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
+ * **`DEFAULT_LANGUAGE_FULL_NAME`**
+ * **Description:** The default language's full name (e.g., "english") to be displayed in the UI.
+ * **Default Value:** `"english"`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
+ * **`DEFAULT_LANGUAGE`**
+ * **Description:** The default language's short code (e.g., "en") used by the backend engines. Ensure the corresponding Tesseract/PaddleOCR language packs are installed.
+ * **Default Value:** `"en"`
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
+ * **`MAPPED_LANGUAGE_CHOICES`**
+ * **Description:** A string list of full language names (e.g., 'english', 'french') presented to the user in the language dropdown.
+ * **Default Value:** A predefined list. See `config.py`.
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
+ * **`LANGUAGE_CHOICES`**
+ * **Description:** A string list of short language codes (e.g., 'en', 'fr') that correspond to `MAPPED_LANGUAGE_CHOICES`. This is what the backend uses.
+ * **Default Value:** A predefined list. See `config.py`.
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+
  ## App Run Options

  General runtime configurations for the application.

  * **`TLDEXTRACT_CACHE`**
- * **Description:** Path to the cache file used by the `tldextract` library, which helps in accurately extracting top-level domains (TLDs) from URLs.
- * **Default Value:** `'tld/.tld_set_snapshot'`
+ * **Description:** Path to the cache directory used by the `tldextract` library, which helps in accurately extracting top-level domains (TLDs) from URLs.
+ * **Default Value:** `'tmp/tld/'`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.

  * **`COGNITO_AUTH`**
@@ -436,7 +485,7 @@ Settings related to tracking and applying cost codes for application usage.
  Configurations for features related to processing whole documents via APIs, particularly AWS Textract for large documents.

  * **`SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS`**
- * **Description:** Controls whether UI options for whole document Textract calls are displayed. (Note: Mentioned as not currently implemented in the source).
+ * **Description:** Controls whether UI options for whole document Textract calls are displayed.
  * **Default Value:** `'False'`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
@@ -461,12 +510,12 @@ Configurations for features related to processing whole documents via APIs, part
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).

  * **`TEXTRACT_JOBS_S3_LOC`**
- * **Description:** The S3 subfolder (within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`) where Textract job data (output) is stored.
+ * **Description:** The S3 subfolder (within the main redaction bucket) where Textract job data (output) is stored.
  * **Default Value:** `'output'`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).

  * **`TEXTRACT_JOBS_S3_INPUT_LOC`**
- * **Description:** The S3 subfolder (within `TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET`) where Textract job input is stored.
+ * **Description:** The S3 subfolder (within the main redaction bucket) where Textract job input is stored.
  * **Default Value:** `'input'`
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
@@ -478,4 +527,4 @@ Configurations for features related to processing whole documents via APIs, part
  * **`DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS`**
  * **Description:** Specifies the number of past days for which to display whole document Textract jobs in the UI.
  * **Default Value:** `'7'`
- * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
+ * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
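Several of the settings above (`CHOSEN_REDACT_ENTITIES`, `MAPPED_LANGUAGE_CHOICES`, `LANGUAGE_CHOICES`) are documented as string representations of Python lists. A hedged sketch of how such values can be parsed and how the full-name/short-code lists pair up into a lookup — the exact parsing in `config.py` may differ, and the values below are illustrative:

```python
import ast
import os

# Hypothetical env values in the documented "string representation of a
# Python list" format
os.environ["MAPPED_LANGUAGE_CHOICES"] = "['english', 'french', 'german']"
os.environ["LANGUAGE_CHOICES"] = "['en', 'fr', 'de']"

def parse_list_env(name: str, default: str = "[]") -> list:
    """Safely parse an environment variable holding a Python-list literal."""
    return ast.literal_eval(os.environ.get(name, default))

full_names = parse_list_env("MAPPED_LANGUAGE_CHOICES")
codes = parse_list_env("LANGUAGE_CHOICES")

# Map the UI-facing full name to the short code the backend engines use
language_map = dict(zip(full_names, codes))
```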
src/user_guide.qmd CHANGED
@@ -20,7 +20,7 @@ format:
20
  - [Redacting only specific pages](#redacting-only-specific-pages)
21
  - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
22
  - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
23
- - [Redacting tabular data files (CSV/XLSX) or copy and pasted text](#redacting-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)
24
 
25
  See the [advanced user guide here](#advanced-user-guide):
26
  - [Merging redaction review files](#merging-redaction-review-files)
@@ -210,9 +210,11 @@ On the 'Review redactions' tab you have a visual interface that allows you to in
210
 
211
  ### Uploading documents for review
212
 
213
- The top area has a file upload area where you can upload original, unredacted PDFs, alongside the '..._review_file.csv' that is produced by the redaction process. Once you have uploaded these two files, click the '**Review redactions based on original PDF...**' button to load in the files for review. This will allow you to visualise and modify the suggested redactions using the interface below.
214
 
215
- Optionally, you can also upload one of the '..._ocr_output.csv' files here that comes out of a redaction task, so that you can navigate the extracted text from the document.
 
 
216
 
217
  ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)
218
 
@@ -300,6 +302,77 @@ Once you have filtered the table, or selected a row from the table, you have a f
300
 
301
  If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  ### Navigating through the document using the 'Search all extracted text'

  The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).

@@ -312,11 +385,11 @@ You can search through the extracted text by using the search bar just above the

  ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)

- ## Redacting tabular data files (XLSX/CSV) or copy and pasted text

- ### Tabular data files (XLSX/CSV)

- The app can be used to redact tabular data files such as xlsx or csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.

  To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.
  - [Redacting only specific pages](#redacting-only-specific-pages)
  - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
  - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
+ - [Redacting Word, tabular data files (CSV/XLSX) or copy and pasted text](#redacting-word-tabular-data-files-xlsxcsv-or-copy-and-pasted-text)

  See the [advanced user guide here](#advanced-user-guide):
  - [Merging redaction review files](#merging-redaction-review-files)
  ### Uploading documents for review

+ The top area has a file upload area where you can upload files for review. In the left box, upload the original PDF file, then click '1. Upload original PDF'. In the right box, you can upload the '..._review_file.csv' that is produced by the redaction process.

+ Optionally, you can upload a '..._ocr_result_with_words' file here, which will allow you to search through the text and easily [add new redactions based on word search](#searching-and-adding-custom-redactions). You can also upload a '..._ocr_output.csv' file here, which comes out of a redaction task, so that you can navigate the extracted text from the document. Click the '2. Upload Review or OCR csv files' button to load in these files.
+
+ Now you can review and modify the suggested redactions using the interface described below.

  ![Search extracted text](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)

  If you made a mistake, click the 'Undo last element removal' button to restore the Search suggested redactions table to its previous state (can only undo the last action).
+ ### Searching and Adding Custom Redactions
+
+ After a document has been processed, you may need to redact specific terms, names, or phrases that the automatic PII (Personally Identifiable Information) detection might have missed. The **"Search text to make new redactions"** tab gives you the power to find and redact any text within your document manually.
+
+ #### How to Use the Search and Redact Feature
+
+ The workflow is designed to be simple: **Search → Select → Redact**.
+
+ ---
+
+ #### **Step 1: Search for Text**
+
+ 1. Navigate to the **"Search text to make new redactions"** tab.
+ 2. The main table will initially be populated with all the text extracted from the document, broken down by word.
+ 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find.
+ 4. Click the **"Search"** button or press Enter.
+ 5. The table below will update to show only the rows containing text that matches your search query.
+
+ > **Tip:** You can also filter the results by page number using the **"Page"** dropdown. To clear all filters and see the full text again, click the **"Reset table to original state"** button.
+
+ ---
+
+ #### **Step 2: Select and Review a Match**
+
+ When you click on any row in the search results table:
+
+ * The document preview on the left will automatically jump to that page, allowing you to see the word in its original context.
+ * The details of your selection will appear in the smaller **"Selected row"** table for confirmation.
+
+ ---
+
+ #### **Step 3: Choose Your Redaction Method**
+
+ You have several powerful options for redacting the text you've found:
+
+ * **Redact a Single, Specific Instance:**
+     * Click on the exact row in the table you want to redact.
+     * Click the **`Redact specific text row`** button.
+     * Only that single instance will be redacted.
+
+ * **Redact All Instances of a Word/Phrase:**
+     * Let's say you want to redact the project name "Project Alpha" everywhere it appears.
+     * Find and select one instance of "Project Alpha" in the table.
+     * Click the **`Redact all words with same text as selected row`** button.
+     * The application will find and redact every single occurrence of "Project Alpha" throughout the entire document.
+
+ * **Redact All Current Search Results:**
+     * Perform a search (e.g., for a specific person's name).
+     * If you are confident that every result shown in the filtered table should be redacted, click the **`Redact all text in table`** button.
+     * This will apply a redaction to all currently visible items in the table in one go.
+
+ ---
+
+ #### **Customising Your New Redactions**
+
+ Before you click one of the redact buttons, you can customise the appearance and label of the new redactions under the **"Search options"** accordion:
+
+ * **Label for new redactions:** Change the text that appears on the redaction box (default is "Redaction"). You could change this to "CONFIDENTIAL" or "CUSTOM".
+ * **Colour for labels:** Set a custom colour for the redaction box by providing an RGB value. The format must be three numbers (0-255) in parentheses, for example:
+     * `(255, 0, 0)` for Red
+     * `(0, 0, 0)` for Black
+     * `(255, 255, 0)` for Yellow
+
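The accepted colour format can be illustrated with a short sketch. This is a hypothetical parser written for this guide, not the app's own code; it simply shows which strings match the "(R, G, B)" pattern described above:

```python
import re

def parse_rgb(value: str) -> tuple:
    """Parse a colour string like '(255, 0, 0)' into an (R, G, B) tuple."""
    match = re.fullmatch(
        r"\s*\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)\s*", value
    )
    if not match:
        raise ValueError(f"Expected '(R, G, B)', got: {value!r}")
    rgb = tuple(int(group) for group in match.groups())
    if any(channel > 255 for channel in rgb):
        raise ValueError(f"Each channel must be 0-255, got: {rgb}")
    return rgb

print(parse_rgb("(255, 0, 0)"))  # (255, 0, 0)
```

Surrounding whitespace is tolerated, but anything other than three 0-255 numbers in parentheses is rejected.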
+ #### **Undoing a Mistake**
+
+ If you make a mistake, you can reverse the last redaction action you performed on this tab.
+
+ * Click the **`Undo latest redaction`** button. This will revert the last set of redactions you added (whether it was a single row, all of a certain text, or all search results).
+
+ > **Important:** This undo button only works for the *most recent* action. It maintains a single backup state, so it cannot undo actions that are two or more steps in the past.
+
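The single-backup-state behaviour described above can be sketched as follows. This is an illustration of the described semantics only, not the app's implementation:

```python
# Single-backup undo: only the state before the most recent action is kept,
# so at most one action can ever be reversed.
class SingleUndo:
    def __init__(self, state):
        self.state = state
        self._backup = None

    def apply(self, new_state):
        self._backup = self.state   # overwrites any older backup
        self.state = new_state

    def undo(self):
        if self._backup is not None:
            self.state, self._backup = self._backup, None

u = SingleUndo(["redaction_1"])
u.apply(["redaction_1", "redaction_2"])
u.apply(["redaction_1", "redaction_2", "redaction_3"])
u.undo()
print(u.state)  # ['redaction_1', 'redaction_2'] — only the last action reverts
```

Calling `undo()` a second time has no effect, which matches the "single backup state" note above.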
  ### Navigating through the document using the 'Search all extracted text'

  The 'search all extracted text' table will contain text if you have just redacted a document, or if you have uploaded a '..._ocr_output.csv' file alongside a document file and review file on the Review redactions tab as [described above](#uploading-documents-for-review).

  ![Search suggested redaction area](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/review_redactions/search_extracted_text.PNG)

+ ## Redacting Word, tabular data files (XLSX/CSV) or copy and pasted text

+ ### Word or tabular data files (XLSX/CSV)

+ The app can be used to redact Word (.docx) files, or tabular data files such as .xlsx or .csv files. For this to work properly, your data file needs to be in a simple table format, with a single table starting from the first cell (A1), and no other information in the sheet. Similarly for .xlsx files, each sheet in the file that you want to redact should be in this simple format.
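The expected "simple table" layout can be shown with a minimal sketch (illustrative dummy data, not the real example file): a header row in the first cell, data rows immediately below, nothing else in the sheet.

```python
import csv
from io import StringIO

# A file in the expected layout: headers in row 1 starting at A1,
# data rows immediately below, and no other content in the sheet.
csv_text = """case_id,note
1,Spoke with John Smith on 01234 567890.
2,Visited the client at 1 Example Street.
"""

reader = csv.DictReader(StringIO(csv_text))
rows = list(reader)
print(reader.fieldnames)  # the column names the app would offer for redaction
print(len(rows))          # number of data rows
```

Files with title rows, merged cells, or multiple tables per sheet will not be read correctly.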
 
  To demonstrate this, we can use [the example csv file 'combined_case_notes.csv'](https://github.com/seanpedrick-case/document_redaction_examples/blob/main/combined_case_notes.csv), which is a small dataset of dummy social care case notes. Go to the 'Open text or Excel/csv files' tab. Drop the file into the upload area. After the file is loaded, you should see the suggested columns for redaction in the box underneath. You can select and deselect columns to redact as you wish from this list.

tools/aws_textract.py CHANGED
@@ -3,7 +3,6 @@ from typing import List
  import io
  import os
  import json
- from collections import defaultdict
  import pikepdf
  import time
  import pandas as pd
@@ -13,17 +12,12 @@ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
  def extract_textract_metadata(response:object):
      """Extracts metadata from an AWS Textract response."""

-     #print("Document metadata:", response['DocumentMetadata'])
-
      request_id = response['ResponseMetadata']['RequestId']
      pages = response['DocumentMetadata']['Pages']
-     #number_of_pages = response['DocumentMetadata']['NumberOfPages']

      return str({
          'RequestId': request_id,
          'Pages': pages
-         #,
-         #'NumberOfPages': number_of_pages
      })
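For reviewers, the metadata extractor above can be exercised with a minimal fake response. The dict shape below is taken from the fields the function reads; the values are made up, and the function is restated so the sketch is self-contained:

```python
# Minimal fake Textract response covering only the fields that
# extract_textract_metadata reads (values are made up).
fake_response = {
    "ResponseMetadata": {"RequestId": "abc-123"},
    "DocumentMetadata": {"Pages": 2},
}

def extract_textract_metadata(response):
    """Extracts metadata from an AWS Textract response (as in the diff above)."""
    request_id = response["ResponseMetadata"]["RequestId"]
    pages = response["DocumentMetadata"]["Pages"]
    return str({"RequestId": request_id, "Pages": pages})

print(extract_textract_metadata(fake_response))
```

Note the function returns the stringified dict, not the dict itself, since callers log it as a string.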

  def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Extract handwriting", "Redact all identified signatures"]):
@@ -54,7 +48,6 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
              time.sleep(3)
              response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
          else:
-             #print("Analysing document without signature detection")
              # Call detect_document_text to extract plain text
              try:
                  response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
@@ -74,12 +67,8 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
          'data': response
      }

-     #print("response:", response)
-
      request_metadata = extract_textract_metadata(response) # Metadata comes out as a string

-     #print("request_metadata:", request_metadata)
-
      # Return a list containing the wrapped response and the metadata
      return wrapped_response, request_metadata # Return as a list to match the desired structure
@@ -103,179 +92,8 @@ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
      # Now you can use the `pdf_bytes` to convert it to an image or further process
      buffer.close()

-     #images = convert_from_bytes(pdf_bytes)
-     #image = images[0]
-
      return pdf_bytes

- # def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
112
- # '''
113
- # Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
114
- # '''
115
- # all_ocr_results = []
116
- # signature_or_handwriting_recogniser_results = []
117
- # signature_recogniser_results = []
118
- # handwriting_recogniser_results = []
119
- # signatures = []
120
- # handwriting = []
121
- # ocr_results_with_words = {}
122
- # text_block={}
123
-
124
- # i = 1
125
-
126
- # # Assuming json_data is structured as a dictionary with a "pages" key
127
- # #if "pages" in json_data:
128
- # # Find the specific page data
129
- # page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)
130
-
131
- # #print("page_json_data:", page_json_data)
132
-
133
- # if "Blocks" in page_json_data:
134
- # # Access the data for the specific page
135
- # text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
136
- # # This is a new page
137
- # elif "page_no" in page_json_data:
138
- # text_blocks = page_json_data["data"]["Blocks"]
139
- # else: text_blocks = []
140
-
141
- # is_signature = False
142
- # is_handwriting = False
143
-
144
- # for text_block in text_blocks:
145
-
146
- # if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |
147
-
148
- # # Extract text and bounding box for the line
149
- # line_bbox = text_block["Geometry"]["BoundingBox"]
150
- # line_left = int(line_bbox["Left"] * page_width)
151
- # line_top = int(line_bbox["Top"] * page_height)
152
- # line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
153
- # line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)
154
-
155
- # width_abs = int(line_bbox["Width"] * page_width)
156
- # height_abs = int(line_bbox["Height"] * page_height)
157
-
158
- # if text_block['BlockType'] == 'LINE':
159
-
160
- # # Extract text and bounding box for the line
161
- # line_text = text_block.get('Text', '')
162
- # words = []
163
- # current_line_handwriting_results = [] # Track handwriting results for this line
164
-
165
- # if 'Relationships' in text_block:
166
- # for relationship in text_block['Relationships']:
167
- # if relationship['Type'] == 'CHILD':
168
- # for child_id in relationship['Ids']:
169
- # child_block = next((block for block in text_blocks if block['Id'] == child_id), None)
170
- # if child_block and child_block['BlockType'] == 'WORD':
171
- # word_text = child_block.get('Text', '')
172
- # word_bbox = child_block["Geometry"]["BoundingBox"]
173
- # confidence = child_block.get('Confidence','')
174
- # word_left = int(word_bbox["Left"] * page_width)
175
- # word_top = int(word_bbox["Top"] * page_height)
176
- # word_right = int((word_bbox["Left"] + word_bbox["Width"]) * page_width)
177
- # word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)
178
-
179
- # # Extract BoundingBox details
180
- # word_width = word_bbox["Width"]
181
- # word_height = word_bbox["Height"]
182
-
183
- # # Convert proportional coordinates to absolute coordinates
184
- # word_width_abs = int(word_width * page_width)
185
- # word_height_abs = int(word_height * page_height)
186
-
187
- # words.append({
188
- # 'text': word_text,
189
- # 'bounding_box': (word_left, word_top, word_right, word_bottom)
190
- # })
191
- # # Check for handwriting
192
- # text_type = child_block.get("TextType", '')
193
-
194
- # if text_type == "HANDWRITING":
195
- # is_handwriting = True
196
- # entity_name = "HANDWRITING"
197
- # word_end = len(word_text)
198
-
199
- # recogniser_result = CustomImageRecognizerResult(
200
- # entity_type=entity_name,
201
- # text=word_text,
202
- # score=confidence,
203
- # start=0,
204
- # end=word_end,
205
- # left=word_left,
206
- # top=word_top,
207
- # width=word_width_abs,
208
- # height=word_height_abs
209
- # )
210
-
211
- # # Add to handwriting collections immediately
212
- # handwriting.append(recogniser_result)
213
- # handwriting_recogniser_results.append(recogniser_result)
214
- # signature_or_handwriting_recogniser_results.append(recogniser_result)
215
- # current_line_handwriting_results.append(recogniser_result)
216
-
217
- # # If handwriting or signature, add to bounding box
218
-
219
- # elif (text_block['BlockType'] == 'SIGNATURE'):
220
- # line_text = "SIGNATURE"
221
- # is_signature = True
222
- # entity_name = "SIGNATURE"
223
- # confidence = text_block.get('Confidence', 0)
224
- # word_end = len(line_text)
225
-
226
- # recogniser_result = CustomImageRecognizerResult(
227
- # entity_type=entity_name,
228
- # text=line_text,
229
- # score=confidence,
230
- # start=0,
231
- # end=word_end,
232
- # left=line_left,
233
- # top=line_top,
234
- # width=width_abs,
235
- # height=height_abs
236
- # )
237
-
238
- # # Add to signature collections immediately
239
- # signatures.append(recogniser_result)
240
- # signature_recogniser_results.append(recogniser_result)
241
- # signature_or_handwriting_recogniser_results.append(recogniser_result)
242
-
243
- # words = [{
244
- # 'text': line_text,
245
- # 'bounding_box': (line_left, line_top, line_right, line_bottom)
246
- # }]
247
-
248
- # ocr_results_with_words["text_line_" + str(i)] = {
249
- # "line": i,
250
- # 'text': line_text,
251
- # 'bounding_box': (line_left, line_top, line_right, line_bottom),
252
- # 'words': words
253
- # }
254
-
255
- # # Create OCRResult with absolute coordinates
256
- # ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs)
257
- # all_ocr_results.append(ocr_result)
258
-
259
- # is_signature_or_handwriting = is_signature | is_handwriting
260
-
261
- # # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
262
- # if is_signature_or_handwriting:
263
- # if recogniser_result not in signature_or_handwriting_recogniser_results:
264
- # signature_or_handwriting_recogniser_results.append(recogniser_result)
265
-
266
- # if is_signature:
267
- # if recogniser_result not in signature_recogniser_results:
268
- # signature_recogniser_results.append(recogniser_result)
269
-
270
- # if is_handwriting:
271
- # if recogniser_result not in handwriting_recogniser_results:
272
- # handwriting_recogniser_results.append(recogniser_result)
273
-
274
- # i += 1
275
-
276
- # return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words
277
-
278
-
279
  def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
      '''
      Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
@@ -289,15 +107,13 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
      ocr_results_with_words = {}
      text_block={}

-     i = 1

      # Assuming json_data is structured as a dictionary with a "pages" key
-     #if "pages" in json_data:
      # Find the specific page data
      page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)

-     #print("page_json_data:", page_json_data)
-
      if "Blocks" in page_json_data:
          # Access the data for the specific page
          text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data
@@ -424,8 +240,8 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_

              if line_text:

-                 ocr_results_with_words["text_line_" + str(i)] = {
-                     "line": i,
                      'text': line_text,
                      'bounding_box': (line_left, line_top, line_right, line_bottom),
                      'words': words,
@@ -433,9 +249,12 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
                  }

                  # Create OCRResult with absolute coordinates
-                 ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs)
                  all_ocr_results.append(ocr_result)

              is_signature_or_handwriting = is_signature | is_handwriting

              # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser
@@ -451,7 +270,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
                  if recogniser_result not in handwriting_recogniser_results:
                      handwriting_recogniser_results.append(recogniser_result)

-             i += 1

      # Add page key to the line level results
      all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}
@@ -459,7 +278,6 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_

      return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page

-
  def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
      """
      Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
 
  def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
      '''
      Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.

      ocr_results_with_words = {}
      text_block={}

+     text_line_number = 1

      # Assuming json_data is structured as a dictionary with a "pages" key
+
      # Find the specific page data
      page_json_data = json_data #next((page for page in json_data["pages"] if page["page_no"] == page_no), None)

      if "Blocks" in page_json_data:
          # Access the data for the specific page
          text_blocks = page_json_data["Blocks"] # Access the Blocks within the page data

              if line_text:

+                 ocr_results_with_words["text_line_" + str(text_line_number)] = {
+                     "line": text_line_number,
                      'text': line_text,
                      'bounding_box': (line_left, line_top, line_right, line_bottom),
                      'words': words,
                  }

                  # Create OCRResult with absolute coordinates
+                 ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs, conf=confidence, line=text_line_number)
                  all_ocr_results.append(ocr_result)

+                 # Increase line number
+                 text_line_number += 1
+
              is_signature_or_handwriting = is_signature | is_handwriting

              # If it is signature or handwriting, will overwrite the default behaviour of the PII analyser

                  if recogniser_result not in handwriting_recogniser_results:
                      handwriting_recogniser_results.append(recogniser_result)

+

      # Add page key to the line level results
      all_ocr_results_with_page = {"page": page_no, "results": all_ocr_results}

      return all_ocr_results_with_page, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_words_with_page
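The new `text_line_number` bookkeeping above keys each line as `text_line_1`, `text_line_2`, and so on. A minimal sketch of the resulting structure (made-up text and coordinates, illustrative only):

```python
# Sketch of the per-line dict shape built by json_to_ocrresult:
# one entry per OCR line, keyed "text_line_<n>", carrying the line text,
# its bounding box, and the word-level boxes within it.
text_line_number = 1
ocr_results_with_words = {}

for line_text, bbox, words in [
    ("Hello world", (10, 20, 110, 40),
     [{"text": "Hello", "bounding_box": (10, 20, 55, 40)},
      {"text": "world", "bounding_box": (60, 20, 110, 40)}]),
]:
    ocr_results_with_words["text_line_" + str(text_line_number)] = {
        "line": text_line_number,
        "text": line_text,
        "bounding_box": bbox,
        "words": words,
    }
    text_line_number += 1

print(sorted(ocr_results_with_words))  # ['text_line_1']
```

This stable per-line numbering is what the word-level search feature uses to relate words back to their lines.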

  def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
      """
      Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
tools/cli_redact.py CHANGED
@@ -1,84 +1,164 @@
  import argparse
  import os
- from tools.config import get_or_create_env_var
- from tools.helper_functions import ensure_output_folder_exists,tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
  from tools.file_redaction import choose_and_run_redactor
- import pandas as pd
- from datetime import datetime
-
- chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
-     'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
-     'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
-     'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
-     'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE',
-     'UK_NATIONAL_HEALTH_SERVICE_NUMBER']
- chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS",
-     "STREETNAME", "UKPOSTCODE"]
-
- def main(first_loop_state=True, latest_file_completed=0, output_summary="", output_file_list=None,
-          log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
-          current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Redact all identified handwriting", "Redact all identified signatures"]):
-
-     if output_file_list is None:
-         output_file_list = []
-     if log_files_list is None:
-         log_files_list = []

-     parser = argparse.ArgumentParser(description='Redact PII from documents via command line')
-
-     # Required arguments
-     parser.add_argument('--input_file', help='Path to input file (PDF, JPG, or PNG)')
-
-     # Optional arguments with defaults matching the GUI app
-     parser.add_argument('--ocr_method', choices=[text_ocr_option, tesseract_ocr_option, textract_option],
-                         default='Quick image analysis', help='OCR method to use')
-     parser.add_argument('--pii_detector', choices=[local_pii_detector, aws_pii_detector],
-                         default='Local', help='PII detection method')
-     parser.add_argument('--page_min', type=int, default=0, help='First page to redact')
-     parser.add_argument('--page_max', type=int, default=0, help='Last page to redact')
-     parser.add_argument('--allow_list', help='Path to allow list CSV file')
-     parser.add_argument('--output_dir', default='output/', help='Output directory')

-     args = parser.parse_args()

-     # Ensure output directory exists
-     ensure_output_folder_exists()

-     # Create file object similar to what Gradio provides
-     file_obj = {"name": args.input_file}

-     # Load allow list if provided
-     allow_list_df = pd.DataFrame()
-     if args.allow_list:
-         allow_list_df = pd.read_csv(args.allow_list)

-     # Get file names
-     file_name_no_ext, file_name_with_ext, full_file_name = get_input_file_names(file_obj)

-     # Initialize empty states for PDF processing

-     # Prepare PDF/image
-     output_summary, prepared_pdf, images_pdf, max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations = prepare_image_or_pdf(
-         file_obj, args.ocr_method, allow_list_df, latest_file_completed,
-         output_summary, first_loop_state, args.page_max, current_loop_page, all_image_annotations
-     )
-
-     output_summary, output_files, output_file_list, latest_file_completed, log_files, \
-         log_files_list, estimated_time, textract_metadata, pdf_doc_state, all_image_annotations, \
-         current_loop_page, page_break, all_line_level_ocr_results, all_decision_process_table, \
-         comprehend_query_num = choose_and_run_redactor(
-         file_obj, prepared_pdf, images_pdf, "en", chosen_redact_entities,
-         chosen_comprehend_entities, args.ocr_method, allow_list_df,
-         latest_file_completed, output_summary, output_file_list, log_files_list,
-         first_loop_state, args.page_min, args.page_max, estimated_time,
-         handwrite_signature_checkbox, textract_metadata, all_image_annotations,
-         all_line_level_ocr_results, all_decision_process_table, pdf_doc_state,
-         current_loop_page, page_break, args.pii_detector, comprehend_query_num, args.output_dir
-     )

-     print(f"\nRedaction complete. Output file_list:\n{output_file_list}")
-     print(f"\nOutput files saved to: {args.output_dir}")

  if __name__ == "__main__":
-     main()
 
  import argparse
  import os
+ import pandas as pd
+ from tools.config import get_or_create_env_var, LOCAL_PII_OPTION, AWS_PII_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
+ from tools.helper_functions import ensure_output_folder_exists
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
  from tools.file_redaction import choose_and_run_redactor
+ from tools.anonymisation import anonymise_files_with_open_text

+ # --- Constants and Configuration ---
+ INPUT_FOLDER = 'input/'
+ OUTPUT_FOLDER = 'output/'
+ DEFAULT_LANGUAGE = 'en'

+ # Define entities for redaction
+ chosen_comprehend_entities = [
+     'BANK_ACCOUNT_NUMBER', 'BANK_ROUTING', 'CREDIT_DEBIT_NUMBER',
+     'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY', 'PIN', 'EMAIL', 'ADDRESS',
+     'NAME', 'PHONE', 'PASSPORT_NUMBER', 'DRIVER_ID', 'USERNAME', 'PASSWORD',
+     'IP_ADDRESS', 'MAC_ADDRESS', 'LICENSE_PLATE', 'VEHICLE_IDENTIFICATION_NUMBER',
+     'UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER',
+     'SWIFT_CODE', 'UK_NATIONAL_HEALTH_SERVICE_NUMBER'
+ ]
+ chosen_redact_entities = [
+     "TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"
+ ]
+
+ # --- Main CLI Function ---
+ def main():
+     """
+     A unified command-line interface to prepare, redact, and anonymise various document types.
+     """
+     parser = argparse.ArgumentParser(
+         description='A versatile CLI for redacting PII from PDF/image files and anonymising Word/tabular data.',
+         formatter_class=argparse.RawTextHelpFormatter
+     )

+     # --- General Arguments (apply to all file types) ---
+     general_group = parser.add_argument_group('General Options')
+     general_group.add_argument('--input_file', required=True, help='Path to the input file to process.')
+     general_group.add_argument('--output_dir', default=OUTPUT_FOLDER, help='Directory for all output files.')
+     general_group.add_argument('--language', default=DEFAULT_LANGUAGE, help='Language of the document content.')
+     general_group.add_argument('--allow_list', help='Path to a CSV file with words to exclude from redaction.')
+     general_group.add_argument('--pii_detector',
+                                choices=[LOCAL_PII_OPTION, AWS_PII_OPTION],
+                                default=LOCAL_PII_OPTION,
+                                help='Core PII detection method (Local or AWS).')
+     general_group.add_argument('--aws_access_key', default='', help='Your AWS Access Key ID.')
+     general_group.add_argument('--aws_secret_key', default='', help='Your AWS Secret Access Key.')

+     # --- PDF/Image Redaction Arguments ---
+     pdf_group = parser.add_argument_group('PDF/Image Redaction Options (.pdf, .png, .jpg)')
+     pdf_group.add_argument('--ocr_method',
+                            choices=[SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION],
+                            default=TESSERACT_TEXT_EXTRACT_OPTION,
+                            help='OCR method for text extraction from images.')
+     pdf_group.add_argument('--page_min', type=int, default=0, help='First page to redact.')
+     pdf_group.add_argument('--page_max', type=int, default=999, help='Last page to redact.')
+     pdf_group.add_argument('--prepare_for_review', action='store_true', help='Prepare files for reviewing redactions.')
+     pdf_group.add_argument('--no_images', action='store_false', dest='prepare_images', help='Disable image creation for PDF pages.')

+     # --- Word/Tabular Anonymisation Arguments ---
+     tabular_group = parser.add_argument_group('Word/Tabular Anonymisation Options (.docx, .csv, .xlsx)')
+     tabular_group.add_argument('--anon_strat', choices=['redact', 'encrypt', 'hash'], default='redact', help='The anonymisation strategy to apply.')
+     tabular_group.add_argument('--columns', nargs='+', default=[], help='A list of column names to anonymise in tabular data.')
+     tabular_group.add_argument('--excel_sheets', nargs='+', default=[], help='Specific Excel sheet names to process.')
+     tabular_group.add_argument('--deny_list', help='Path to a CSV file with specific terms/phrases to redact.')
+     tabular_group.add_argument('--fuzzy_mistakes', type=int, default=1, help='Number of allowed spelling mistakes for fuzzy matching.')

+     args = parser.parse_args()

+     # --- Initial Setup ---
+     ensure_output_folder_exists(args.output_dir)
+     _, file_extension = os.path.splitext(args.input_file)
+     file_extension = file_extension.lower()

+     # Load allow/deny lists
+     allow_list = pd.read_csv(args.allow_list) if args.allow_list else pd.DataFrame()
+     deny_list = pd.read_csv(args.deny_list).iloc[:, 0].tolist() if args.deny_list else []
+
+
+     # --- Route to the Correct Workflow Based on File Type ---
+
+     # Workflow 1: PDF/Image Redaction
+     if file_extension in ['.pdf', '.png', '.jpg', '.jpeg']:
+         print("--- Detected PDF/Image file. Starting Redaction Workflow... ---")
+         try:
+             # Step 1: Prepare the document
+             print("\nStep 1: Preparing document...")
+             (
+                 prep_summary, prepared_pdf_paths, image_file_paths, _, _, pdf_doc,
+                 image_annotations, _, original_cropboxes, page_sizes, textract_output_found, _, _, _, _
+             ) = prepare_image_or_pdf(
+                 file_paths=[args.input_file], text_extract_method=args.ocr_method,
+                 all_line_level_ocr_results_df=pd.DataFrame(), all_page_line_level_ocr_results_with_words_df=pd.DataFrame(),
+                 first_loop_state=True, prepare_for_review=args.prepare_for_review,
97
+ output_folder=args.output_dir, prepare_images=args.prepare_images
98
+ )
99
+ print(f"Preparation complete. {prep_summary}")
100
+
101
+ # Step 2: Redact the prepared document
102
+ print("\nStep 2: Running redaction...")
103
+ (
104
+ output_summary, output_files, _, _, log_files, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _
105
+ ) = choose_and_run_redactor(
106
+ file_paths=[args.input_file], prepared_pdf_file_paths=prepared_pdf_paths,
107
+ pdf_image_file_paths=image_file_paths, chosen_redact_entities=chosen_redact_entities,
108
+ chosen_redact_comprehend_entities=chosen_comprehend_entities, text_extraction_method=args.ocr_method,
109
+ in_allow_list=allow_list, first_loop_state=True, page_min=args.page_min, page_max=args.page_max,
110
+ pymupdf_doc=pdf_doc, annotations_all_pages=image_annotations, page_sizes=page_sizes,
111
+ document_cropboxes=original_cropboxes, pii_identification_method=args.pii_detector,
112
+ aws_access_key_textbox=args.aws_access_key, aws_secret_key_textbox=args.aws_secret_key,
113
+ language=args.language, output_folder=args.output_dir
114
+ )
115
+
116
+ print("\n--- Redaction Process Complete ---")
117
+ print(f"Summary: {output_summary}")
118
+ print(f"\nOutput files saved to: {args.output_dir}")
119
+ print("Generated Files:", sorted(output_files))
120
+ if log_files: print("Log Files:", sorted(log_files))
121
+
122
+ except Exception as e:
123
+ print(f"\nAn error occurred during the PDF/Image redaction workflow: {e}")
124
+
125
+ # Workflow 2: Word/Tabular Data Anonymisation
126
+ elif file_extension in ['.docx', '.xlsx', '.xls', '.csv', '.parquet']:
127
+ print("--- Detected Word/Tabular file. Starting Anonymisation Workflow... ---")
128
+ try:
129
+ # Run the anonymisation function directly
130
+ output_summary, output_files, _, _, log_files, _, _ = anonymise_files_with_open_text(
131
+ file_paths=[args.input_file],
132
+ in_text="", # Not used for file-based operations
133
+ anon_strat=args.anon_strat,
134
+ chosen_cols=args.columns,
135
+ chosen_redact_entities=chosen_redact_entities,
136
+ in_allow_list=allow_list,
137
+ in_excel_sheets=args.excel_sheets,
138
+ first_loop_state=True,
139
+ output_folder=args.output_dir,
140
+ in_deny_list=deny_list,
141
+ max_fuzzy_spelling_mistakes_num=args.fuzzy_mistakes,
142
+ pii_identification_method=args.pii_detector,
143
+ chosen_redact_comprehend_entities=chosen_comprehend_entities,
144
+ aws_access_key_textbox=args.aws_access_key,
145
+ aws_secret_key_textbox=args.aws_secret_key,
146
+ language=args.language
147
+ )
148
+
149
+ print("\n--- Anonymisation Process Complete ---")
150
+ print(f"Summary: {output_summary}")
151
+ print(f"\nOutput files saved to: {args.output_dir}")
152
+ print("Generated Files:", sorted(output_files))
153
+ if log_files: print("Log Files:", sorted(log_files))
154
 
155
+ except Exception as e:
156
+ print(f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}")
157
+
158
+ else:
159
+ print(f"Error: Unsupported file type '{file_extension}'.")
160
+ print("Supported types for redaction: .pdf, .png, .jpg, .jpeg")
161
+ print("Supported types for anonymisation: .docx, .xlsx, .xls, .csv, .parquet")
162
 
163
  if __name__ == "__main__":
164
+ main()
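The argument surface added above can be exercised in isolation. Below is a minimal sketch of the same argparse pattern; the literal choice strings stand in for the repo's constants (`LOCAL_PII_OPTION`, `TESSERACT_TEXT_EXTRACT_OPTION`, etc.), so treat the values as illustrative assumptions, not the app's real option strings.

```python
import argparse

# Sketch of the CLI surface added in this PR, using placeholder choice strings.
parser = argparse.ArgumentParser(description='PII redaction/anonymisation CLI sketch')

general = parser.add_argument_group('General Options')
general.add_argument('--input_file', required=True, help='Path to the input file to process.')
general.add_argument('--output_dir', default='output/', help='Directory for all output files.')
general.add_argument('--language', default='en', help='Language of the document content.')
general.add_argument('--pii_detector', choices=['Local', 'AWS Comprehend'], default='Local')

pdf = parser.add_argument_group('PDF/Image Redaction Options')
pdf.add_argument('--page_min', type=int, default=0)
pdf.add_argument('--page_max', type=int, default=999)
# store_false with dest= means passing --no_images sets args.prepare_images to False,
# while omitting the flag leaves it at the default True.
pdf.add_argument('--no_images', action='store_false', dest='prepare_images')

args = parser.parse_args(['--input_file', 'report.pdf', '--page_max', '10', '--no_images'])
print(args.input_file, args.page_max, args.prepare_images)  # → report.pdf 10 False
```

The `--no_images` flag is the one non-obvious piece: it is a negative switch over a positive destination, which is why the diff pairs `action='store_false'` with `dest='prepare_images'`.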
tools/config.py CHANGED
@@ -154,10 +154,9 @@ if USE_LOG_SUBFOLDERS == "True":
     ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
     USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder

-S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', FEEDBACK_LOGS_FOLDER)
-S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', ACCESS_LOGS_FOLDER)
-S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
+S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
+S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
+S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)

 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
@@ -197,6 +196,7 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FIL

 # Create Tesseract and Poppler folders if you have installed them locally
 TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+TESSERACT_DATA_FOLDER = get_or_create_env_var('TESSERACT_DATA_FOLDER', "/usr/share/tessdata")
 POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows, install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/

 if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
@@ -266,6 +266,11 @@ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
 if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
     TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)

+### Local OCR model - Tesseract vs PaddleOCR
+CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var('CHOSEN_LOCAL_OCR_MODEL', "tesseract") # Choose between "tesseract", "hybrid", and "paddle". "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction. "hybrid" is a combination of the two - a first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence.
+
+PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var('PREPROCESS_LOCAL_OCR_IMAGES', "False") # Whether to try to preprocess images before extracting text. NOTE: I have found in testing that this often results in WORSE results for scanned pages, so it is default False
+
 # Entities for redaction
 CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
@@ -284,7 +289,26 @@ MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')

 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour

-REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
+### Language selection options
+
+SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
+
+DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var("DEFAULT_LANGUAGE_FULL_NAME", "english")
+DEFAULT_LANGUAGE = get_or_create_env_var("DEFAULT_LANGUAGE", "en") # For Tesseract, ensure the Tesseract language data (e.g. fra.traineddata) is installed on your system. You can find the relevant language packs here: https://github.com/tesseract-ocr/tessdata.
+# For Paddle, ensure the relevant PaddleOCR language model is installed on your system. You can find information on supported languages here: https://www.paddleocr.ai/main/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html
+# For AWS Comprehend, only English and Spanish are supported: https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html ['en', 'es']
+# AWS Textract automatically detects the language of the document and supports the following languages: https://aws.amazon.com/textract/faqs/#topic-0. 'English, Spanish, Italian, Portuguese, French, German. Handwriting, Invoices and Receipts, Identity documents and Queries processing are in English only'
+
+textract_language_choices = get_or_create_env_var("textract_language_choices", "['en', 'es', 'fr', 'de', 'it', 'pt']")
+aws_comprehend_language_choices = get_or_create_env_var("aws_comprehend_language_choices", "['en', 'es']")
+
+# The choices that the user sees
+MAPPED_LANGUAGE_CHOICES = get_or_create_env_var("MAPPED_LANGUAGE_CHOICES", "['english', 'french', 'german', 'spanish', 'italian', 'dutch', 'portuguese', 'chinese', 'japanese', 'korean', 'lithuanian', 'macedonian', 'norwegian_bokmaal', 'polish', 'romanian', 'russian', 'slovenian', 'swedish', 'catalan', 'ukrainian']")
+LANGUAGE_CHOICES = get_or_create_env_var("LANGUAGE_CHOICES", "['en', 'fr', 'de', 'es', 'it', 'nl', 'pt', 'zh', 'ja', 'ko', 'lt', 'mk', 'nb', 'pl', 'ro', 'ru', 'sl', 'sv', 'ca', 'uk']")
+
+### File output options

 RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
 
tools/custom_image_analyser_engine.py CHANGED
The diff for this file is too large to render. See raw diff
 
tools/data_anonymise.py CHANGED
@@ -6,16 +6,19 @@ import time
 import boto3
 import botocore
 import pandas as pd
 from openpyxl import Workbook
 from faker import Faker
 from gradio import Progress
-from typing import List, Dict, Any
 from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
-from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
 from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
 # Use custom version of analyze_dict to be able to track progress
 from tools.presidio_analyzer_custom import analyze_dict

@@ -46,7 +49,7 @@ def initial_clean(text:str) -> str:
     return text

 def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
-    output = []

     if hasattr(result, 'value'):
         text = result.value[data_row]

@@ -86,7 +89,7 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
     Returns:
         str: A string containing the detailed decision process output.
     """
-    decision_process_output = []
     keys_to_keep = ['entity_type', 'start', 'end']

     # Run through each column to analyse for PII
@@ -115,22 +118,16 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
     # ## Pick out common names and replace them with the same person value
     df_dict = df.to_dict(orient="list")

-    analyzer = AnalyzerEngine()
-    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

-    analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
     analyzer_results = list(analyzer_results)

-    # + tags=[]
     text = analyzer_results[3].value

-    # + tags=[]
     recognizer_result = str(analyzer_results[3].recognizer_results)

-    # + tags=[]
-    recognizer_result
-
-    # + tags=[]
     data_str = recognizer_result # abbreviated for brevity

     # Adjusting the parse_dict function to handle trailing ']'
@@ -153,7 +150,7 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:

     # Re-running the improved processing code

-    result = []

     for lst_str in list_strs:
         # Splitting each list string into individual dictionary strings
@@ -164,73 +161,156 @@ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
         dicts = [parse_dict(d) for d in dict_strs]
         result.append(dicts)

-    #result
-
-    # + tags=[]
-    names = []

     for idx, paragraph in enumerate(text):
-        paragraph_texts = []
         for dictionary in result[idx]:
             if dictionary['type'] == 'PERSON':
                 paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
         names.append(paragraph_texts)

-    # + tags=[]
     # Flatten the list of lists and extract unique names
     unique_names = list(set(name for sublist in names for name in sublist))

-    # + tags=[]
     fake_names = pd.Series(unique_names).apply(fake_first_name)

-    # + tags=[]
     mapping_df = pd.DataFrame(data={"Unique names":unique_names,
                                     "Fake names": fake_names})

-    # + tags=[]
-    # Convert mapping dataframe to dictionary
     # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
     name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}

-    # + tags=[]
     name_map

-    # + tags=[]
     scrubbed_df_consistent_names = df.replace(name_map, regex = True)

-    # + tags=[]
     scrubbed_df_consistent_names

     return scrubbed_df_consistent_names

-def anonymise_data_files(file_paths: List[str],
                          in_text: str,
                          anon_strat: str,
                          chosen_cols: List[str],
-                         language: str,
                          chosen_redact_entities: List[str],
                          in_allow_list: List[str] = None,
                          latest_file_completed: int = 0,
-                         out_message: list = [],
-                         out_file_paths: list = [],
-                         log_files_output_paths: list = [],
-                         in_excel_sheets: list = [],
                          first_loop_state: bool = False,
                          output_folder: str = OUTPUT_FOLDER,
-                         in_deny_list:list[str]=[],
                          max_fuzzy_spelling_mistakes_num:int=0,
                          pii_identification_method:str="Local",
-                         chosen_redact_comprehend_entities:List[str]=[],
                          comprehend_query_number:int=0,
                          aws_access_key_textbox:str='',
                          aws_secret_key_textbox:str='',
                          actual_time_taken_number:float=0,
                          progress: Progress = Progress(track_tqdm=True)):
     """
     This function anonymises data files based on the provided parameters.

     Parameters:
-    - file_paths (List[str]): A list of file paths to anonymise.
     - in_text (str): The text to anonymise if file_paths is 'open_text'.
     - anon_strat (str): The anonymisation strategy to use.
     - chosen_cols (List[str]): A list of column names to anonymise.
@@ -252,17 +332,26 @@ def anonymise_data_files(file_paths: List[str],
     - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
     - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
     - actual_time_taken_number (float, optional): Time taken to do the redaction.
     - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
     """

     tic = time.perf_counter()
     comprehend_client = ""

     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
-        out_message = []
-        out_file_paths = []

     # Load file
     # If out message or out_file_paths are blank, change to a list so it can be appended to
@@ -272,23 +361,23 @@ def anonymise_data_files(file_paths: List[str],
     #print("log_files_output_paths:",log_files_output_paths)

     if isinstance(log_files_output_paths, str):
-        log_files_output_paths = []

     if not out_file_paths:
-        out_file_paths = []

     if isinstance(in_allow_list, list):
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
-            in_allow_list_flat = []
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
-            in_allow_list_flat = []
     else:
-        in_allow_list_flat = []

     anon_df = pd.DataFrame()

@@ -342,15 +431,37 @@ def anonymise_data_files(file_paths: List[str],
             sheet_name = ""
             file_type = ""

-            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=OUTPUT_FOLDER)
         else:
             # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
             file_type = detect_file_type(anon_file)
             print("File type is:", file_type)

             out_file_part = get_file_name_without_type(anon_file.name)

-            if file_type == 'xlsx':
                 print("Running through all xlsx sheets")
                 #anon_xlsx = pd.ExcelFile(anon_file)
                 if not in_excel_sheets:
@@ -371,14 +482,14 @@ def anonymise_data_files(file_paths: List[str],
                     anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)

-                    out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
             else:
                 sheet_name = ""
                 anon_df = read_file(anon_file)
                 out_file_part = get_file_name_without_type(anon_file.name)

-                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)

         # Increase latest file completed count unless we are at the last file
         if latest_file_completed != len(file_paths):
@@ -392,6 +503,9 @@ def anonymise_data_files(file_paths: List[str],
             actual_time_taken_number += out_time_float

             out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")

     out_message_out = '\n'.join(out_message)
@@ -406,7 +520,7 @@ def anonymise_data_files(file_paths: List[str],
     return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number

-def anon_wrapper_func(
     anon_file: str,
     anon_df: pd.DataFrame,
     chosen_cols: List[str],
@@ -415,18 +529,20 @@ def anon_wrapper_func(
     out_message: str,
     excel_sheet_name: str,
     anon_strat: str,
-    language: str,
     chosen_redact_entities: List[str],
     in_allow_list: List[str],
     file_type: str,
     anon_xlsx_export_file_name: str,
     log_files_output_paths: List[str],
-    in_deny_list: List[str]=[],
     max_fuzzy_spelling_mistakes_num:int=0,
     pii_identification_method:str="Local",
-    chosen_redact_comprehend_entities:List[str]=[],
     comprehend_query_number:int=0,
     comprehend_client:botocore.client.BaseClient="",
     output_folder: str = OUTPUT_FOLDER
 ):
     """
@@ -469,7 +585,7 @@ def anon_wrapper_func(
         Returns:
         A list containing the common strings.
         """
-        common_strings = []
         for string in list1:
             if string in list2:
                 common_strings.append(string)
@@ -485,7 +601,9 @@ def anon_wrapper_func(
     if any_cols_found == False:
         out_message = "No chosen columns found in dataframe: " + out_file_part
         print(out_message)
     else:
         chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)
@@ -495,9 +613,12 @@ def anon_wrapper_func(
         anon_df_part = anon_df[chosen_cols_in_anon_df]
         anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)

-        # Anonymise the selected columns
-        anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client)

         # Rejoin the dataframe together
         anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
         anon_df_out = anon_df_out[all_cols_original_order]
@@ -531,7 +652,7 @@ def anon_wrapper_func(
     else:
         anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
-        anon_df_out.to_csv(anon_export_file_name, index = None)

     decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
     with open(decision_process_log_output_file, "w") as f:
@@ -553,14 +674,15 @@ def anonymise_script(df:pd.DataFrame,
     anon_strat:str,
     language:str,
     chosen_redact_entities:List[str],
-    in_allow_list:List[str]=[],
-    in_deny_list:List[str]=[],
     max_fuzzy_spelling_mistakes_num:int=0,
     pii_identification_method:str="Local",
-    chosen_redact_comprehend_entities:List[str]=[],
     comprehend_query_number:int=0,
-    comprehend_client:botocore.client.BaseClient="",
     custom_entities:List[str]=custom_entities,
     progress:Progress=Progress(track_tqdm=False)):
     '''
     Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
@@ -580,21 +702,43 @@ def anonymise_script(df:pd.DataFrame,
         if in_allow_list:
             in_allow_list_flat = in_allow_list
         else:
-            in_allow_list_flat = []
     elif isinstance(in_allow_list, pd.DataFrame):
         if not in_allow_list.empty:
             in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
         else:
-            in_allow_list_flat = []
     else:
-        in_allow_list_flat = []

     if isinstance(in_deny_list, pd.DataFrame):
         if not in_deny_list.empty:
             in_deny_list = in_deny_list.iloc[:, 0].tolist()
         else:
             # Handle the case where the DataFrame is empty
-            in_deny_list = [] # or some default value

     # Sort the strings in order from the longest string to the shortest
     in_deny_list = sorted(in_deny_list, key=len, reverse=True)
@@ -612,7 +756,7 @@ def anonymise_script(df:pd.DataFrame,
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
     anonymizer = AnonymizerEngine() #conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
-    analyzer_results = []

     if pii_identification_method == "Local":
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
+ import docx
10
+ import gradio as gr
11
  from openpyxl import Workbook
12
  from faker import Faker
13
  from gradio import Progress
14
+ from typing import List, Dict, Any, Optional
15
+ from botocore.client import BaseClient
16
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
17
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
18
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
19
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER, DEFAULT_LANGUAGE, aws_comprehend_language_choices
20
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
21
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities, create_nlp_analyser, load_spacy_model
22
  # Use custom version of analyze_dict to be able to track progress
23
  from tools.presidio_analyzer_custom import analyze_dict
24
 
 
49
  return text
50
 
51
  def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
52
+ output = list()
53
 
54
  if hasattr(result, 'value'):
55
  text = result.value[data_row]
 
  Returns:
  str: A string containing the detailed decision process output.
  """
+ decision_process_output = list()
  keys_to_keep = ['entity_type', 'start', 'end']

  # Run through each column to analyse for PII

  # ## Pick out common names and replace them with the same person value
  df_dict = df.to_dict(orient="list")

+ #analyzer = AnalyzerEngine()
+ batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)

+ analyzer_results = batch_analyzer.analyze_dict(df_dict, language=DEFAULT_LANGUAGE)
  analyzer_results = list(analyzer_results)

  text = analyzer_results[3].value

  recognizer_result = str(analyzer_results[3].recognizer_results)

  data_str = recognizer_result # abbreviated for brevity

  # Adjusting the parse_dict function to handle trailing ']'

  # Re-running the improved processing code

+ result = list()

  for lst_str in list_strs:
  # Splitting each list string into individual dictionary strings

  dicts = [parse_dict(d) for d in dict_strs]
  result.append(dicts)

+ names = list()

  for idx, paragraph in enumerate(text):
+ paragraph_texts = list()
  for dictionary in result[idx]:
  if dictionary['type'] == 'PERSON':
  paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
  names.append(paragraph_texts)

  # Flatten the list of lists and extract unique names
  unique_names = list(set(name for sublist in names for name in sublist))

  fake_names = pd.Series(unique_names).apply(fake_first_name)

  mapping_df = pd.DataFrame(data={"Unique names":unique_names,
  "Fake names": fake_names})

  # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
  name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}

  name_map

  scrubbed_df_consistent_names = df.replace(name_map, regex = True)

  scrubbed_df_consistent_names

  return scrubbed_df_consistent_names

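The `name_map` built above wraps each detected name in `\b` word boundaries so only whole-word matches are replaced. A minimal stdlib sketch of that pattern (the names and aliases here are made up for illustration):

```python
import re

# Hypothetical mapping from detected names to consistent fake names
name_map = {"Tom": "ALIAS_A", "Anna": "ALIAS_B"}

# Add word boundaries so "Tom" matches, but "Tomato" does not
pattern_map = {r'\b' + re.escape(k) + r'\b': v for k, v in name_map.items()}

def scrub(text: str) -> str:
    # Apply every pattern; each detected name maps to the same alias everywhere
    for pattern, fake in pattern_map.items():
        text = re.sub(pattern, fake, text)
    return text

print(scrub("Tom met Anna. Tomato soup was Tom's favourite."))
# ALIAS_A met ALIAS_B. Tomato soup was ALIAS_A's favourite.
```

Because the same map is applied across the whole dataframe, every occurrence of a given name receives the same pseudonym, which preserves cross-row consistency.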
+ def handle_docx_anonymisation(
+ file_path: str,
+ output_folder: str,
+ anon_strat: str,
+ chosen_redact_entities: List[str],
+ in_allow_list: List[str],
+ in_deny_list: List[str],
+ max_fuzzy_spelling_mistakes_num: int,
+ pii_identification_method: str,
+ chosen_redact_comprehend_entities: List[str],
+ comprehend_query_number: int,
+ comprehend_client: BaseClient,
+ language: Optional[str] = DEFAULT_LANGUAGE,
+ nlp_analyser: AnalyzerEngine = nlp_analyser
+ ):
+ """
+ Anonymises a .docx file by extracting text, processing it, and re-inserting it.
+
+ Returns:
+ A tuple containing the output .docx path, the log file path, and the output .csv path.
+ """
+
+ # 1. Load the document and extract text elements
+ doc = docx.Document(file_path)
+ text_elements = list() # This will store the actual docx objects (paragraphs, cells)
+ original_texts = list() # This will store the text from those objects
+
+ # Extract from paragraphs
+ for para in doc.paragraphs:
+ if para.text.strip(): # Only process non-empty paragraphs
+ text_elements.append(para)
+ original_texts.append(para.text)
+
+ # Extract from tables
+ for table in doc.tables:
+ for row in table.rows:
+ for cell in row.cells:
+ if cell.text.strip(): # Only process non-empty cells
+ text_elements.append(cell)
+ original_texts.append(cell.text)
+
+ # If there's no text to process, return early
+ if not original_texts:
+ print(f"No text found in {file_path}. Skipping.")
+ return None, None, None
+
+ # 2. Convert to a DataFrame for the existing anonymisation script
+ df_to_anonymise = pd.DataFrame({'text_to_redact': original_texts})
+
+ # 3. Call the core anonymisation script
+ anonymised_df, _, decision_log = anonymise_script(
+ df=df_to_anonymise,
+ anon_strat=anon_strat,
+ language=language,
+ chosen_redact_entities=chosen_redact_entities,
+ in_allow_list=in_allow_list,
+ in_deny_list=in_deny_list,
+ max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
+ pii_identification_method=pii_identification_method,
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
+ comprehend_query_number=comprehend_query_number,
+ comprehend_client=comprehend_client,
+ nlp_analyser=nlp_analyser
+ )
+
+ anonymised_texts = anonymised_df['text_to_redact'].tolist()
+
+ # 4. Re-insert the anonymised text back into the document objects
+ for element, new_text in zip(text_elements, anonymised_texts):
+ if isinstance(element, docx.text.paragraph.Paragraph):
+ # Clear existing content (runs) and add the new text in a single new run
+ element.clear()
+ element.add_run(new_text)
+ elif isinstance(element, docx.table._Cell):
+ # For cells, setting .text works similarly
+ element.text = new_text
+
+ # 5. Save the redacted document and the log file
+ base_name = os.path.basename(file_path)
+ file_name_without_ext = os.path.splitext(base_name)[0]
+
+ output_docx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.docx")
+ log_file_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted_log.txt")
+
+ output_xlsx_path = os.path.join(output_folder, f"{file_name_without_ext}_redacted.csv")
+
+ anonymised_df.to_csv(output_xlsx_path, encoding="utf-8-sig")
+ doc.save(output_docx_path)
+
+ with open(log_file_path, "w", encoding="utf-8-sig") as f:
+ f.write(decision_log)
+
+ return output_docx_path, log_file_path, output_xlsx_path
+
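The function above follows an extract → anonymise → re-insert pattern: the docx objects and their texts are collected in two parallel lists, so that after batch processing, `zip` pairs each new text back with its source object. A library-free sketch of the same idea (the `Para` class is a stand-in for python-docx paragraph objects, and the redaction step is a caller-supplied function):

```python
class Para:
    """Stand-in for a docx paragraph: holds a single text attribute."""
    def __init__(self, text: str):
        self.text = text

def anonymise_elements(elements, redact):
    # 1. Extract text from non-empty elements, keeping object/text lists aligned
    targets = [e for e in elements if e.text.strip()]
    original_texts = [e.text for e in targets]
    # 2./3. Process the texts in one batch
    new_texts = [redact(t) for t in original_texts]
    # 4. Re-insert, relying on the two lists sharing one order
    for element, new_text in zip(targets, new_texts):
        element.text = new_text
    return elements

doc = [Para("Call Tom"), Para("   "), Para("No PII here")]
anonymise_elements(doc, lambda t: t.replace("Tom", "REDACTED"))
print([p.text for p in doc])  # ['Call REDACTED', '   ', 'No PII here']
```

The key invariant is that skipped (whitespace-only) elements are excluded from both lists, so indices never drift between extraction and re-insertion.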
+ def anonymise_files_with_open_text(file_paths: List[str],
  in_text: str,
  anon_strat: str,
  chosen_cols: List[str],
  chosen_redact_entities: List[str],
  in_allow_list: List[str] = None,
  latest_file_completed: int = 0,
+ out_message: list = list(),
+ out_file_paths: list = list(),
+ log_files_output_paths: list = list(),
+ in_excel_sheets: list = list(),
  first_loop_state: bool = False,
  output_folder: str = OUTPUT_FOLDER,
+ in_deny_list:list[str]=list(),
  max_fuzzy_spelling_mistakes_num:int=0,
  pii_identification_method:str="Local",
+ chosen_redact_comprehend_entities:List[str]=list(),
  comprehend_query_number:int=0,
  aws_access_key_textbox:str='',
  aws_secret_key_textbox:str='',
  actual_time_taken_number:float=0,
+ language: Optional[str] = None,
  progress: Progress = Progress(track_tqdm=True)):
  """
  This function anonymises data files based on the provided parameters.

  Parameters:
+ - file_paths (List[str]): A list of file paths to anonymise: '.xlsx', '.xls', '.csv', '.parquet', or '.docx'.
  - in_text (str): The text to anonymise if file_paths is 'open_text'.
  - anon_strat (str): The anonymisation strategy to use.
  - chosen_cols (List[str]): A list of column names to anonymise.

  - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
  - actual_time_taken_number (float, optional): Time taken to do the redaction.
+ - language (str, optional): The language of the text to anonymise.
  - progress (Progress, optional): A Progress object to track progress. Defaults to a Progress object with track_tqdm=True.
  """

  tic = time.perf_counter()
  comprehend_client = ""
+
+ # Use provided language or default
+ language = language or DEFAULT_LANGUAGE
+
+ if pii_identification_method == "AWS Comprehend":
+ if language not in aws_comprehend_language_choices:
+ out_message = f"Please note that this language is not supported by AWS Comprehend: {language}"
+ raise Warning(out_message)

  # If this is the first time around, set variables to 0/blank
  if first_loop_state==True:
  latest_file_completed = 0
+ out_message = list()
+ out_file_paths = list()

  # Load file
  # If out message or out_file_paths are blank, change to a list so it can be appended to

  #print("log_files_output_paths:",log_files_output_paths)

  if isinstance(log_files_output_paths, str):
+ log_files_output_paths = list()

  if not out_file_paths:
+ out_file_paths = list()

  if isinstance(in_allow_list, list):
  if in_allow_list:
  in_allow_list_flat = in_allow_list
  else:
+ in_allow_list_flat = list()
  elif isinstance(in_allow_list, pd.DataFrame):
  if not in_allow_list.empty:
  in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
  else:
+ in_allow_list_flat = list()
  else:
+ in_allow_list_flat = list()

  anon_df = pd.DataFrame()

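A general Python note on the `= list()` defaults introduced in this signature: like `= []`, the default is evaluated once at function-definition time, so the same list object is shared across calls that rely on the default (here the Gradio wiring passes these arguments explicitly, so behaviour is unaffected in practice). The usual way to get a fresh list per call is a `None` sentinel; a sketch:

```python
def append_bad(item, acc=list()):
    # The default list is created once, at function definition time
    acc.append(item)
    return acc

def append_good(item, acc=None):
    # A None sentinel gives every call its own fresh list
    if acc is None:
        acc = list()
    acc.append(item)
    return acc

print(append_bad(1), append_bad(2))    # both names point at the shared list
print(append_good(1), append_good(2))  # independent lists: [1] [2]
```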
  sheet_name = ""
  file_type = ""

+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)
  else:
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
  file_type = detect_file_type(anon_file)
  print("File type is:", file_type)

  out_file_part = get_file_name_without_type(anon_file.name)
+
+ if file_type == 'docx':
+ output_path, log_path, output_xlsx_path = handle_docx_anonymisation(
+ file_path=anon_file.name, # .name if it's a temp file object
+ output_folder=output_folder,
+ anon_strat=anon_strat,
+ chosen_redact_entities=chosen_redact_entities,
+ in_allow_list=in_allow_list_flat,
+ in_deny_list=in_deny_list,
+ max_fuzzy_spelling_mistakes_num=max_fuzzy_spelling_mistakes_num,
+ pii_identification_method=pii_identification_method,
+ chosen_redact_comprehend_entities=chosen_redact_comprehend_entities,
+ comprehend_query_number=comprehend_query_number,
+ comprehend_client=comprehend_client,
+ language=language
+ )
+ if output_path:
+ out_file_paths.append(output_path)
+ if output_xlsx_path:
+ out_file_paths.append(output_xlsx_path)
+ if log_path:
+ log_files_output_paths.append(log_path)

+ elif file_type == 'xlsx':
  print("Running through all xlsx sheets")
  #anon_xlsx = pd.ExcelFile(anon_file)
  if not in_excel_sheets:

  anon_df = pd.read_excel(anon_file, sheet_name=sheet_name)

+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)

  else:
  sheet_name = ""
  anon_df = read_file(anon_file)
  out_file_part = get_file_name_without_type(anon_file.name)

+ out_file_paths, out_message, key_string, log_files_output_paths = tabular_anonymise_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, language, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, output_folder=output_folder)

  # Increase latest file completed count unless we are at the last file
  if latest_file_completed != len(file_paths):

  actual_time_taken_number += out_time_float

+ if isinstance(out_message, str):
+ out_message = [out_message]
+
  out_message.append("Anonymisation of file '" + out_file_part + "' successfully completed in")

  out_message_out = '\n'.join(out_message)

  return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, actual_time_taken_number

+ def tabular_anonymise_wrapper_func(
  anon_file: str,
  anon_df: pd.DataFrame,
  chosen_cols: List[str],

  out_message: str,
  excel_sheet_name: str,
  anon_strat: str,
+ language: str,
  chosen_redact_entities: List[str],
  in_allow_list: List[str],
  file_type: str,
  anon_xlsx_export_file_name: str,
  log_files_output_paths: List[str],
+ in_deny_list: List[str]=list(),
  max_fuzzy_spelling_mistakes_num:int=0,
  pii_identification_method:str="Local",
+ comprehend_language: Optional[str] = None,
+ chosen_redact_comprehend_entities:List[str]=list(),
  comprehend_query_number:int=0,
  comprehend_client:botocore.client.BaseClient="",
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
  output_folder: str = OUTPUT_FOLDER
  ):
  """

  Returns:
  A list containing the common strings.
  """
+ common_strings = list()
  for string in list1:
  if string in list2:
  common_strings.append(string)

  if any_cols_found == False:
  out_message = "No chosen columns found in dataframe: " + out_file_part
+ key_string = ""
  print(out_message)
+ return out_file_paths, out_message, key_string, log_files_output_paths
  else:
  chosen_cols_in_anon_df = get_common_strings(chosen_cols, all_cols_original_order)

  anon_df_part = anon_df[chosen_cols_in_anon_df]
  anon_df_remain = anon_df.drop(chosen_cols_in_anon_df, axis = 1)

+ # Anonymise the selected columns
+ anon_df_part_out, key_string, decision_process_output_str = anonymise_script(anon_df_part, anon_strat, language, chosen_redact_entities, in_allow_list, in_deny_list, max_fuzzy_spelling_mistakes_num, pii_identification_method, chosen_redact_comprehend_entities, comprehend_query_number, comprehend_client, nlp_analyser=nlp_analyser)
+
+ anon_df_part_out.replace("^nan$", "", regex=True, inplace=True)
+
  # Rejoin the dataframe together
  anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis = 1)
  anon_df_out = anon_df_out[all_cols_original_order]

  else:
  anon_export_file_name = output_folder + out_file_part + "_anon_" + anon_strat_txt + ".csv"
+ anon_df_out.to_csv(anon_export_file_name, index = None, encoding="utf-8-sig")

  decision_process_log_output_file = anon_export_file_name + "_decision_process_output.txt"
  with open(decision_process_log_output_file, "w") as f:

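`get_common_strings` above keeps only the chosen columns that actually exist in the dataframe, preserving the order of the first list. A compact equivalent, with a set for O(1) membership checks:

```python
def get_common_strings(list1, list2):
    # Keep items of list1 that also appear in list2, preserving list1's order
    lookup = set(list2)
    return [s for s in list1 if s in lookup]

cols = get_common_strings(["Email", "Name", "Missing"], ["Name", "Email", "Age"])
print(cols)  # ['Email', 'Name']
```

Preserving the caller's ordering matters here because the selected columns are later recombined with the untouched remainder and reindexed to `all_cols_original_order`.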
  anon_strat:str,
  language:str,
  chosen_redact_entities:List[str],
+ in_allow_list:List[str]=list(),
+ in_deny_list:List[str]=list(),
  max_fuzzy_spelling_mistakes_num:int=0,
  pii_identification_method:str="Local",
+ chosen_redact_comprehend_entities:List[str]=list(),
  comprehend_query_number:int=0,
+ comprehend_client:botocore.client.BaseClient="",
  custom_entities:List[str]=custom_entities,
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
  progress:Progress=Progress(track_tqdm=False)):
  '''
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.

  if in_allow_list:
  in_allow_list_flat = in_allow_list
  else:
+ in_allow_list_flat = list()
  elif isinstance(in_allow_list, pd.DataFrame):
  if not in_allow_list.empty:
  in_allow_list_flat = list(in_allow_list.iloc[:, 0].unique())
  else:
+ in_allow_list_flat = list()
  else:
+ in_allow_list_flat = list()
+
+ ### Language check - check if selected language packs exist
+ try:
+ if language != "en":
+ progress(0.1, desc=f"Loading SpaCy model for {language}")
+
+ load_spacy_model(language)
+
+ except Exception as e:
+ print(f"Error downloading language packs for {language}: {e}")
+ raise Exception(f"Error downloading language packs for {language}: {e}")
+
+ # Try updating the supported languages for the spacy analyser
+ try:
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+ # Check list of nlp_analyser recognisers and languages
+ if language != "en":
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
+
+ except Exception as e:
+ print(f"Error creating nlp_analyser for {language}: {e}")
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")

  if isinstance(in_deny_list, pd.DataFrame):
  if not in_deny_list.empty:
  in_deny_list = in_deny_list.iloc[:, 0].tolist()
  else:
  # Handle the case where the DataFrame is empty
+ in_deny_list = list() # or some default value

  # Sort the strings in order from the longest string to the shortest
  in_deny_list = sorted(in_deny_list, key=len, reverse=True)

  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=nlp_analyser)
  anonymizer = AnonymizerEngine()#conflict_resolution=ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED)
  batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine = anonymizer)
+ analyzer_results = list()

  if pii_identification_method == "Local":

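Sorting the deny list longest-first (as above) matters when entries overlap: if a short term were tried before a longer phrase containing it, the shorter match would consume part of the longer one. A sketch with made-up terms:

```python
import re

deny_list = ["Project", "Project Falcon"]
# Longest first, so multi-word terms are matched before their prefixes
ordered = sorted(deny_list, key=len, reverse=True)
pattern = re.compile("|".join(re.escape(t) for t in ordered))

text = "Project Falcon reports to Project leads."
print(pattern.sub("REDACTED", text))  # REDACTED reports to REDACTED leads.
```

Regex alternation tries branches left to right, so placing `Project Falcon` before `Project` guarantees the full phrase is redacted as one unit.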
tools/example_cli_calls.txt ADDED
@@ -0,0 +1,24 @@
+ python cli_redact.py --help
+
+ python cli_redact.py \
+ --input_file "documents/confidential-report.pdf" \
+ --output_dir "output/redacted_reports/" \
+ --ocr_method "Local OCR model - PDFs without selectable text" \
+ --pii_detector "Local" \
+ --page_min 2 \
+ --page_max 10 \
+ --allow_list "config/project_allowlist.csv"
+
+ python your_cli_script.py \
+ --input_file "data/customer_data.xlsx" \
+ --output_dir "output/anonymised_data/" \
+ --anon_strat "redact" \
+ --columns "Customer Name" "Email" \
+ --excel_sheets "Q3-Data"
+
+ python your_cli_script.py \
+ --input_file "legal_docs/legal_agreement.docx" \
+ --output_dir "output/anonymised_docs/" \
+ --anon_strat "encrypt" \
+ --deny_list "config/codenames.csv" \
+ --language "en"
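For orientation, a hypothetical `argparse` skeleton that would accept the flags shown in these examples — the flag names and defaults here are inferred from the calls above, not taken from the actual `cli_redact.py`, which may define its interface differently:

```python
import argparse

# Hypothetical parser mirroring the example invocations above
parser = argparse.ArgumentParser(description="Redact or anonymise documents")
parser.add_argument("--input_file", required=True)
parser.add_argument("--output_dir", default="output/")
parser.add_argument("--anon_strat", default="redact")
parser.add_argument("--columns", nargs="+", default=[])   # multi-value flag
parser.add_argument("--deny_list", default=None)
parser.add_argument("--language", default="en")

args = parser.parse_args([
    "--input_file", "data/customer_data.xlsx",
    "--columns", "Customer Name", "Email",
])
print(args.input_file, args.columns)
```

Note `nargs="+"` is what allows `--columns "Customer Name" "Email"` to collect multiple quoted values into one list.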
tools/file_conversion.py CHANGED
@@ -4,6 +4,7 @@ import os
  import re
  import time
  import json

  import numpy as np
  import pymupdf
  from pymupdf import Document, Page, Rect
@@ -71,7 +72,7 @@ def check_image_size_and_reduce(out_path:str, image:Image):
  Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
  '''

- all_img_details = []
  page_num = 0

  # Check file size and resize if necessary
@@ -133,6 +134,8 @@ def process_single_page_for_image_conversion(pdf_path:str, page_num:int, image_d
  elif pdf_path.lower().endswith(".jpg") or pdf_path.lower().endswith(".png") or pdf_path.lower().endswith(".jpeg"):
  image = Image.open(pdf_path)
  image.save(out_path, format="PNG")

  width, height = image.size

@@ -143,6 +146,7 @@ def process_single_page_for_image_conversion(pdf_path:str, page_num:int, image_d
  return page_num, out_path, width, height

  except Exception as e:
  print(f"Error processing page {page_num + 1}: {e}")
  return page_num, out_path_placeholder, pd.NA, pd.NA
  else:
@@ -159,14 +163,14 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
  else:
  page_count = pdfinfo_from_path(pdf_path)['Pages']

- print(f"Number of pages in PDF: {page_count}")

  # Set page max to length of pdf if not specified
  if page_max == 0: page_max = page_count

- results = []
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
- futures = []
  for page_num in range(page_min, page_max):
  futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
 
@@ -218,10 +222,10 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False

  else:
  print(f"{file_path} is not an image or PDF file.")
- img_path = []
- image_sizes_width = []
- image_sizes_height = []
- all_img_details = []

  return img_path, image_sizes_width, image_sizes_height, all_img_details

@@ -230,7 +234,7 @@ def get_input_file_names(file_input:List[str]):
  Get list of input files to report to logs.
  '''

- all_relevant_files = []
  file_name_with_extension = ""
  full_file_name = ""
  total_pdf_page_count = 0
@@ -254,7 +258,7 @@ def get_input_file_names(file_input:List[str]):
  file_extension = os.path.splitext(file_path)[1].lower()

  # Check if the file is in acceptable types
- if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']) & ("review_file" not in file_path_without_ext) & ("ocr_output" not in file_path_without_ext):
  all_relevant_files.append(file_path_without_ext)
  file_name_with_extension = file_path_without_ext + file_extension
  full_file_name = file_path
@@ -415,8 +419,8 @@ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, cu
  return whole_page_img_annotation_box

  def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
- page_sizes = []
- original_cropboxes = []

  for page_no, page in enumerate(pymupdf_doc):
  reported_page_no = page_no + 1
@@ -439,9 +443,6 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
  out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0

  # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
- # MediaBox top y = mediabox.y1
- # CropBox top y = cropbox.y1
- # The difference is mediabox.y1 - cropbox.y1
  out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1

  if image_sizes_width and image_sizes_height:
@@ -452,23 +453,57 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]

  return page_sizes, original_cropboxes

  def prepare_image_or_pdf(
  file_paths: List[str],
  text_extract_method: str,
  all_line_level_ocr_results_df:pd.DataFrame,
  latest_file_completed: int = 0,
- out_message: List[str] = [],
  first_loop_state: bool = False,
  number_of_pages:int = 0,
- all_annotations_object:List = [],
  prepare_for_review:bool = False,
- in_fully_redacted_list:List[int]=[],
  output_folder:str=OUTPUT_FOLDER,
  input_folder:str=INPUT_FOLDER,
  prepare_images:bool=True,
- page_sizes:list[dict]=[],
  textract_output_found:bool = False,
- relevant_ocr_output_with_words_found:bool = False,
  progress: Progress = Progress(track_tqdm=True)
  ) -> tuple[List[str], List[str]]:
  """
@@ -490,6 +525,7 @@ def prepare_image_or_pdf(
  output_folder (optional, str): The output folder for file save
  prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
  page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
  textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
  relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
  progress (optional, Progress): Progress tracker for the operation
@@ -501,11 +537,11 @@ def prepare_image_or_pdf(

  tic = time.perf_counter()
  json_from_csv = False
- original_cropboxes = [] # Store original CropBox values
- converted_file_paths = []
- image_file_paths = []
- pymupdf_doc = []
- all_img_details = []
  review_file_csv = pd.DataFrame()
  out_textract_path = ""
  combined_out_message = ""
@@ -518,15 +554,15 @@ def prepare_image_or_pdf(
  # If this is the first time around, set variables to 0/blank
  if first_loop_state==True:
  latest_file_completed = 0
- out_message = []
- all_annotations_object = []
  else:
  print("Now redacting file", str(latest_file_completed))

  # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
  if isinstance(out_message, str): out_message = [out_message]

- if not file_paths: file_paths = []

  if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])

@@ -542,7 +578,8 @@ def prepare_image_or_pdf(
  final_out_message = '\n'.join(out_message)
  else:
  final_out_message = out_message
- return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found

  progress(0.1, desc='Preparing file')

555
 
556
  # Loop through files to load in
557
  for file in file_paths_loop:
558
- converted_file_path = []
559
- image_file_path = []
560
 
561
  if isinstance(file, str):
562
  file_path = file
@@ -565,15 +602,18 @@ def prepare_image_or_pdf(
565
  file_path_without_ext = get_file_name_without_type(file_path)
566
  file_name_with_ext = os.path.basename(file_path)
567
 
 
 
568
  if not file_path:
569
- out_message = "Please select a file."
570
  print(out_message)
571
- raise Exception(out_message)
572
 
573
  file_extension = os.path.splitext(file_path)[1].lower()
574
 
575
  # If a pdf, load as a pymupdf document
576
  if is_pdf(file_path):
 
577
  pymupdf_doc = pymupdf.open(file_path)
578
  pymupdf_pages = pymupdf_doc.page_count
579
 
@@ -588,16 +628,17 @@ def prepare_image_or_pdf(
588
 
589
  #Create base version of the annotation object that doesn't have any annotations in it
590
  if (not all_annotations_object) & (prepare_for_review == True):
591
- all_annotations_object = []
592
 
593
  for image_path in image_file_paths:
594
  annotation = {}
595
  annotation["image"] = image_path
596
- annotation["boxes"] = []
597
 
598
  all_annotations_object.append(annotation)
599
 
600
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
 
601
  # Check if the file is an image type and the user selected text ocr option
602
  if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
603
  text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
@@ -622,18 +663,25 @@ def prepare_image_or_pdf(
622
 
623
  pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
624
 
 
625
  elif file_extension in ['.csv']:
626
  if '_review_file' in file_path_without_ext:
627
  review_file_csv = read_file(file_path)
628
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
629
- json_from_csv = True
630
- #print("Converted CSV review file to image annotation object")
631
  elif '_ocr_output' in file_path_without_ext:
632
- all_line_level_ocr_results_df = read_file(file_path)
 
 
 
 
 
 
 
633
  json_from_csv = False
634
 
635
  # NEW IF STATEMENT
636
- # If the file name ends with .json, check if we are loading for review. If yes, assume it is an annoations object, overwrite the current annotations object. If false, assume this is a Textract object, load in to Textract
637
 
638
  if (file_extension in ['.json']) | (json_from_csv == True):
639
 
@@ -661,7 +709,7 @@ def prepare_image_or_pdf(
661
  continue
662
 
663
  elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
664
- print("Saving local OCR output")
665
  # Copy it to the output folder so it can be used later.
666
  output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
667
  # if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
@@ -672,6 +720,15 @@ def prepare_image_or_pdf(
672
  # Use shutil to copy the file directly
673
  shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
674
 
 
 
 
 
 
 
 
 
 
675
  if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
676
  if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
677
  if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
@@ -742,7 +799,7 @@ def prepare_image_or_pdf(
742
  continue
743
 
744
  # If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
745
- elif file_extension in ['.zip']:
746
 
747
  # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
748
  out_folder = os.path.join(output_folder, file_path_without_ext + "_textract.json")
@@ -766,37 +823,25 @@ def prepare_image_or_pdf(
  else:
  print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")

- elif file_extension in ['.csv'] and "ocr_output" in file_path:
- continue
-
- # Must be something else, return with error message
- else:
- if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION:
- if is_pdf_or_image(file_path) == False:
- out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
- print(out_message)
- raise Exception(out_message)
-
- elif text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
- if is_pdf(file_path) == False:
- out_message = "Please upload a PDF file for text analysis."
- print(out_message)
- raise Exception(out_message)
-
  converted_file_paths.append(converted_file_path)
  image_file_paths.extend(image_file_path)

  toc = time.perf_counter()
- out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."

  print(out_time)

  out_message.append(out_time)
  combined_out_message = '\n'.join(out_message)

- number_of_pages = len(page_sizes)#len(image_file_paths)

- return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found

  def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
  """
@@ -834,6 +879,8 @@ def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_fil
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
  file_path_without_ext = get_file_name_without_type(in_file_path)

  out_file_paths = out_text_file_path

  # Convert annotated text pdf back to image to give genuine redactions
@@ -896,7 +943,7 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
  image_groups[item['image']].append(item)

  # Process each group to prioritize items with non-empty boxes
- result = []
  for image, items in image_groups.items():
  # Filter items with non-empty boxes
  non_empty_boxes = [item for item in items if item.get('boxes')]
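The hunk above groups annotation dicts by image and prefers entries that actually carry boxes. A self-contained sketch of that group-then-prioritise idea (the real function's merge behaviour may differ in detail):

```python
from collections import defaultdict

def dedupe_keep_non_empty(data):
    # Group annotation dicts by image path, preserving insertion order
    image_groups = defaultdict(list)
    for item in data:
        image_groups[item['image']].append(item)

    result = list()
    for image, items in image_groups.items():
        # Prefer entries that actually carry boxes; fall back to the first
        non_empty = [item for item in items if item.get('boxes')]
        result.append(non_empty[0] if non_empty else items[0])
    return result

data = [
    {'image': 'p1.png', 'boxes': []},
    {'image': 'p1.png', 'boxes': [{'label': 'NAME'}]},
    {'image': 'p2.png', 'boxes': []},
]
print(dedupe_keep_non_empty(data))
```

Using `item.get('boxes')` treats both a missing key and an empty list as "blank", which is why the annotated duplicate wins for `p1.png`.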
@@ -1035,7 +1082,6 @@ def divide_coordinates_by_page_sizes(
1035
  else:
1036
  print("Skipping coordinate division due to missing or non-numeric dimension columns.")
1037
 
1038
-
1039
  # --- Combine Relative and Processed Absolute DataFrames ---
1040
  dfs_to_concat = [df for df in [df_rel, df_abs] if not df.empty]
1041
 
@@ -1046,7 +1092,6 @@ def divide_coordinates_by_page_sizes(
1046
  print("Warning: Both relative and absolute splits resulted in empty DataFrames.")
1047
  final_df = pd.DataFrame(columns=review_file_df.columns)
1048
 
1049
-
1050
  # --- Final Sort ---
1051
  required_sort_columns = {"page", xmin, ymin}
1052
  if not final_df.empty and required_sort_columns.issubset(final_df.columns):
@@ -1428,7 +1473,7 @@ def create_annotation_dicts_from_annotation_df(
1428
  def convert_annotation_json_to_review_df(
1429
  all_annotations: List[dict],
1430
  redaction_decision_output: pd.DataFrame = pd.DataFrame(),
1431
- page_sizes: List[dict] = [],
1432
  do_proximity_match: bool = True
1433
  ) -> pd.DataFrame:
1434
  '''
@@ -1615,7 +1660,7 @@ def convert_annotation_json_to_review_df(
1615
  if 'color' in review_file_df.columns:
1616
  # Check if the column actually contains lists before applying lambda
1617
  if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
1618
- review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
1619
 
1620
  # Sort the results
1621
  # Ensure sort columns exist before sorting
@@ -1642,6 +1687,86 @@ def convert_annotation_json_to_review_df(
1642
 
1643
  return review_file_df
 
  def fill_missing_box_ids(data_input: dict) -> dict:
  """
  Generates unique alphanumeric IDs for bounding boxes in an input dictionary
@@ -1873,7 +1998,7 @@ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12
  # --- Generate Unique IDs ---
  character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
  generated_ids_set = set() # Keep track of IDs generated *in this run*
- new_ids_list = [] # Store the generated IDs in order
 
  max_possible_ids = len(character_set) ** length
  if num_needed > max_possible_ids:
@@ -2080,14 +2205,14 @@ def convert_review_df_to_annotation_json(
 
 
  # --- Build JSON Structure ---
- json_data = []
  output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
 
  # Iterate through page_sizes_df to define the structure (one entry per image path)
  for _, row in page_sizes_df.iterrows():
  page_num = row['page'] # Already Int64
  pdf_image_path = row['image_path']
- annotation_boxes = [] # Default to empty list
 
  # Check if the page exists in the grouped annotations (using the faster set lookup)
  # Check pd.notna because page_num could be <NA> if conversion failed
@@ -2106,7 +2231,7 @@ def convert_review_df_to_annotation_json(
 
  except KeyError:
  print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
- annotation_boxes = [] # Keep empty
 
  # Append the structured data for this image/page
  json_data.append({
 
  import re
  import time
  import json
+ import gradio as gr
  import numpy as np
  import pymupdf
  from pymupdf import Document, Page, Rect
 
  Check if a given image size is above around 4.5mb, and reduce size if necessary. 5mb is the maximum possible to submit to AWS Textract.
  '''
 
+ all_img_details = list()
  page_num = 0
 
  # Check file size and resize if necessary
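The size check mentioned in the docstring above exists because AWS Textract's synchronous API rejects documents over 5 MB, so the code shrinks images above roughly 4.5 MB before submission. A hedged, stdlib-only sketch of that idea (the real code measures the actual saved file size with Pillow; `bytes_per_pixel` here is an invented rough estimate, not a value from the app):

```python
# Illustrative sketch, not the app's exact code: halve image dimensions until
# an estimated encoded size fits under a byte budget, to stay within AWS
# Textract's 5 MB synchronous request limit (the diff targets ~4.5 MB).
MAX_TEXTRACT_BYTES = 5 * 1024 * 1024   # hard AWS Textract sync limit
TARGET_BYTES = int(4.5 * 1024 * 1024)  # safety margin used by the app

def shrink_dimensions_to_budget(width: int, height: int,
                                bytes_per_pixel: float = 0.8,
                                max_bytes: int = TARGET_BYTES) -> tuple:
    """Halve width/height until the estimated encoded size fits max_bytes."""
    while width * height * bytes_per_pixel > max_bytes and min(width, height) > 1:
        # Halving both dimensions quarters the pixel count each iteration
        width, height = max(1, width // 2), max(1, height // 2)
    return width, height
```

In practice the loop would re-encode the resized image and measure real bytes rather than estimate, but the shrink-until-it-fits structure is the same.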
 
  elif pdf_path.lower().endswith(".jpg") or pdf_path.lower().endswith(".png") or pdf_path.lower().endswith(".jpeg"):
  image = Image.open(pdf_path)
  image.save(out_path, format="PNG")
+ else:
+ raise Warning("Could not create image.")
 
  width, height = image.size
 
  return page_num, out_path, width, height
 
  except Exception as e:
+
  print(f"Error processing page {page_num + 1}: {e}")
  return page_num, out_path_placeholder, pd.NA, pd.NA
  else:
 
  else:
  page_count = pdfinfo_from_path(pdf_path)['Pages']
 
+ print(f"Creating images. Number of pages in PDF: {page_count}")
 
  # Set page max to length of pdf if not specified
  if page_max == 0: page_max = page_count
 
+ results = list()
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
+ futures = list()
  for page_num in range(page_min, page_max):
  futures.append(executor.submit(process_single_page_for_image_conversion, pdf_path, page_num, image_dpi, create_images=create_images, input_folder=input_folder))
 
  else:
  print(f"{file_path} is not an image or PDF file.")
+ img_path = list()
+ image_sizes_width = list()
+ image_sizes_height = list()
+ all_img_details = list()
 
  return img_path, image_sizes_width, image_sizes_height, all_img_details
 
  Get list of input files to report to logs.
  '''
 
+ all_relevant_files = list()
  file_name_with_extension = ""
  full_file_name = ""
  total_pdf_page_count = 0
 
  file_extension = os.path.splitext(file_path)[1].lower()
 
  # Check if the file is in acceptable types
+ if (file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet', '.docx']) & ("review_file" not in file_path_without_ext) & ("ocr_output" not in file_path_without_ext) & ("ocr_results_with_words" not in file_path_without_ext):
  all_relevant_files.append(file_path_without_ext)
  file_name_with_extension = file_path_without_ext + file_extension
  full_file_name = file_path
 
  return whole_page_img_annotation_box
 
  def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float], image_file_paths:List[str]):
+ page_sizes = list()
+ original_cropboxes = list()
 
  for page_no, page in enumerate(pymupdf_doc):
  reported_page_no = page_no + 1
 
  out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
 
  # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
  out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
 
  if image_sizes_width and image_sizes_height:
 
  return page_sizes, original_cropboxes
 
+ def word_level_ocr_output_to_dataframe(ocr_results: dict) -> pd.DataFrame:
+ '''
+ Convert a json of ocr results to a dataframe
+ '''
+ rows = list()
+ ocr_result_page = ocr_results[0]
+
+ for ocr_result in ocr_results:
+
+ page_number = int(ocr_result['page'])
+
+ for line_key, line_data in ocr_result['results'].items():
+
+ line_number = int(line_data['line'])
+ for word in line_data['words']:
+ rows.append({
+ 'page': page_number,
+ 'line': line_number,
+ 'word_text': word['text'],
+ 'word_x0': word['bounding_box'][0],
+ 'word_y0': word['bounding_box'][1],
+ 'word_x1': word['bounding_box'][2],
+ 'word_y1': word['bounding_box'][3],
+ 'line_text': "", #line_data['text'], # This data is too large to include
+ 'line_x0': line_data['bounding_box'][0],
+ 'line_y0': line_data['bounding_box'][1],
+ 'line_x1': line_data['bounding_box'][2],
+ 'line_y1': line_data['bounding_box'][3],
+ })
+
+ return pd.DataFrame(rows)
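The helper added above flattens a list of per-page OCR dicts (each with numbered line results carrying word-level bounding boxes) into one DataFrame row per word. A hedged sketch of the data shape it consumes, with an invented sample; the flattening below is illustrative, not the app's exact code:

```python
# Minimal illustration of the nested OCR-results structure and its word-level
# flattening. The sample_ocr_results dict is invented for demonstration.
import pandas as pd

sample_ocr_results = [{
    "page": 1,
    "results": {
        "text_line_1": {
            "line": 1,
            "text": "Hello world",
            "bounding_box": [10, 10, 110, 30],  # line-level box: x0, y0, x1, y1
            "words": [
                {"text": "Hello", "bounding_box": [10, 10, 55, 30]},
                {"text": "world", "bounding_box": [60, 10, 110, 30]},
            ],
        }
    },
}]

# One output row per word, carrying both word and page/line context
rows = [
    {"page": int(page["page"]), "line": int(line["line"]),
     "word_text": word["text"],
     "word_x0": word["bounding_box"][0], "word_y0": word["bounding_box"][1],
     "word_x1": word["bounding_box"][2], "word_y1": word["bounding_box"][3]}
    for page in sample_ocr_results
    for line in page["results"].values()
    for word in line["words"]
]
word_df = pd.DataFrame(rows)
```

Note the committed version deliberately blanks out `line_text` to keep the resulting CSV small.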
+
  def prepare_image_or_pdf(
  file_paths: List[str],
  text_extract_method: str,
  all_line_level_ocr_results_df:pd.DataFrame,
+ all_page_line_level_ocr_results_with_words_df:pd.DataFrame,
  latest_file_completed: int = 0,
+ out_message: List[str] = list(),
  first_loop_state: bool = False,
  number_of_pages:int = 0,
+ all_annotations_object:List = list(),
  prepare_for_review:bool = False,
+ in_fully_redacted_list:List[int]=list(),
  output_folder:str=OUTPUT_FOLDER,
  input_folder:str=INPUT_FOLDER,
  prepare_images:bool=True,
+ page_sizes:list[dict]=list(),
+ pymupdf_doc:Document = list(),
  textract_output_found:bool = False,
+ relevant_ocr_output_with_words_found:bool = False,
  progress: Progress = Progress(track_tqdm=True)
  ) -> tuple[List[str], List[str]]:
  """
 
  output_folder (optional, str): The output folder for file save
  prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to True.
  page_sizes(optional, List[dict]): A list of dicts containing information about page sizes in various formats.
+ pymupdf_doc(optional, Document): A pymupdf document object that indicates the existing PDF document object.
  textract_output_found (optional, bool): A boolean indicating whether Textract analysis output has already been found. Defaults to False.
  relevant_ocr_output_with_words_found (optional, bool): A boolean indicating whether local OCR analysis output has already been found. Defaults to False.
  progress (optional, Progress): Progress tracker for the operation
 
  tic = time.perf_counter()
  json_from_csv = False
+ original_cropboxes = list() # Store original CropBox values
+ converted_file_paths = list()
+ image_file_paths = list()
+ # pymupdf_doc = list()
+ all_img_details = list()
  review_file_csv = pd.DataFrame()
  out_textract_path = ""
  combined_out_message = ""
 
  # If this is the first time around, set variables to 0/blank
  if first_loop_state==True:
  latest_file_completed = 0
+ out_message = list()
+ all_annotations_object = list()
  else:
  print("Now redacting file", str(latest_file_completed))
 
  # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
  if isinstance(out_message, str): out_message = [out_message]
 
+ if not file_paths: file_paths = list()
 
  if isinstance(file_paths, dict): file_paths = os.path.abspath(file_paths["name"])
 
  final_out_message = '\n'.join(out_message)
  else:
  final_out_message = out_message
+
+ return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found, all_page_line_level_ocr_results_with_words_df
 
  progress(0.1, desc='Preparing file')
 
  # Loop through files to load in
  for file in file_paths_loop:
+ converted_file_path = list()
+ image_file_path = list()
 
  if isinstance(file, str):
  file_path = file
 
  file_path_without_ext = get_file_name_without_type(file_path)
  file_name_with_ext = os.path.basename(file_path)
 
+ print("Loading file:", file_name_with_ext)
+
  if not file_path:
+ out_message = "Please select at least one file."
  print(out_message)
+ raise Warning(out_message)
 
  file_extension = os.path.splitext(file_path)[1].lower()
 
  # If a pdf, load as a pymupdf document
  if is_pdf(file_path):
+ print(f"File {file_name_with_ext} is a PDF")
  pymupdf_doc = pymupdf.open(file_path)
  pymupdf_pages = pymupdf_doc.page_count
 
  #Create base version of the annotation object that doesn't have any annotations in it
  if (not all_annotations_object) & (prepare_for_review == True):
+ all_annotations_object = list()
 
  for image_path in image_file_paths:
  annotation = {}
  annotation["image"] = image_path
+ annotation["boxes"] = list()
 
  all_annotations_object.append(annotation)
 
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+ print(f"File {file_name_with_ext} is an image")
  # Check if the file is an image type and the user selected text ocr option
  if file_extension in ['.jpg', '.jpeg', '.png'] and text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION:
  text_extract_method = TESSERACT_TEXT_EXTRACT_OPTION
 
  pymupdf_doc.save(converted_file_path, garbage=4, deflate=True, clean=True)
 
+ # Loading in review files, ocr_outputs, or ocr_outputs_with_words
  elif file_extension in ['.csv']:
  if '_review_file' in file_path_without_ext:
  review_file_csv = read_file(file_path)
  all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
+ json_from_csv = True
  elif '_ocr_output' in file_path_without_ext:
+ all_line_level_ocr_results_df = read_file(file_path)
+
+ if "line" not in all_line_level_ocr_results_df.columns:
+ all_line_level_ocr_results_df["line"] = ""
+
+ json_from_csv = False
+ elif '_ocr_results_with_words' in file_path_without_ext:
+ all_page_line_level_ocr_results_with_words_df = read_file(file_path)
  json_from_csv = False
 
  # NEW IF STATEMENT
+ # If the file name ends with .json, check if we are loading for review. If yes, assume it is an annotations object, overwrite the current annotations object. If false, assume this is a Textract object, load in to Textract
 
  if (file_extension in ['.json']) | (json_from_csv == True):
 
 
  continue
 
  elif (file_extension in ['.json']) and '_ocr_results_with_words' in file_path_without_ext: #(prepare_for_review != True):
+ print("Saving local OCR output with words")
  # Copy it to the output folder so it can be used later.
  output_ocr_results_with_words_json_file_name = file_path_without_ext + ".json"
  # if not file_path.endswith("_ocr_results_with_words.json"): output_ocr_results_with_words_json_file_name = file_path_without_ext + "_ocr_results_with_words.json"
 
  # Use shutil to copy the file directly
  shutil.copy2(file_path, out_ocr_results_with_words_path) # Preserves metadata
 
+ if prepare_for_review == True:
+ print("Converting local OCR output with words to csv")
+ page_sizes_df = pd.DataFrame(page_sizes)
+ all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(out_ocr_results_with_words_path, log_files_output_paths, page_sizes_df)
+ all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
+
+ all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
+ all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
+
  if text_extract_method == SELECTABLE_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_text.json"): relevant_ocr_output_with_words_found = True
  if text_extract_method == TESSERACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_local_ocr.json"): relevant_ocr_output_with_words_found = True
  if text_extract_method == TEXTRACT_TEXT_EXTRACT_OPTION and file_path.endswith("_ocr_results_with_words_textract.json"): relevant_ocr_output_with_words_found = True
 
  continue
 
  # If it's a zip, it could be extract from a Textract bulk API call. Check it's this, and load in json if found
+ if file_extension in ['.zip']:
 
  # Assume it's a Textract response object. Copy it to the output folder so it can be used later.
  out_folder = os.path.join(output_folder, file_path_without_ext + "_textract.json")
 
  else:
  print(f"Skipping {file_path}: Expected 1 JSON file, found {len(json_files)}")
 
  converted_file_paths.append(converted_file_path)
  image_file_paths.extend(image_file_path)
 
  toc = time.perf_counter()
+ out_time = f"File '{file_name_with_ext}' prepared in {toc - tic:0.1f} seconds."
 
  print(out_time)
 
  out_message.append(out_time)
  combined_out_message = '\n'.join(out_message)
 
+ if not page_sizes:
+ number_of_pages = 1
+ else:
+ number_of_pages = len(page_sizes)
+
+ print("Finished loading in files")
 
+ return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df, relevant_ocr_output_with_words_found, all_page_line_level_ocr_results_with_words_df
 
  def load_and_convert_ocr_results_with_words_json(ocr_results_with_words_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
  """
 
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
+ print("In convert_text_pdf_to_img_pdf function, file_path_without_ext:", file_path_without_ext)
+
  out_file_paths = out_text_file_path
 
  # Convert annotated text pdf back to image to give genuine redactions
 
  image_groups[item['image']].append(item)
 
  # Process each group to prioritize items with non-empty boxes
+ result = list()
  for image, items in image_groups.items():
  # Filter items with non-empty boxes
  non_empty_boxes = [item for item in items if item.get('boxes')]
 
  else:
  print("Skipping coordinate division due to missing or non-numeric dimension columns.")
 
  # --- Combine Relative and Processed Absolute DataFrames ---
  dfs_to_concat = [df for df in [df_rel, df_abs] if not df.empty]
 
  print("Warning: Both relative and absolute splits resulted in empty DataFrames.")
  final_df = pd.DataFrame(columns=review_file_df.columns)
 
  # --- Final Sort ---
  required_sort_columns = {"page", xmin, ymin}
  if not final_df.empty and required_sort_columns.issubset(final_df.columns):
 
  def convert_annotation_json_to_review_df(
  all_annotations: List[dict],
  redaction_decision_output: pd.DataFrame = pd.DataFrame(),
+ page_sizes: List[dict] = list(),
  do_proximity_match: bool = True
  ) -> pd.DataFrame:
  '''
 
  if 'color' in review_file_df.columns:
  # Check if the column actually contains lists before applying lambda
  if review_file_df['color'].apply(lambda x: isinstance(x, list)).any():
+ review_file_df.loc[:, "color"] = review_file_df.loc[:, "color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
 
  # Sort the results
  # Ensure sort columns exist before sorting
 
  return review_file_df
1689
 
+ def fill_missing_ids_in_list(data_list: list) -> list:
+ """
+ Generates unique alphanumeric IDs for dictionaries in a list where the 'id' is
+ missing, blank, or not a 12-character string.
+
+ Args:
+ data_list (list): A list of dictionaries, each potentially with an 'id' key.
+
+ Returns:
+ list: The input list with missing/invalid IDs filled.
+ Note: The function modifies the input list in place.
+ """
+
+ # --- Input Validation ---
+ if not isinstance(data_list, list):
+ raise TypeError("Input 'data_list' must be a list.")
+
+ if not data_list:
+ return data_list # Return empty list as-is
+
+ id_length = 12
+ character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
+
+ # --- Get Existing IDs to Ensure Uniqueness ---
+ # Collect all valid existing IDs first
+ existing_ids = set()
+ for item in data_list:
+ if not isinstance(item, dict):
+ continue # Skip non-dictionary items
+ item_id = item.get('id')
+ if isinstance(item_id, str) and len(item_id) == id_length:
+ existing_ids.add(item_id)
+
+ # --- Identify and Fill Items Needing IDs ---
+ generated_ids_set = set() # Keep track of IDs generated *in this run*
+ num_filled = 0
+
+ for item in data_list:
+ if not isinstance(item, dict):
+ continue # Skip non-dictionary items
+
+ item_id = item.get('id')
+
+ # Check if ID needs to be generated
+ # Needs ID if: key is missing, value is None, value is not a string,
+ # value is an empty string after stripping whitespace, or value is a string
+ # but not of the correct length.
+ needs_new_id = (
+ item_id is None or
+ not isinstance(item_id, str) or
+ item_id.strip() == "" or
+ len(item_id) != id_length
+ )
+
+ if needs_new_id:
+ # Generate a unique ID
+ attempts = 0
+ while True:
+ candidate_id = ''.join(random.choices(character_set, k=id_length))
+ # Check against *all* existing valid IDs and *newly* generated ones in this run
+ if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
+ generated_ids_set.add(candidate_id)
+ item['id'] = candidate_id # Assign the new ID directly to the item dict
+ num_filled += 1
+ break # Found a unique ID
+ attempts += 1
+ # Safety break for unlikely infinite loop (though highly improbable with 12 chars)
+ if attempts > len(data_list) * 100 + 1000:
+ raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs.")
+
+ if num_filled > 0:
+ pass
+ #print(f"Successfully filled {num_filled} missing or invalid IDs.")
+ else:
+ pass
+ #print("No missing or invalid IDs found.")
+
+ # The input list 'data_list' has been modified in place
+ return data_list
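A condensed, hedged sketch of the ID-filling logic this commit adds: collect the valid 12-character IDs already present, then assign fresh random alphanumeric IDs wherever the `id` is missing, blank, or the wrong length. Names below (`fill_ids`, the sample data) are invented for illustration, not the committed API:

```python
# Illustrative reduction of fill_missing_ids_in_list: same in-place semantics,
# without the attempt counter and logging of the full version.
import random
import string

ID_LENGTH = 12
CHARSET = string.ascii_letters + string.digits  # a-z, A-Z, 0-9

def fill_ids(items: list) -> list:
    """Assign a fresh 12-character alphanumeric id wherever one is missing or invalid."""
    # IDs already in use must never be re-issued
    used = {d["id"] for d in items
            if isinstance(d.get("id"), str) and len(d["id"]) == ID_LENGTH}
    for d in items:
        current = d.get("id")
        if not (isinstance(current, str) and len(current) == ID_LENGTH and current.strip()):
            while True:
                candidate = "".join(random.choices(CHARSET, k=ID_LENGTH))
                if candidate not in used:  # guard against collisions within this run
                    used.add(candidate)
                    d["id"] = candidate
                    break
    return items

boxes = [{"label": "PERSON", "id": ""}, {"label": "EMAIL"}]
boxes = fill_ids(boxes)  # both boxes now carry unique 12-character ids
```

With 62^12 possible IDs, collisions are vanishingly rare, which is why the committed version treats its retry limit as a pure safety net.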
+
  def fill_missing_box_ids(data_input: dict) -> dict:
  """
  Generates unique alphanumeric IDs for bounding boxes in an input dictionary
 
  # --- Generate Unique IDs ---
  character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
  generated_ids_set = set() # Keep track of IDs generated *in this run*
+ new_ids_list = list() # Store the generated IDs in order
 
  max_possible_ids = len(character_set) ** length
  if num_needed > max_possible_ids:
 
 
  # --- Build JSON Structure ---
+ json_data = list()
  output_cols_for_boxes = [col for col in ["label", "color", xmin, ymin, xmax, ymax, "id", "text"] if col in review_file_df.columns]
 
  # Iterate through page_sizes_df to define the structure (one entry per image path)
  for _, row in page_sizes_df.iterrows():
  page_num = row['page'] # Already Int64
  pdf_image_path = row['image_path']
+ annotation_boxes = list() # Default to empty list
 
  # Check if the page exists in the grouped annotations (using the faster set lookup)
  # Check pd.notna because page_num could be <NA> if conversion failed
 
  except KeyError:
  print(f"Warning: Group key {page_num} not found despite being in group_keys (should not happen).")
+ annotation_boxes = list() # Keep empty
 
  # Append the structured data for this image/page
  json_data.append({
tools/file_redaction.py CHANGED
@@ -15,14 +15,15 @@ from pdfminer.high_level import extract_pages
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
  from pikepdf import Pdf, Dictionary, Name
  from pymupdf import Rect, Page, Document
  import gradio as gr
  from gradio import Progress
  from collections import defaultdict # For efficient grouping
 
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
- from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
@@ -59,7 +60,15 @@ def sum_numbers_before_seconds(string:str):
 
  return sum_of_numbers
 
- def merge_page_results(data):
  merged = {}
 
  for item in data:
@@ -76,79 +85,55 @@ def merge_page_results(data):
 
  return list(merged.values())
 
- def word_level_ocr_output_to_dataframe(ocr_result: dict) -> pd.DataFrame:
- rows = []
- ocr_result = ocr_result[0]
-
- page_number = int(ocr_result['page'])
-
- for line_key, line_data in ocr_result['results'].items():
- line_number = int(line_data['line'])
- for word in line_data['words']:
- rows.append({
- 'page': page_number,
- 'line': line_number,
- 'word_text': word['text'],
- 'word_x0': word['bounding_box'][0],
- 'word_y0': word['bounding_box'][1],
- 'word_x1': word['bounding_box'][2],
- 'word_y1': word['bounding_box'][3],
- 'line_text': line_data['text'],
- 'line_x0': line_data['bounding_box'][0],
- 'line_y0': line_data['bounding_box'][1],
- 'line_x1': line_data['bounding_box'][2],
- 'line_y1': line_data['bounding_box'][3],
- })
-
- return pd.DataFrame(rows)
-
  def choose_and_run_redactor(file_paths:List[str],
  prepared_pdf_file_paths:List[str],
- pdf_image_file_paths:List[str],
- language:str,
  chosen_redact_entities:List[str],
  chosen_redact_comprehend_entities:List[str],
  text_extraction_method:str,
- in_allow_list:List[List[str]]=None,
- custom_recogniser_word_list:List[str]=None,
- redact_whole_page_list:List[str]=None,
  latest_file_completed:int=0,
- combined_out_message:List=[],
- out_file_paths:List=[],
- log_files_output_paths:List=[],
  first_loop_state:bool=False,
  page_min:int=0,
  page_max:int=999,
  estimated_time_taken_state:float=0.0,
- handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
  all_request_metadata_str:str = "",
- annotations_all_pages:List[dict]=[],
- all_line_level_ocr_results_df:pd.DataFrame=[],#pd.DataFrame(),
- all_pages_decision_process_table:pd.DataFrame=[],#pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"]),
- pymupdf_doc=[],
  current_loop_page:int=0,
  page_break_return:bool=False,
- pii_identification_method:str="Local",
  comprehend_query_number:int=0,
  max_fuzzy_spelling_mistakes_num:int=1,
  match_fuzzy_whole_phrase_bool:bool=True,
  aws_access_key_textbox:str='',
  aws_secret_key_textbox:str='',
  annotate_max_pages:int=1,
- review_file_state:pd.DataFrame=[],
  output_folder:str=OUTPUT_FOLDER,
- document_cropboxes:List=[],
- page_sizes:List[dict]=[],
  textract_output_found:bool=False,
  text_extraction_only:bool=False,
- duplication_file_path_outputs:list=[],
  review_file_path:str="",
  input_folder:str=INPUT_FOLDER,
  total_textract_query_number:int=0,
  ocr_file_path:str="",
- all_page_line_level_ocr_results = [],
- all_page_line_level_ocr_results_with_words = [],
- prepare_images:bool=True,
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
  progress=gr.Progress(track_tqdm=True)):
  '''
@@ -157,7 +142,7 @@ def choose_and_run_redactor(file_paths:List[str],
  - file_paths (List[str]): A list of paths to the files to be redacted.
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
  - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
- - language (str): The language of the text in the files.
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
  - text_extraction_method (str): The method to use to extract text from documents.
@@ -175,7 +160,7 @@ def choose_and_run_redactor(file_paths:List[str],
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
  - all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
  - annotations_all_pages (List[dict], optional): A list of dictionaries containing all image annotations. Defaults to an empty list.
- - all_line_level_ocr_results_df (pd.DataFrame, optional): A DataFrame containing all line-level OCR results. Defaults to an empty DataFrame.
  - all_pages_decision_process_table (pd.DataFrame, optional): A DataFrame containing all decision process tables. Defaults to an empty DataFrame.
  - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
  - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.
@@ -200,7 +185,11 @@ def choose_and_run_redactor(file_paths:List[str],
  - ocr_file_path (str, optional): The latest ocr file path created by the app.
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
- - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -210,11 +199,31 @@ def choose_and_run_redactor(file_paths:List[str],
210
 
211
  out_message = ""
212
  pdf_file_name_with_ext = ""
213
- pdf_file_name_without_ext = ""
214
- blank_request_metadata = []
 
215
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
216
- review_out_file_paths = [prepared_pdf_file_paths[0]]
 
 
 
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  # Ensure all_pages_decision_process_table is in correct format for downstream processes
219
  if isinstance(all_pages_decision_process_table,list):
220
  if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
@@ -227,7 +236,8 @@ def choose_and_run_redactor(file_paths:List[str],
  #print("First_loop_state is True")
  latest_file_completed = 0
  current_loop_page = 0
- out_file_paths = []
  estimate_total_processing_time = 0
  estimated_time_taken_state = 0
  comprehend_query_number = 0

@@ -239,7 +249,7 @@ def choose_and_run_redactor(file_paths:List[str],
  elif (first_loop_state == False) & (current_loop_page == 999):
  current_loop_page = 0
  total_textract_query_number = 0
- comprehend_query_number = 0
  # Choose the correct file to prepare
  if isinstance(file_paths, str): file_paths_list = [os.path.abspath(file_paths)]

@@ -256,6 +266,8 @@ def choose_and_run_redactor(file_paths:List[str],
  # Check if any files were found and assign to file_paths_list
  file_paths_list = filtered_files if filtered_files else []
  # If latest_file_completed is used, get the specific file
  if not isinstance(file_paths, (str, dict)): file_paths_loop = [file_paths_list[int(latest_file_completed)]] if len(file_paths_list) > latest_file_completed else []
  else: file_paths_loop = file_paths_list
@@ -287,8 +299,7 @@ def choose_and_run_redactor(file_paths:List[str],
  # Only send across review file if redaction has been done
  if pii_identification_method != NO_REDACTION_PII_OPTION:
- if len(review_out_file_paths) == 1:
- #review_file_path = [x for x in out_file_paths if "review_file" in x]
  if review_file_path: review_out_file_paths.append(review_file_path)
  if not isinstance(pymupdf_doc, list):

@@ -299,7 +310,9 @@ def choose_and_run_redactor(file_paths:List[str],
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
  print("Estimated total processing time:", str(estimate_total_processing_time))
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
  #if first_loop_state == False:
  # Prepare documents and images as required if they don't already exist
@@ -329,10 +342,10 @@ def choose_and_run_redactor(file_paths:List[str],
  # Call prepare_image_or_pdf only if needed
  if prepare_images_flag is not None:
- out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
- file_paths_loop, text_extraction_method, all_line_level_ocr_results_df, 0, out_message, True,
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
- output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
  )
  page_sizes_df = pd.DataFrame(page_sizes)

@@ -343,8 +356,7 @@ def choose_and_run_redactor(file_paths:List[str],
  page_sizes = page_sizes_df.to_dict(orient="records")
- number_of_pages = pymupdf_doc.page_count
-
  # If we have reached the last page, return message and outputs
  if current_loop_page >= number_of_pages:
@@ -361,11 +373,12 @@ def choose_and_run_redactor(file_paths:List[str],
  # Only send across review file if redaction has been done
  if pii_identification_method != NO_REDACTION_PII_OPTION:
  # If only pdf currently in review outputs, add on the latest review file
- if len(review_out_file_paths) == 1:
- #review_file_path = [x for x in out_file_paths if "review_file" in x]
  if review_file_path: review_out_file_paths.append(review_file_path)
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
  # Load/create allow list
  # If string, assume file path

@@ -374,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
  if not in_allow_list.empty:
  in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
  else:
- in_allow_list_flat = []
  # If string, assume file path
  if isinstance(custom_recogniser_word_list, str):

@@ -383,7 +396,7 @@ def choose_and_run_redactor(file_paths:List[str],
  if not custom_recogniser_word_list.empty:
  custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
  else:
- custom_recogniser_word_list_flat = []
  # Sort the strings in order from the longest string to the shortest
  custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
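Note for reviewers: the longest-first sort above matters when deny-list terms overlap, because trying the longer phrase first stops a substring from claiming its span. A self-contained sketch of the idea (illustrative only; `sort_deny_list` and `first_match` are hypothetical helpers, not functions from this repository):

```python
# Illustrative sketch: why a custom deny list is sorted longest-first
# before matching. Not the app's actual matching code.

def sort_deny_list(terms):
    """Sort terms from longest to shortest, mirroring the sorted(..., key=len, reverse=True) call."""
    return sorted(terms, key=len, reverse=True)

def first_match(text, terms):
    """Return the first deny-list term found in text, trying longer phrases first."""
    for term in sort_deny_list(terms):
        if term in text:
            return term
    return None
```

With the reverse ordering, "John Smith" is matched as one phrase rather than the shorter "John" matching first and leaving "Smith" behind.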
@@ -399,7 +412,7 @@ def choose_and_run_redactor(file_paths:List[str],
  print("Could not convert whole page redaction data to number list due to:", e)
  redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
  else:
- redact_whole_page_list_flat = []

@@ -452,13 +465,28 @@ def choose_and_run_redactor(file_paths:List[str],
  else:
  textract_client = ""
  # Check if output_folder exists, create it if it doesn't
  if not os.path.exists(output_folder): os.makedirs(output_folder)
  progress(0.5, desc="Extracting text and redacting document")
  all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
- all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
  # Run through file loop, redact each file at a time
  for file in file_paths_loop:
@@ -482,16 +510,16 @@ def choose_and_run_redactor(file_paths:List[str],
  raise Exception(out_message)
  # Output file paths names
- orig_pdf_file_path = output_folder + pdf_file_name_with_ext
  review_file_path = orig_pdf_file_path + '_review_file.csv'
  # Load in all_ocr_results_with_words if it exists as a file path and doesn't exist already
- file_name = get_file_name_without_type(file_path)
- if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_text.json"
- elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_ocr.json"
- elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_textract.json"
- all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + file_ending
  if not all_page_line_level_ocr_results_with_words:
  if local_ocr_output_found_checkbox == True and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):

@@ -509,7 +537,7 @@ def choose_and_run_redactor(file_paths:List[str],
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
- pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
  pdf_image_file_paths,
  language,
  chosen_redact_entities,
@@ -523,7 +551,7 @@ def choose_and_run_redactor(file_paths:List[str],
  current_loop_page,
  page_break_return,
  annotations_all_pages,
- all_line_level_ocr_results_df,
  all_pages_decision_process_table,
  pymupdf_doc,
  pii_identification_method,

@@ -538,14 +566,17 @@ def choose_and_run_redactor(file_paths:List[str],
  text_extraction_only,
  all_page_line_level_ocr_results,
  all_page_line_level_ocr_results_with_words,
  log_files_output_paths=log_files_output_paths,
  output_folder=output_folder)
-
- # Save Textract request metadata (if exists)
  if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
- all_textract_request_metadata.extend(new_textract_request_metadata)
-
  elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
@@ -556,7 +587,7 @@ def choose_and_run_redactor(file_paths:List[str],
  # Analyse text-based pdf
  print('Redacting file as text-based PDF')
- pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
  file_path,
  language,
  chosen_redact_entities,

@@ -567,7 +598,7 @@ def choose_and_run_redactor(file_paths:List[str],
  current_loop_page,
  page_break_return,
  annotations_all_pages,
- all_line_level_ocr_results_df,
  all_pages_decision_process_table,
  pymupdf_doc,
  all_page_line_level_ocr_results_with_words,
@@ -615,43 +646,50 @@ def choose_and_run_redactor(file_paths:List[str],
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
  print("Saving redacted PDF file:", out_redacted_pdf_file_path)
  save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
  out_file_paths.append(out_redacted_pdf_file_path)
- if not all_line_level_ocr_results_df.empty:
- all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
- else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
- ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
- all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
- all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
  out_file_paths.append(ocr_file_path)
-
  duplication_file_path_outputs.append(ocr_file_path)
  if all_page_line_level_ocr_results_with_words:
- #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
- #
- #if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
-
  all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
-
- # print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
-
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
  json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
  all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)
  all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
- all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
- all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
- all_page_line_level_ocr_results_with_words_df_file_path = (output_folder + file_name + file_ending).replace(".json", ".csv")
- all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None)
-
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
@@ -659,8 +697,10 @@ def choose_and_run_redactor(file_paths:List[str],
  if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
  # Convert the gradio annotation boxes to relative coordinates
- # Convert annotations_all_pages to a consistent relative coordinate format output
  progress(0.93, "Creating review file output")
  page_sizes = page_sizes_df.to_dict(orient="records")
  all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)

@@ -674,10 +714,9 @@ def choose_and_run_redactor(file_paths:List[str],
  # Don't need page sizes in outputs
  review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
- review_file_state.to_csv(review_file_path, index=None)
- if pii_identification_method != NO_REDACTION_PII_OPTION:
- out_file_paths.append(review_file_path)
  # Make a combined message for the file
  if isinstance(out_message, list) and out_message:
@@ -699,18 +738,16 @@ def choose_and_run_redactor(file_paths:List[str],
  time_taken = toc - tic
  estimated_time_taken_state += time_taken
- # If textract requests made, write to logging file. Alos record number of Textract requests
  if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
  all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
  all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
- with open(all_textract_request_metadata_file_path, "w") as f:
- f.write(all_request_metadata_str)
  # Add the request metadata to the log outputs if not there already
- if all_textract_request_metadata_file_path not in log_files_output_paths:
- log_files_output_paths.append(all_textract_request_metadata_file_path)
  new_textract_query_numbers = len(all_textract_request_metadata)
  total_textract_query_number += new_textract_query_numbers

@@ -725,7 +762,9 @@ def choose_and_run_redactor(file_paths:List[str],
  if total_textract_query_number > number_of_pages: total_textract_query_number = number_of_pages
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
  '''
@@ -1063,7 +1102,7 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
  else:
  page.set_cropbox(original_cropbox)
- def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
  rect_height = page.rect.height
  rect_width = page.rect.width

@@ -1090,7 +1129,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
  image_dimensions = {}
  out_annotation_boxes = {}
- all_image_annotation_boxes = []
  if isinstance(image, Image.Image):
  image_path = move_page_info(str(page))

@@ -1201,10 +1240,25 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
  # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
  ###
- def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_results=[], page_handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Extract handwriting", "Extract signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
- all_bboxes = []
- merged_bboxes = []
  grouped_bboxes = defaultdict(list)
  # Deep copy original bounding boxes to retain them
@@ -1219,7 +1273,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_r
  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))
  # Reconstruct bounding boxes for substrings of interest
- reconstructed_bboxes = []
  for bbox in bboxes:
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
  for line_text, line_info in combined_results.items():

@@ -1229,7 +1283,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_r
  start_char = line_text.index(bbox.text)
  end_char = start_char + len(bbox.text)
- relevant_words = []
  current_char = 0
  for word in line_info['words']:
  word_end = current_char + len(word['text'])
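As context for the `merge_img_bboxes` changes above: the old signature defaulted to `horizontal_threshold=50` and `vertical_threshold=12`, i.e. boxes on roughly the same line and within a small horizontal gap get fused into one redaction box. A self-contained sketch of that merging idea (an assumption-laden illustration, not the repository's implementation; boxes here are plain `(x0, y0, x1, y1)` tuples):

```python
# Illustrative sketch: merge redaction boxes that sit close together on
# the same text line. Threshold values mirror the old merge_img_bboxes
# defaults, but the logic is a simplification.

HORIZONTAL_THRESHOLD = 50  # max horizontal gap between boxes to merge
VERTICAL_THRESHOLD = 12    # max vertical offset to count as the same line

def merge_close_boxes(boxes):
    """Merge (x0, y0, x1, y1) boxes that are on the same line and nearly touching."""
    merged = []
    for box in sorted(boxes, key=lambda b: (b[1], b[0])):  # top-to-bottom, left-to-right
        if merged:
            last = merged[-1]
            same_line = abs(box[1] - last[1]) <= VERTICAL_THRESHOLD
            close = box[0] - last[2] <= HORIZONTAL_THRESHOLD
            if same_line and close:
                # Grow the previous box to cover both
                merged[-1] = (last[0], min(last[1], box[1]),
                              max(last[2], box[2]), max(last[3], box[3]))
                continue
        merged.append(box)
    return merged
```

Merging like this produces one solid black rectangle over an adjacent run of redacted words instead of a row of slivers with visible gaps.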
@@ -1318,33 +1372,35 @@ def redact_image_pdf(file_path:str,
  page_max:int=999,
  text_extraction_method:str=TESSERACT_TEXT_EXTRACT_OPTION,
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
- textract_request_metadata:list=[],
  current_loop_page:int=0,
  page_break_return:bool=False,
- annotations_all_pages:List=[],
- all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]),
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
- pymupdf_doc:Document = [],
  pii_identification_method:str="Local",
  comprehend_query_number:int=0,
  comprehend_client:str="",
  textract_client:str="",
- custom_recogniser_word_list:List[str]=[],
- redact_whole_page_list:List[str]=[],
  max_fuzzy_spelling_mistakes_num:int=1,
  match_fuzzy_whole_phrase_bool:bool=True,
  page_sizes_df:pd.DataFrame=pd.DataFrame(),
  text_extraction_only:bool=False,
- all_page_line_level_ocr_results = [],
- all_page_line_level_ocr_results_with_words = [],
  page_break_val:int=int(PAGE_BREAK_VALUE),
- log_files_output_paths:List=[],
  max_time:int=int(MAX_TIME_VALUE),
- output_folder:str=OUTPUT_FOLDER,
  progress=Progress(track_tqdm=True)):
  '''
- This function redacts sensitive information from a PDF document. It takes the following parameters:
  - file_path (str): The path to the PDF file to be redacted.
  - pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.
@@ -1357,9 +1413,10 @@ def redact_image_pdf(file_path:str,
  - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
  - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
- - all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
  - all_pages_decision_process_table (pd.DataFrame, optional): All redaction decisions for document as a Pandas dataframe.
  - pymupdf_doc (Document, optional): The document as a PyMupdf object.
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).

@@ -1372,10 +1429,14 @@ def redact_image_pdf(file_path:str,
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
- - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
- - output_folder (str, optional): The folder for file outputs.
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
  The function returns a redacted PDF document along with processing output objects.
@@ -1386,6 +1447,17 @@ def redact_image_pdf(file_path:str,
  file_name = get_file_name_without_type(file_path)
  comprehend_query_number_new = 0
  # Update custom word list analyser object with any new words that have been added to the custom deny list
  if custom_recogniser_word_list:
  nlp_analyser.registry.remove_recognizer("CUSTOM")

@@ -1396,7 +1468,11 @@ def redact_image_pdf(file_path:str,
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
  out_message = "Connection to AWS Comprehend service unsuccessful."

@@ -1406,7 +1482,7 @@ def redact_image_pdf(file_path:str,
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
  print(out_message_warning)
- #raise Exception(out_message)
  number_of_pages = pymupdf_doc.page_count
  print("Number of pages:", str(number_of_pages))
@@ -1425,7 +1501,7 @@ def redact_image_pdf(file_path:str,
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
  original_textract_data = textract_data.copy()
- print("Successfully loaded in Textract analysis results from file")
  # If running local OCR option, check if file already exists. If it does, load in existing data
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:

@@ -1433,7 +1509,7 @@ def redact_image_pdf(file_path:str,
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
- print("Loaded in local OCR analysis results from file")
  ###
  if current_loop_page == 0: page_loop_start = 0

@@ -1442,11 +1518,11 @@ def redact_image_pdf(file_path:str,
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
  # If there's data from a previous run (passed in via the DataFrame parameters), add it
- all_line_level_ocr_results_list = []
- all_pages_decision_process_list = []
- if not all_line_level_ocr_results_df.empty:
- all_line_level_ocr_results_list.extend(all_line_level_ocr_results_df.to_dict('records'))
  if not all_pages_decision_process_table.empty:
  all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
@@ -1454,10 +1530,10 @@ def redact_image_pdf(file_path:str,
  # Go through each page
  for page_no in progress_bar:
- handwriting_or_signature_boxes = []
- page_signature_recogniser_results = []
- page_handwriting_recogniser_results = []
- page_line_level_ocr_results_with_words = []
  page_break_return = False
  reported_page_number = str(page_no + 1)

@@ -1495,12 +1571,7 @@ def redact_image_pdf(file_path:str,
  print("Can't find original cropbox details for page, using current PyMuPDF page cropbox")
  original_cropbox = pymupdf_page.cropbox.irect
- # Possibility to use different languages
- if language == 'en': ocr_lang = 'eng'
- else: ocr_lang = language
-
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
-
  # If using Tesseract
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
@@ -1513,7 +1584,7 @@ def redact_image_pdf(file_path:str,
  )
  page_line_level_ocr_results_with_words = matching_page if matching_page else []
- else: page_line_level_ocr_results_with_words = []
  if page_line_level_ocr_results_with_words:
  print("Found OCR results for page in existing OCR with words object")

@@ -1523,11 +1594,14 @@ def redact_image_pdf(file_path:str,
  page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
  # Check if page exists in existing textract data. If not, send to service to analyse
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
- text_blocks = []
  if not textract_data:
  try:
@@ -1565,7 +1639,7 @@ def redact_image_pdf(file_path:str,
  text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
  # Check if "pages" key exists, if not, initialise it as an empty list
- if "pages" not in textract_data: textract_data["pages"] = []
  # Append the new page data
  textract_data["pages"].append(text_blocks)

@@ -1573,11 +1647,11 @@ def redact_image_pdf(file_path:str,
  except Exception as e:
  out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
  print(out_message)
- text_blocks = []
  new_textract_request_metadata = "Failed Textract API call"
  # Check if "pages" key exists, if not, initialise it as an empty list
- if "pages" not in textract_data: textract_data["pages"] = []
  raise Exception(out_message)

@@ -1589,6 +1663,9 @@ def redact_image_pdf(file_path:str,
  page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
  # Convert to DataFrame and add to ongoing logging table

@@ -1598,7 +1675,8 @@ def redact_image_pdf(file_path:str,
  'left': result.left,
  'top': result.top,
  'width': result.width,
- 'height': result.height
  } for result in page_line_level_ocr_results['results']])
  if not line_level_ocr_results_df.empty: # Ensure there are records to add
@@ -1613,21 +1691,22 @@ def redact_image_pdf(file_path:str,
  page_line_level_ocr_results_with_words['results'],
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
  pii_identification_method = pii_identification_method,
- comprehend_client=comprehend_client,
  language=language,
- entities=chosen_redact_entities,
  allow_list=allow_list,
- score_threshold=score_threshold
  )
  comprehend_query_number = comprehend_query_number + comprehend_query_number_new
- else: page_redaction_bounding_boxes = []
  # Merge redaction bounding boxes that are close together
  page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)
- else: page_merged_redaction_bboxes = []
  # 3. Draw the merged boxes
  ## Apply annotations to pdf with pymupdf

@@ -1654,7 +1733,7 @@ def redact_image_pdf(file_path:str,
  fill = (0, 0, 0) # Fill colour for redactions
  draw = ImageDraw.Draw(image)
- all_image_annotations_boxes = []
  for box in page_merged_redaction_bboxes:
@@ -1696,9 +1775,7 @@ def redact_image_pdf(file_path:str,
1696
 
1697
  page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
1698
 
1699
- redacted_image = image.copy()
1700
- #redacted_image.save("test_out_image.png")
1701
-
1702
 
1703
  # Convert decision process to table
1704
  decision_process_table = pd.DataFrame([{
@@ -1720,7 +1797,6 @@ def redact_image_pdf(file_path:str,
1720
  all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
1721
 
1722
  decision_process_table = fill_missing_ids(decision_process_table)
1723
- decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
1724
 
1725
  toc = time.perf_counter()
1726
 
@@ -1855,224 +1931,225 @@ def get_text_container_characters(text_container:LTTextContainer):
  return characters
  return []

- def create_line_level_ocr_results_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
- '''
- Create an OCRResult object based on a list of pdfminer LTChar objects.
- '''

- line_level_results_out = []
- line_level_characters_out = []
- line_level_words_out = {}
- character_objects_out = []

- # Initialize variables
  full_text = ""
- added_text = ""
- overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
- line_bboxes = []
-
- # Iterate through the character objects
- current_word = ""
- current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]

  for char in char_objects:
- character_objects_out.append(char) # Collect character objects
-
- if not isinstance(char, LTAnno):
- character_text = char.get_text()
- # character_text_objects_out.append(character_text)

  if isinstance(char, LTAnno):
  added_text = char.get_text()
-
- # Handle double quotes
- #added_text = added_text.replace('"', '\\"') # Escape double quotes
-
- # Handle space separately by finalizing the word
- full_text += added_text # Adds space or newline

- if current_word: # Only finalise if there is a current word
- line_bboxes.append((current_word, current_word_bbox))
- current_word = ""
- current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
-
- # Check for line break (assuming a new line is indicated by a specific character)
  if '\n' in added_text:

- # finalise the current line
- if current_word:
- line_bboxes.append((current_word, current_word_bbox))
- # Create an OCRResult for the current line
- line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
- line_level_characters_out.append(character_objects_out)
  # Reset for the next line
- character_objects_out = []
  full_text = ""
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
- current_word = ""
- current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
-
  continue

- # Concatenate text for LTChar

- #full_text += char.get_text()
- #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
- added_text = char.get_text()
- if re.search(r'[^\x00-\x7F]', added_text): # Matches any non-ASCII character
- #added_text.encode('latin1', errors='replace').decode('utf-8')
- added_text = clean_unicode_text(added_text)
- full_text += added_text # Adds space or newline
-
- # Update overall bounding box
  x0, y0, x1, y1 = char.bbox
- overall_bbox[0] = min(overall_bbox[0], x0) # x0
- overall_bbox[1] = min(overall_bbox[1], y0) # y0
- overall_bbox[2] = max(overall_bbox[2], x1) # x1
- overall_bbox[3] = max(overall_bbox[3], y1) # y1
-
- # Update current word
- #current_word += char.get_text()
- current_word += added_text
-
- # Update current word bounding box
- current_word_bbox[0] = min(current_word_bbox[0], x0) # x0
- current_word_bbox[1] = min(current_word_bbox[1], y0) # y0
- current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
- current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
-
- # Finalise the last word if any
- if current_word:
- line_bboxes.append((current_word, current_word_bbox))
-
- if full_text:
- print("full_text found")
- if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
- # Convert special characters to a human-readable format
-
- full_text = clean_unicode_text(full_text)
- full_text = full_text.strip()
-
- line_ocr_result_bbox = round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)
-
- line_ocr_result = OCRResult(full_text.strip(), line_ocr_result_bbox)
-
- line_level_results_out.append(line_ocr_result)
-
- else:
- line_ocr_result_bbox = []
-
- # if line_ocr_result_bbox:
- # line_level_words_out["page"] = 1
- # line_level_words_out['results'] = {'text_line_1':{"line":1, "text":full_text, "bounding_box": line_ocr_result_bbox, "words": line_bboxes}}
- # else:
- # line_level_words_out = {}
-

- return line_level_results_out, line_level_characters_out # Return both results and character objects

- def generate_word_level_ocr(char_objects: List, page_number: int, text_line_number:int) -> Dict[str, Any]:
  """
- Generates a dictionary with line and word-level OCR results from a list of pdfminer.six objects.

- This robust version handles real-world pdfminer.six output by:
- 1. Filtering out non-character (LTAnno) objects that lack coordinate data.
- 2. Sorting all text characters (LTChar) into a proper reading order.
- 3. Using an adaptive threshold for detecting spaces based on character font size.

  Args:
- char_objects: A mixed list of pdfminer.six LTChar and LTAnno objects from a single page.
- page_number: The page number where the characters are from.

  Returns:
- A dictionary formatted with page, line, and word-level results.
  """
- # **CRITICAL FIX: Filter out LTAnno objects, as they lack '.bbox' and are not needed for layout analysis.**
- text_chars = [c for c in char_objects if isinstance(c, LTChar)]

  if not text_chars:
- return {"page": str(page_number), "results": {}}

- # Sort the remaining text characters into reading order.
- text_chars.sort(key=lambda c: (-c.bbox[3], c.bbox[0]))

- page_data = {"page": str(page_number), "results": {}}
- line_number = text_line_number

- # State variables
- line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
- current_word_text, current_word_bbox = "", [float('inf'), float('inf'), -1, -1]
  prev_char = None

  def finalize_word():
  nonlocal current_word_text, current_word_bbox
- word_text = current_word_text.strip()
- if word_text:
  line_words.append({
- "text": word_text,
- "bounding_box": [round(b, 2) for b in current_word_bbox]
  })
  current_word_text = ""
  current_word_bbox = [float('inf'), float('inf'), -1, -1]

- def finalize_line():
- nonlocal line_text, line_bbox, line_words, line_number, prev_char
- finalize_word()
- if line_text.strip():
- page_data["results"][f"text_line_{line_number}"] = {
- "line": line_number,
- "text": line_text.strip(),
- "bounding_box": [round(b, 2) for b in line_bbox],
- "words": line_words
- }
- line_number += 1
- line_text, line_bbox, line_words = "", [float('inf'), float('inf'), -1, -1], []
- prev_char = None
-
  for char in text_chars:
  char_text = clean_unicode_text(char.get_text())

- if prev_char:
- char_height = char.bbox[3] - char.bbox[1]
- vertical_gap = abs(char.bbox[1] - prev_char.bbox[1])
-
- # Line break detection
- if vertical_gap > char_height * 0.7:
- finalize_line()
- else:
- # Check for spacing between characters
- space_threshold = char.size * 0.5
- gap = char.bbox[0] - prev_char.bbox[2]
- if gap > max(space_threshold, 1.0):
- finalize_word()
- line_text += " "
-
- # Explicitly finalize the word if this is a space character
- if char_text == " ":
  finalize_word()
- line_text += " "
  prev_char = char
- continue

- current_word_text += char_text
- line_text += char_text

- # Update bounding boxes
- current_word_bbox[0] = min(current_word_bbox[0], char.bbox[0])
- current_word_bbox[1] = min(current_word_bbox[1], char.bbox[1])
- current_word_bbox[2] = max(current_word_bbox[2], char.bbox[2])
- current_word_bbox[3] = max(current_word_bbox[3], char.bbox[3])

- line_bbox[0] = min(line_bbox[0], char.bbox[0])
- line_bbox[1] = min(line_bbox[1], char.bbox[1])
- line_bbox[2] = max(line_bbox[2], char.bbox[2])
- line_bbox[3] = max(line_bbox[3], char.bbox[3])

  prev_char = char

- finalize_line()

- return page_data
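The removed `generate_word_level_ocr` groups pdfminer `LTChar` objects into words by splitting whenever the horizontal gap between consecutive characters exceeds half the font size (with a 1.0 floor). A minimal sketch of that splitting rule in isolation, using plain tuples as stand-ins for pdfminer objects (an assumption for illustration):

```python
from typing import List, Tuple

# Each char is (text, x0, y0, x1, y1, font_size) — a stand-in for pdfminer's LTChar.
Char = Tuple[str, float, float, float, float, float]

def group_chars_into_words(chars: List[Char]) -> List[str]:
    """Split a line of characters into words when the horizontal gap between
    consecutive characters exceeds max(0.5 * font_size, 1.0), mirroring the
    adaptive space threshold in the diff above."""
    words: List[str] = []
    current = ""
    prev_x1 = None
    for text, x0, y0, x1, y1, size in chars:
        if prev_x1 is not None:
            gap = x0 - prev_x1
            if gap > max(size * 0.5, 1.0):
                if current:
                    words.append(current)
                current = ""
        current += text
        prev_x1 = x1
    if current:
        words.append(current)
    return words
```

Scaling the threshold by font size is what makes the split robust across documents: a 6pt gap is a space in 10pt body text but falls inside a single word in a large heading.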
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
  decision_process_table = pd.DataFrame()
@@ -2098,7 +2175,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
  return decision_process_table

  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
- pikepdf_redaction_annotations_on_page = []
  for analysed_bounding_box in analysed_bounding_boxes:

  bounding_box = analysed_bounding_box["boundingBox"]
@@ -2131,27 +2208,27 @@ def redact_text_pdf(
  page_max: int = 999, # Maximum page number to end redaction
  current_loop_page: int = 0, # Current page being processed in the loop
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
- annotations_all_pages: List[dict] = [], # List of annotations across all pages
- all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
- pymupdf_doc: List = [], # List of PyMuPDF documents
- all_page_line_level_ocr_results_with_words: List = [],
- pii_identification_method: str = "Local",
  comprehend_query_number:int = 0,
  comprehend_client="",
- custom_recogniser_word_list:List[str]=[],
- redact_whole_page_list:List[str]=[],
  max_fuzzy_spelling_mistakes_num:int=1,
  match_fuzzy_whole_phrase_bool:bool=True,
  page_sizes_df:pd.DataFrame=pd.DataFrame(),
- original_cropboxes:List[dict]=[],
  text_extraction_only:bool=False,
  output_folder:str=OUTPUT_FOLDER,
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
- max_time: int = int(MAX_TIME_VALUE),
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
- ):
-
  '''
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
@@ -2177,16 +2254,18 @@ def redact_text_pdf(
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
- - page_sizes_df (pd.DataFrame, optional): A pandas dataframe containing page size information.
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
  - output_folder (str, optional): The output folder for the function
  - page_break_val: Value for page break
- - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
  - progress: Progress tracking object
  '''

- tic = time.perf_counter()

  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
  all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
@@ -2198,6 +2277,17 @@ def redact_text_pdf(
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
  out_message = "Connection to AWS Comprehend service not found."
  raise Exception(out_message)

  # Update custom word list analyser object with any new words that have been added to the custom deny list
  if custom_recogniser_word_list:
@@ -2207,20 +2297,18 @@ def redact_text_pdf(

  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
- nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

  # Open with Pikepdf to get text lines
  pikepdf_pdf = Pdf.open(file_path)
  number_of_pages = len(pikepdf_pdf.pages)

- file_name = get_file_name_without_type(file_path)

- if not all_page_line_level_ocr_results_with_words:
- all_page_line_level_ocr_results_with_words = []
-
  # Check that page_min and page_max are within expected ranges
- if page_max > number_of_pages or page_max == 0:
- page_max = number_of_pages

  if page_min <= 0: page_min = 0
  else: page_min = page_min - 1
@@ -2250,28 +2338,33 @@ def redact_text_pdf(
  # Go page by page
  for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):

- all_page_line_text_extraction_characters = []
- all_page_line_level_text_extraction_results_list = []
- page_analyser_results = []
- page_redaction_bounding_boxes = []

- characters = []
- pikepdf_redaction_annotations_on_page = []
  page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
- page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])

- text_line_no = 0
  for n, text_container in enumerate(page_layout):
- characters = []

  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
  characters = get_text_container_characters(text_container)
- text_line_no += 1

  # Create dataframe for all the text on the page
- line_level_text_results_list, line_characters, = create_line_level_ocr_results_from_characters(characters)

- line_level_ocr_results_with_words = generate_word_level_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)

  ### Create page_text_ocr_outputs (OCR format outputs)
  if line_level_text_results_list:
@@ -2282,14 +2375,19 @@ def redact_text_pdf(
  'left': result.left,
  'top': result.top,
  'width': result.width,
- 'height': result.height
  } for result in line_level_text_results_list])

- page_text_ocr_outputs = pd.concat([page_text_ocr_outputs, line_level_text_results_df])

  all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
  all_page_line_text_extraction_characters.extend(line_characters)
- all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)

  ### REDACTION
  if pii_identification_method != NO_REDACTION_PII_OPTION:
@@ -2315,7 +2413,7 @@ def redact_text_pdf(
  # Annotate redactions on page
  pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)

- else: pikepdf_redaction_annotations_on_page = []

  # Make pymupdf page redactions
  if redact_whole_page_list:
@@ -2339,8 +2437,8 @@ def redact_text_pdf(

  # Join extracted text outputs for all lines together
  if not page_text_ocr_outputs.empty:
- #page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
- page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
  all_line_level_ocr_results_list.append(page_text_ocr_outputs)

  toc = time.perf_counter()
2366
  # Write logs
2367
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2368
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2369
-
 
2370
 
2371
  current_loop_page += 1
2372
 
@@ -2395,19 +2494,14 @@ def redact_text_pdf(
2395
 
2396
  # Write all page outputs
2397
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2398
-
2399
- #print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
2400
-
2401
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2402
-
2403
- #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2404
 
2405
  # Convert decision table to relative coordinates
2406
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
2407
 
2408
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
2409
- all_pages_decision_process_table['ymin'] = 1 - all_pages_decision_process_table['ymin']
2410
- all_pages_decision_process_table['ymax'] = 1 - all_pages_decision_process_table['ymax']
2411
 
2412
  # Convert decision table to relative coordinates
2413
  all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
@@ -2416,13 +2510,9 @@ def redact_text_pdf(
2416
 
2417
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
2418
  if not all_line_level_ocr_results_df.empty:
2419
- all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2420
- all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
2421
-
2422
- #all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
2423
 
2424
- #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
2425
- #with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
2426
- # json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
2427
 
2428
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
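The blocks above normalise absolute page coordinates to relative (0-1) values via `divide_coordinates_by_page_sizes` and then flip `top`/`ymin` because PDF pages use a bottom-left origin while the image annotator downstream expects top-left. A hypothetical minimal version of that two-step conversion (the function and column names here are illustrative stand-ins, not the app's API):

```python
import pandas as pd

def to_relative_coords(df: pd.DataFrame, page_width: float, page_height: float) -> pd.DataFrame:
    """Divide absolute coordinates by the page size, then flip the y-axis so
    that 0 is the top of the page (image convention) rather than the bottom
    (PDF convention). Sketch of the divide + 1-y reversal in the diff above."""
    out = df.copy()
    out["left"] = out["left"] / page_width
    out["width"] = out["width"] / page_width
    out["top"] = out["top"] / page_height
    out["height"] = out["height"] / page_height
    # PDF origin is bottom-left; image annotators expect top-left
    out["top"] = (1 - out["top"].astype(float)).round(6)
    return out
```

Keeping coordinates relative means review files stay valid even if the page is later rendered at a different DPI.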
 
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
  from pikepdf import Pdf, Dictionary, Name
  from pymupdf import Rect, Page, Document
+ from presidio_analyzer import AnalyzerEngine
  import gradio as gr
  from gradio import Progress
  from collections import defaultdict # For efficient grouping

+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION, DEFAULT_LANGUAGE, textract_language_choices, aws_comprehend_language_choices
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
+ from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression, word_level_ocr_output_to_dataframe
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer, load_spacy_model, download_tesseract_lang_pack, create_nlp_analyser
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
  return sum_of_numbers

+ def reverse_y_coords(df:pd.DataFrame, column:str):
+ df[column] = 1 - df[column].astype(float)
+
+ df[column] = df[column].round(6)
+
+ return df[column]
+
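A quick usage sketch of the new `reverse_y_coords` helper (note that it mutates the passed DataFrame column in place as well as returning it, so callers should not assume the input is untouched):

```python
import pandas as pd

def reverse_y_coords(df: pd.DataFrame, column: str):
    # Equivalent to the helper added in this commit: flip a relative
    # y-coordinate column (0 = bottom of page, PDF convention) into image
    # convention (0 = top), rounded to 6 decimal places.
    df[column] = 1 - df[column].astype(float)
    df[column] = df[column].round(6)
    return df[column]

df = pd.DataFrame({"top": [0.25, 0.9]})
flipped = reverse_y_coords(df, "top")
```

The rounding keeps review CSVs stable across runs by suppressing floating-point noise in the flipped values.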
+ def merge_page_results(data:list):
  merged = {}

  for item in data:

  return list(merged.values())
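The body of `merge_page_results` is collapsed in this view. A plausible sketch of what merging per-page OCR result dictionaries could look like, keyed by page number (an assumption for illustration, not the commit's exact logic):

```python
def merge_page_results(data: list) -> list:
    """Combine a list of {'page': ..., 'results': {...}} dicts so that all
    entries for the same page are merged into one dict. Sketch only: the
    elided commit code may differ in detail."""
    merged = {}
    for item in data:
        page = item.get("page")
        if page not in merged:
            merged[page] = {"page": page, "results": {}}
        # Later entries for the same page extend (and can overwrite) earlier ones
        merged[page]["results"].update(item.get("results", {}))
    return list(merged.values())
```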
  def choose_and_run_redactor(file_paths:List[str],
  prepared_pdf_file_paths:List[str],
+ pdf_image_file_paths:List[str],
  chosen_redact_entities:List[str],
  chosen_redact_comprehend_entities:List[str],
  text_extraction_method:str,
+ in_allow_list:List[List[str]]=list(),
+ custom_recogniser_word_list:List[str]=list(),
+ redact_whole_page_list:List[str]=list(),
  latest_file_completed:int=0,
+ combined_out_message:List=list(),
+ out_file_paths:List=list(),
+ log_files_output_paths:List=list(),
  first_loop_state:bool=False,
  page_min:int=0,
  page_max:int=999,
  estimated_time_taken_state:float=0.0,
+ handwrite_signature_checkbox:List[str]=list(["Extract handwriting", "Extract signatures"]),
  all_request_metadata_str:str = "",
+ annotations_all_pages:List[dict]=list(),
+ all_page_line_level_ocr_results_df:pd.DataFrame=None,
+ all_pages_decision_process_table:pd.DataFrame=None,
+ pymupdf_doc=list(),
  current_loop_page:int=0,
  page_break_return:bool=False,
+ pii_identification_method:str="Local",
  comprehend_query_number:int=0,
  max_fuzzy_spelling_mistakes_num:int=1,
  match_fuzzy_whole_phrase_bool:bool=True,
  aws_access_key_textbox:str='',
  aws_secret_key_textbox:str='',
  annotate_max_pages:int=1,
+ review_file_state:pd.DataFrame=list(),
  output_folder:str=OUTPUT_FOLDER,
+ document_cropboxes:List=list(),
+ page_sizes:List[dict]=list(),
  textract_output_found:bool=False,
  text_extraction_only:bool=False,
+ duplication_file_path_outputs:list=list(),
  review_file_path:str="",
  input_folder:str=INPUT_FOLDER,
  total_textract_query_number:int=0,
  ocr_file_path:str="",
+ all_page_line_level_ocr_results:list[dict] = list(),
+ all_page_line_level_ocr_results_with_words:list[dict] = list(),
+ all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
+ chosen_local_model:str="tesseract",
+ language:str=DEFAULT_LANGUAGE,
+ prepare_images:bool=True,
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
  progress=gr.Progress(track_tqdm=True)):
  '''
  - file_paths (List[str]): A list of paths to the files to be redacted.
  - prepared_pdf_file_paths (List[str]): A list of paths to the PDF files prepared for redaction.
  - pdf_image_file_paths (List[str]): A list of paths to the PDF files converted to images for redaction.
+
  - chosen_redact_entities (List[str]): A list of entity types to redact from the files using the local model (spacy) with Microsoft Presidio.
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service.
  - text_extraction_method (str): The method to use to extract text from documents.

  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
  - all_request_metadata_str (str, optional): A string containing all request metadata. Defaults to an empty string.
  - annotations_all_pages (List[dict], optional): A list of dictionaries containing all image annotations. Defaults to an empty list.
+ - all_page_line_level_ocr_results_df (pd.DataFrame, optional): A DataFrame containing all line-level OCR results. Defaults to an empty DataFrame.
  - all_pages_decision_process_table (pd.DataFrame, optional): A DataFrame containing all decision process tables. Defaults to an empty DataFrame.
  - pymupdf_doc (optional): A list containing the PDF document object. Defaults to an empty list.
  - current_loop_page (int, optional): The current page being processed in the loop. Defaults to 0.

  - ocr_file_path (str, optional): The latest ocr file path created by the app.
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
+ - all_page_line_level_ocr_results_with_words_df (pd.DataFrame, optional): All word level text on the page with bounding boxes as a dataframe.
+ - chosen_local_model (str): Which local model is being used for OCR on images - "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
+ - language (str, optional): The language of the text in the files, also used for AWS Comprehend calls. Defaults to English.
+ - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
  - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
  out_message = ""
  pdf_file_name_with_ext = ""
+ pdf_file_name_without_ext = ""
+ page_break_return = False
+ blank_request_metadata = list()
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
+ review_out_file_paths = [prepared_pdf_file_paths[0]]
+
+ # Use provided language or default
+ language = language or DEFAULT_LANGUAGE
+
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ if language not in textract_language_choices:
+ out_message = f"Language '{language}' is not supported by AWS Textract. Please select a different language."
+ raise Warning(out_message)
+ elif pii_identification_method == AWS_PII_OPTION:
+ if language not in aws_comprehend_language_choices:
+ out_message = f"Language '{language}' is not supported by AWS Comprehend. Please select a different language."
+ raise Warning(out_message)
+
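The new checks above reject a language that the chosen AWS service cannot handle before any work is done. The same guard pattern in isolation (the language list below is an illustrative subset, not the real `textract_language_choices` / `aws_comprehend_language_choices` from `tools.config`, and this sketch raises `ValueError` where the commit raises `Warning`):

```python
def validate_language(language: str, supported: list, service_name: str) -> str:
    """Return the language if the service supports it, otherwise raise with a
    user-facing message, mirroring the guard added in this commit."""
    if language not in supported:
        raise ValueError(
            f"Language '{language}' is not supported by {service_name}. "
            "Please select a different language."
        )
    return language

# Illustrative subset only; the real lists live in tools.config
TEXTRACT_LANGS = ["en", "es", "fr", "de", "it", "pt"]
```

Failing fast here avoids burning paid Textract or Comprehend calls on a request that would return nothing useful.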
+ if all_page_line_level_ocr_results_with_words_df is None:
+ all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
+
+ # Create copies of out_file_path objects to avoid overwriting each other on append actions
+ out_file_paths = out_file_paths.copy()
+ log_files_output_paths = log_files_output_paths.copy()
+
  # Ensure all_pages_decision_process_table is in correct format for downstream processes
  if isinstance(all_pages_decision_process_table,list):
  if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])

  #print("First_loop_state is True")
  latest_file_completed = 0
  current_loop_page = 0
+ out_file_paths = list()
+ log_files_output_paths = list()
  estimate_total_processing_time = 0
  estimated_time_taken_state = 0
  comprehend_query_number = 0

  elif (first_loop_state == False) & (current_loop_page == 999):
  current_loop_page = 0
  total_textract_query_number = 0
+ comprehend_query_number = 0

  # Choose the correct file to prepare
  if isinstance(file_paths, str): file_paths_list = [os.path.abspath(file_paths)]

  # Check if any files were found and assign to file_paths_list
  file_paths_list = filtered_files if filtered_files else []

+ print("Latest file completed:", latest_file_completed)
+
  # If latest_file_completed is used, get the specific file
  if not isinstance(file_paths, (str, dict)): file_paths_loop = [file_paths_list[int(latest_file_completed)]] if len(file_paths_list) > latest_file_completed else []
  else: file_paths_loop = file_paths_list

  # Only send across review file if redaction has been done
  if pii_identification_method != NO_REDACTION_PII_OPTION:

+ if len(review_out_file_paths) == 1:
  if review_file_path: review_out_file_paths.append(review_file_path)

  if not isinstance(pymupdf_doc, list):

  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
  print("Estimated total processing time:", str(estimate_total_processing_time))

+ page_break_return = True
+
+ return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state

  #if first_loop_state == False:
  # Prepare documents and images as required if they don't already exist

  # Call prepare_image_or_pdf only if needed
  if prepare_images_flag is not None:
+ out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox, all_page_line_level_ocr_results_with_words_df = prepare_image_or_pdf(
+ file_paths_loop, text_extraction_method, all_page_line_level_ocr_results_df, all_page_line_level_ocr_results_with_words_df, 0, out_message, True,
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
+ output_folder=output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, pymupdf_doc=pymupdf_doc, input_folder=input_folder
  )

  page_sizes_df = pd.DataFrame(page_sizes)

  page_sizes = page_sizes_df.to_dict(orient="records")

+ number_of_pages = pymupdf_doc.page_count

  # If we have reached the last page, return message and outputs
  if current_loop_page >= number_of_pages:

  # Only send across review file if redaction has been done
  if pii_identification_method != NO_REDACTION_PII_OPTION:
  # If only pdf currently in review outputs, add on the latest review file
+ if len(review_out_file_paths) == 1:
  if review_file_path: review_out_file_paths.append(review_file_path)

+ page_break_return = False
+
+ return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state

  # Load/create allow list
  # If string, assume file path

  if not in_allow_list.empty:
  in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
389
  else:
390
+ in_allow_list_flat = list()
391
 
392
  # If string, assume file path
393
  if isinstance(custom_recogniser_word_list, str):
 
396
  if not custom_recogniser_word_list.empty:
397
  custom_recogniser_word_list_flat = custom_recogniser_word_list.iloc[:, 0].tolist()
398
  else:
399
+ custom_recogniser_word_list_flat = list()
400
 
401
  # Sort the strings in order from the longest string to the shortest
402
  custom_recogniser_word_list_flat = sorted(custom_recogniser_word_list_flat, key=len, reverse=True)
 
412
  print("Could not convert whole page redaction data to number list due to:", e)
413
  redact_whole_page_list_flat = redact_whole_page_list.iloc[:,0].tolist()
414
  else:
415
+ redact_whole_page_list_flat = list()
416
 
417
 
418
 
 
465
  else:
466
  textract_client = ""
467
 
468
+ ### Language check - check if selected language packs exist
469
+ try:
470
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION and chosen_local_model == "tesseract":
471
+ if language != "en":
472
+ progress(0.1, desc=f"Downloading Tesseract language pack for {language}")
473
+ download_tesseract_lang_pack(language)
474
+
475
+ if language != "en":
476
+ progress(0.1, desc=f"Loading SpaCy model for {language}")
477
+ load_spacy_model(language)
478
+
479
+ except Exception as e:
480
+ print(f"Error downloading language packs for {language}: {e}")
481
+ raise Exception(f"Error downloading language packs for {language}: {e}")
482
+
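The language check above runs different setup steps depending on the extraction method, the local OCR model, and the language. A minimal sketch of that decision logic, assuming a stand-in value for the app's `TESSERACT_TEXT_EXTRACT_OPTION` constant (the planner function itself is hypothetical, added only to illustrate the branching):

```python
# Stand-in for the app's extraction-option constant (an assumption for this sketch)
TESSERACT_TEXT_EXTRACT_OPTION = "Local OCR"

def plan_language_setup(text_extraction_method, chosen_local_model, language):
    """Return the ordered setup steps the language check above would run."""
    steps = []
    # Tesseract language data is only fetched on the local Tesseract OCR path
    if (text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
            and chosen_local_model == "tesseract" and language != "en"):
        steps.append("download_tesseract_lang_pack")
    # A non-English spaCy model is loaded whatever the extraction method
    if language != "en":
        steps.append("load_spacy_model")
    return steps
```

English skips both downloads entirely, which keeps the common case fast.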
  # Check if output_folder exists, create it if it doesn't
  if not os.path.exists(output_folder): os.makedirs(output_folder)

  progress(0.5, desc="Extracting text and redacting document")

  all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
+ all_page_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])

  # Run through the file loop, redacting one file at a time
  for file in file_paths_loop:

  raise Exception(out_message)

  # Output file path names
+ orig_pdf_file_path = output_folder + pdf_file_name_without_ext
  review_file_path = orig_pdf_file_path + '_review_file.csv'

  # Load in all_ocr_results_with_words if it exists as a file path and hasn't been loaded already
+ #file_name = get_file_name_without_type(file_path)

+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "local_text"
+ elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "local_ocr"
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "textract"
+ all_page_line_level_ocr_results_with_words_json_file_path = output_folder + pdf_file_name_without_ext + "_ocr_results_with_words_" + file_ending + ".json"
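The chosen extraction method determines the suffix used for the per-document OCR output files. A sketch of that mapping, with stand-in values for the option constants (the real values live in the app's config):

```python
# Stand-in values for the app's extraction-option constants (assumptions)
SELECTABLE_TEXT_EXTRACT_OPTION = "Local text"
TESSERACT_TEXT_EXTRACT_OPTION = "Local OCR"
TEXTRACT_TEXT_EXTRACT_OPTION = "AWS Textract"

def ocr_results_json_path(output_folder, pdf_name, text_extraction_method):
    """Build the per-method OCR-with-words JSON path, mirroring the branch above."""
    suffixes = {
        SELECTABLE_TEXT_EXTRACT_OPTION: "local_text",
        TESSERACT_TEXT_EXTRACT_OPTION: "local_ocr",
        TEXTRACT_TEXT_EXTRACT_OPTION: "textract",
    }
    file_ending = suffixes[text_extraction_method]
    return output_folder + pdf_name + "_ocr_results_with_words_" + file_ending + ".json"
```

Keeping the suffix method-specific means results from one extraction method never overwrite another's cache.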
  if not all_page_line_level_ocr_results_with_words:
  if local_ocr_output_found_checkbox == True and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):

  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")

+ pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words = redact_image_pdf(file_path,
  pdf_image_file_paths,
  language,
  chosen_redact_entities,

  current_loop_page,
  page_break_return,
  annotations_all_pages,
+ all_page_line_level_ocr_results_df,
  all_pages_decision_process_table,
  pymupdf_doc,
  pii_identification_method,

  text_extraction_only,
  all_page_line_level_ocr_results,
  all_page_line_level_ocr_results_with_words,
+ chosen_local_model,
  log_files_output_paths=log_files_output_paths,
+ nlp_analyser=nlp_analyser,
  output_folder=output_folder)

+ # Create a copy of out_file_paths to break potential links with log_files_output_paths
+ out_file_paths = out_file_paths.copy()
+
+ # Save Textract request metadata (if it exists)
  if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
+ all_textract_request_metadata.extend(new_textract_request_metadata)

  elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:

  # Analyse text-based pdf
  print('Redacting file as text-based PDF')

+ pymupdf_doc, all_pages_decision_process_table, all_page_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words = redact_text_pdf(
  file_path,
  language,
  chosen_redact_entities,

  current_loop_page,
  page_break_return,
  annotations_all_pages,
+ all_page_line_level_ocr_results_df,
  all_pages_decision_process_table,
  pymupdf_doc,
  all_page_line_level_ocr_results_with_words,

  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
  print("Saving redacted PDF file:", out_redacted_pdf_file_path)
  save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
+
  out_file_paths.append(out_redacted_pdf_file_path)

+ if not all_page_line_level_ocr_results_df.empty:
+ all_page_line_level_ocr_results_df = all_page_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height", "line"]]
+ else: all_page_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])

+ #ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
+ ocr_file_path = (output_folder + pdf_file_name_without_ext + "_ocr_output_" + file_ending + ".csv")
+ all_page_line_level_ocr_results_df.sort_values(["page", "line"], inplace=True)
+ all_page_line_level_ocr_results_df.to_csv(ocr_file_path, index=None, encoding="utf-8-sig")

  out_file_paths.append(ocr_file_path)
  duplication_file_path_outputs.append(ocr_file_path)

  if all_page_line_level_ocr_results_with_words:
  all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
+
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
  json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
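The `separators=(",", ":")` argument above writes the OCR JSON without the default padding after separators, which matters for large word-level result files. A quick illustration:

```python
import json

payload = {"pages": [{"page": 1, "words": ["hello", "world"]}]}

# Default formatting inserts a space after each ',' and ':'
default_form = json.dumps(payload)
# separators=(",", ":") drops that padding, giving a smaller file
compact_form = json.dumps(payload, separators=(",", ":"))
```

Both forms parse back to the identical object; only the byte size differs.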
  all_page_line_level_ocr_results_with_words_df = word_level_ocr_output_to_dataframe(all_page_line_level_ocr_results_with_words)

  all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="word_x0", xmax="word_x1", ymin="word_y0", ymax="word_y1")
+ # all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")

+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
+ # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
+ if not all_page_line_level_ocr_results_with_words_df.empty:

+ # all_page_line_level_ocr_results_with_words_df['line_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y0')
+ # all_page_line_level_ocr_results_with_words_df['line_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'line_y1')
+ all_page_line_level_ocr_results_with_words_df['word_y0'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y0')
+ all_page_line_level_ocr_results_with_words_df['word_y1'] = reverse_y_coords(all_page_line_level_ocr_results_with_words_df, 'word_y1')

+ all_page_line_level_ocr_results_with_words_df['line_text'] = ""
+ all_page_line_level_ocr_results_with_words_df['line_x0'] = ""
+ all_page_line_level_ocr_results_with_words_df['line_x1'] = ""
+ all_page_line_level_ocr_results_with_words_df['line_y0'] = ""
+ all_page_line_level_ocr_results_with_words_df['line_y1'] = ""
+
+ all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
+ all_page_line_level_ocr_results_with_words_df_file_path = all_page_line_level_ocr_results_with_words_json_file_path.replace(".json", ".csv")
+ all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index=None, encoding="utf-8-sig")
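The y-coordinate reversal above compensates for PDF coordinates growing upwards from the bottom-left corner, while the image annotator measures from the top-left. Assuming the coordinates are normalised to 0-1 at this point (an assumption for this sketch; the real `reverse_y_coords` helper lives elsewhere in the repo and operates on DataFrame columns), the flip is simply `1 - y`:

```python
def reverse_y_coords_sketch(ys):
    """Flip normalised (0-1) y coordinates from a bottom-left to a top-left origin."""
    # A word near the bottom of a PDF page (small y) is near the
    # bottom of the rendered image (large y), and vice versa.
    return [1 - y for y in ys]
```

Applied to both `word_y0` and `word_y1`, this keeps box heights unchanged while relocating them in the flipped frame.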
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)

  if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)

+ if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
+ out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
+
  # Convert the gradio annotation boxes to relative coordinates

  progress(0.93, "Creating review file output")
  page_sizes = page_sizes_df.to_dict(orient="records")
  all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)

  # Don't need page sizes in outputs
  review_file_state.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")

+ review_file_state.to_csv(review_file_path, index=None, encoding="utf-8-sig")

+ if pii_identification_method != NO_REDACTION_PII_OPTION: out_file_paths.append(review_file_path)

  # Make a combined message for the file
  if isinstance(out_message, list) and out_message:

  time_taken = toc - tic
  estimated_time_taken_state += time_taken

+ # If Textract requests were made, write them to a logging file and record the number of requests
  if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
  all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()

  all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"

+ with open(all_textract_request_metadata_file_path, "w") as f: f.write(all_request_metadata_str)

  # Add the request metadata to the log outputs if not there already
+ if all_textract_request_metadata_file_path not in log_files_output_paths: log_files_output_paths.append(all_textract_request_metadata_file_path)

  new_textract_query_numbers = len(all_textract_request_metadata)
  total_textract_query_number += new_textract_query_numbers

  if total_textract_query_number > number_of_pages: total_textract_query_number = number_of_pages

+ page_break_return = True
+
+ return combined_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, current_loop_page, page_break_return, all_page_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df, review_file_state

  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
  '''

  else:
  page.set_cropbox(original_cropbox)

+ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]= list(), page_sizes_df:pd.DataFrame=pd.DataFrame()):

  rect_height = page.rect.height
  rect_width = page.rect.width

  image_dimensions = {}

  out_annotation_boxes = {}
+ all_image_annotation_boxes = list()

  if isinstance(image, Image.Image):
  image_path = move_page_info(str(page))

  # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
  ###

+ def merge_img_bboxes(bboxes: list, combined_results: Dict, page_signature_recogniser_results: list = list(), page_handwriting_recogniser_results: list = list(), handwrite_signature_checkbox: List[str] = ["Extract handwriting", "Extract signatures"], horizontal_threshold: int = 50, vertical_threshold: int = 12):
+ """
+ Merges bounding boxes for image annotations based on the provided results from signature and handwriting recognisers.
+
+ Args:
+ bboxes (list): A list of bounding boxes to be merged.
+ combined_results (Dict): A dictionary containing combined results with line text and their corresponding bounding boxes.
+ page_signature_recogniser_results (list, optional): A list of results from the signature recogniser. Defaults to an empty list.
+ page_handwriting_recogniser_results (list, optional): A list of results from the handwriting recogniser. Defaults to an empty list.
+ handwrite_signature_checkbox (List[str], optional): A list of options indicating whether to extract handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
+ horizontal_threshold (int, optional): The threshold for merging bounding boxes horizontally. Defaults to 50.
+ vertical_threshold (int, optional): The threshold for merging bounding boxes vertically. Defaults to 12.
+
+ Returns:
+ list: The merged bounding boxes.
+ """

+ all_bboxes = list()
+ merged_bboxes = list()
  grouped_bboxes = defaultdict(list)

  # Deep copy original bounding boxes to retain them

  merged_bboxes.extend(copy.deepcopy(page_signature_recogniser_results))

  # Reconstruct bounding boxes for substrings of interest
+ reconstructed_bboxes = list()
  for bbox in bboxes:
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
  for line_text, line_info in combined_results.items():

  start_char = line_text.index(bbox.text)
  end_char = start_char + len(bbox.text)

+ relevant_words = list()
  current_char = 0
  for word in line_info['words']:
  word_end = current_char + len(word['text'])

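The thresholds in `merge_img_bboxes` control when neighbouring detections collapse into a single redaction box. A minimal sketch of the idea, simplified to same-line horizontal merging of `[x0, y0, x1, y1]` boxes (the real function also handles signature/handwriting results and substring reconstruction):

```python
def merge_boxes_on_line(boxes, horizontal_threshold=50):
    """Merge [x0, y0, x1, y1] boxes on one text line when the horizontal gap is small."""
    merged = []
    for box in sorted(boxes, key=lambda b: b[0]):
        if merged and box[0] - merged[-1][2] <= horizontal_threshold:
            last = merged[-1]
            # Extend the previous box to cover this one as well
            merged[-1] = [last[0], min(last[1], box[1]), max(last[2], box[2]), max(last[3], box[3])]
        else:
            merged.append(list(box))
    return merged
```

Merging adjacent hits this way avoids a row of thin redaction slivers over what a reader would perceive as one redacted phrase.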
  page_max:int=999,
  text_extraction_method:str=TESSERACT_TEXT_EXTRACT_OPTION,
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
+ textract_request_metadata:list=list(),
  current_loop_page:int=0,
  page_break_return:bool=False,
+ annotations_all_pages:List=list(),
+ all_page_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"]),
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
+ pymupdf_doc:Document = list(),
  pii_identification_method:str="Local",
  comprehend_query_number:int=0,
  comprehend_client:str="",
  textract_client:str="",
+ custom_recogniser_word_list:List[str]=list(),
+ redact_whole_page_list:List[str]=list(),
  max_fuzzy_spelling_mistakes_num:int=1,
  match_fuzzy_whole_phrase_bool:bool=True,
  page_sizes_df:pd.DataFrame=pd.DataFrame(),
  text_extraction_only:bool=False,
+ all_page_line_level_ocr_results = list(),
+ all_page_line_level_ocr_results_with_words = list(),
+ chosen_local_model:str="tesseract",
  page_break_val:int=int(PAGE_BREAK_VALUE),
+ log_files_output_paths:List=list(),
  max_time:int=int(MAX_TIME_VALUE),
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
+ output_folder:str=OUTPUT_FOLDER,
  progress=Progress(track_tqdm=True)):

  '''
+ This function redacts sensitive information from a PDF document. It takes the following parameters in order:

  - file_path (str): The path to the PDF file to be redacted.
  - pdf_image_file_paths (List[str]): A list of paths to the PDF file pages converted to images.

  - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty list.
+ - current_loop_page (int, optional): The current page being processed. Defaults to 0.
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
  - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
+ - all_page_line_level_ocr_results_df (pd.DataFrame, optional): All line-level OCR results for the document as a Pandas dataframe.
  - all_pages_decision_process_table (pd.DataFrame, optional): All redaction decisions for the document as a Pandas dataframe.
  - pymupdf_doc (Document, optional): The document as a PyMuPDF object.
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).

  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
  - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
+ - all_page_line_level_ocr_results (optional): List of all page line-level OCR results.
+ - all_page_line_level_ocr_results_with_words (optional): List of all page line-level OCR results with words.
+ - chosen_local_model (str, optional): The local model chosen for OCR. Defaults to "tesseract"; other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
+ - page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
+ - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
+ - output_folder (str, optional): The folder for file outputs.
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

  The function returns a redacted PDF document along with processing output objects.
  file_name = get_file_name_without_type(file_path)
  comprehend_query_number_new = 0

+ # Try updating the supported languages for the spacy analyser
+ try:
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
+ # Check list of nlp_analyser recognisers and languages
+ if language != "en":
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
+
+ except Exception as e:
+ print(f"Error creating nlp_analyser for {language}: {e}")
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
+
  # Update custom word list analyser object with any new words that have been added to the custom deny list
  if custom_recogniser_word_list:
  nlp_analyser.registry.remove_recognizer("CUSTOM")

  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
  nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

+ # Only load in PaddleOCR models if not running Textract
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine="tesseract", language=language)
+ else:
+ image_analyser = CustomImageAnalyzerEngine(analyzer_engine=nlp_analyser, ocr_engine=chosen_local_model, language=language)

  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
  out_message = "Connection to AWS Comprehend service unsuccessful."

  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
  print(out_message_warning)
+ #raise Exception(out_message)

  number_of_pages = pymupdf_doc.page_count
  print("Number of pages:", str(number_of_pages))
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
  original_textract_data = textract_data.copy()

+ #print("Successfully loaded in Textract analysis results from file")

  # If running local OCR option, check if file already exists. If it does, load in existing data
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:

  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()

+ #print("Loaded in local OCR analysis results from file")

  ###
  if current_loop_page == 0: page_loop_start = 0

  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")

  # If there's data from a previous run (passed in via the DataFrame parameters), add it
+ all_line_level_ocr_results_list = list()
+ all_pages_decision_process_list = list()

+ if not all_page_line_level_ocr_results_df.empty:
+ all_line_level_ocr_results_list.extend(all_page_line_level_ocr_results_df.to_dict('records'))
  if not all_pages_decision_process_table.empty:
  all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))

  # Go through each page
  for page_no in progress_bar:

+ handwriting_or_signature_boxes = list()
+ page_signature_recogniser_results = list()
+ page_handwriting_recogniser_results = list()
+ page_line_level_ocr_results_with_words = list()
  page_break_return = False
  reported_page_number = str(page_no + 1)

  print("Can't find original cropbox details for page, using current PyMuPDF page cropbox")
  original_cropbox = pymupdf_page.cropbox.irect

  # Step 1: Perform OCR, either with Tesseract or with AWS Textract

  # If using Tesseract
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:

  )

  page_line_level_ocr_results_with_words = matching_page if matching_page else []
+ else: page_line_level_ocr_results_with_words = list()

  if page_line_level_ocr_results_with_words:
  print("Found OCR results for page in existing OCR with words object")

  page_line_level_ocr_results, page_line_level_ocr_results_with_words = combine_ocr_results(page_word_level_ocr_results, page=reported_page_number)

+ if all_page_line_level_ocr_results_with_words is None:
+ all_page_line_level_ocr_results_with_words = list()
+
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)

  # Check if page exists in existing textract data. If not, send to service to analyse
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
+ text_blocks = list()

  if not textract_data:
  try:

  text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract

  # Check if "pages" key exists, if not, initialise it as an empty list
+ if "pages" not in textract_data: textract_data["pages"] = list()

  # Append the new page data
  textract_data["pages"].append(text_blocks)

  except Exception as e:
  out_message = "Textract extraction for page " + reported_page_number + " failed due to: " + str(e)
  print(out_message)
+ text_blocks = list()
  new_textract_request_metadata = "Failed Textract API call"

  # Check if "pages" key exists, if not, initialise it as an empty list
+ if "pages" not in textract_data: textract_data["pages"] = list()

  raise Exception(out_message)
  page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

+ if all_page_line_level_ocr_results_with_words is None:
+ all_page_line_level_ocr_results_with_words = list()
+
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)

  # Convert to DataFrame and add to ongoing logging table

  'left': result.left,
  'top': result.top,
  'width': result.width,
+ 'height': result.height,
+ 'line': result.line
  } for result in page_line_level_ocr_results['results']])

  if not line_level_ocr_results_df.empty: # Ensure there are records to add

  page_line_level_ocr_results_with_words['results'],
  chosen_redact_comprehend_entities = chosen_redact_comprehend_entities,
  pii_identification_method = pii_identification_method,
+ comprehend_client=comprehend_client,
+ custom_entities=chosen_redact_entities,
  language=language,

  allow_list=allow_list,
+ score_threshold=score_threshold,
+ nlp_analyser=nlp_analyser
  )

  comprehend_query_number = comprehend_query_number + comprehend_query_number_new

+ else: page_redaction_bounding_boxes = list()

  # Merge redaction bounding boxes that are close together
  page_merged_redaction_bboxes = merge_img_bboxes(page_redaction_bounding_boxes, page_line_level_ocr_results_with_words['results'], page_signature_recogniser_results, page_handwriting_recogniser_results, handwrite_signature_checkbox)

+ else: page_merged_redaction_bboxes = list()

  # 3. Draw the merged boxes
  ## Apply annotations to pdf with pymupdf

  fill = (0, 0, 0) # Fill colour for redactions
  draw = ImageDraw.Draw(image)

+ all_image_annotations_boxes = list()

  for box in page_merged_redaction_bboxes:

  page_image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}

+ redacted_image = image.copy()

  # Convert decision process to table
  decision_process_table = pd.DataFrame([{

  all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))

  decision_process_table = fill_missing_ids(decision_process_table)

  toc = time.perf_counter()
 
1931
  return characters
1932
  return []
1933
 
 
 
 
 
1934
 
1935
+ def create_line_level_ocr_results_from_characters(char_objects:List, line_number:int) -> Tuple[List[OCRResult], List[List]]:
1936
+ """
1937
+ Create OCRResult objects based on a list of pdfminer LTChar objects.
1938
+ This version is corrected to use the specified OCRResult class definition.
1939
+ """
1940
+ line_level_results_out = list()
1941
+ line_level_characters_out = list()
1942
+ character_objects_out = list()
1943
 
 
1944
  full_text = ""
1945
+ # [x0, y0, x1, y1]
1946
+ overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
 
 
 
 
 
1947
 
1948
  for char in char_objects:
1949
+ character_objects_out.append(char)
 
 
 
 
1950
 
1951
  if isinstance(char, LTAnno):
1952
  added_text = char.get_text()
1953
+ full_text += added_text
 
 
 
 
 
1954
 
 
 
 
 
 
 
1955
  if '\n' in added_text:
1956
+ if full_text.strip():
1957
+ # Create OCRResult for line
1958
+ line_level_results_out.append(OCRResult(
1959
+ text=full_text.strip(),
1960
+ left=round(overall_bbox[0], 2),
1961
+ top=round(overall_bbox[1], 2),
1962
+ width=round(overall_bbox[2] - overall_bbox[0], 2),
1963
+ height=round(overall_bbox[3] - overall_bbox[1], 2),
1964
+ line=line_number
1965
+ ))
1966
+ line_level_characters_out.append(character_objects_out)
1967
 
 
 
 
 
 
 
1968
  # Reset for the next line
1969
+ character_objects_out = list()
1970
  full_text = ""
1971
  overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]
1972
+ line_number += 1
 
 
1973
  continue
1974
 
1975
+ # This part handles LTChar objects
1976
+ added_text = clean_unicode_text(char.get_text())
1977
+ full_text += added_text
1978
1979
  x0, y0, x1, y1 = char.bbox
1980
+ overall_bbox[0] = min(overall_bbox[0], x0)
1981
+ overall_bbox[1] = min(overall_bbox[1], y0)
1982
+ overall_bbox[2] = max(overall_bbox[2], x1)
1983
+ overall_bbox[3] = max(overall_bbox[3], y1)
1984
+
1985
+ # Process the last line
1986
+ if full_text.strip():
1987
1988
+ line_ocr_result = OCRResult(
1989
+ text=full_text.strip(),
1990
+ left=round(overall_bbox[0], 2),
1991
+ top=round(overall_bbox[1], 2),
1992
+ width=round(overall_bbox[2] - overall_bbox[0], 2),
1993
+ height=round(overall_bbox[3] - overall_bbox[1], 2),
1994
+ line=line_number
1995
+ )
1996
+ line_level_results_out.append(line_ocr_result)
1997
+ line_level_characters_out.append(character_objects_out)
 
1998
 
1999
+ return line_level_results_out, line_level_characters_out
2000
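The line-splitting pass above can be illustrated in isolation. `FakeChar` and `FakeAnno` below are illustrative stand-ins for pdfminer's `LTChar`/`LTAnno` (not part of this codebase), and the sketch keeps only the newline-splitting behaviour:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class FakeChar:               # stand-in for pdfminer's LTChar
    text: str
    bbox: tuple               # (x0, y0, x1, y1), bottom-left origin
    def get_text(self) -> str:
        return self.text

@dataclass
class FakeAnno:               # stand-in for pdfminer's LTAnno (no bbox)
    text: str
    def get_text(self) -> str:
        return self.text

def split_into_lines(chars: List) -> List[List]:
    """Group characters into lines, breaking whenever a newline
    annotation is seen (mirroring the LTAnno branch above)."""
    lines, current = [], []
    for ch in chars:
        if isinstance(ch, FakeAnno) and '\n' in ch.get_text():
            if current:
                lines.append(current)
                current = []
            continue
        current.append(ch)
    if current:                # flush the last (unterminated) line
        lines.append(current)
    return lines

chars = [FakeChar('H', (0, 0, 5, 10)), FakeChar('i', (5, 0, 8, 10)),
         FakeAnno('\n'),
         FakeChar('o', (0, -15, 4, -5)), FakeChar('k', (4, -15, 8, -5))]
lines = split_into_lines(chars)
print([''.join(c.get_text() for c in line) for line in lines])  # ['Hi', 'ok']
```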
 
2001
+ def generate_words_for_line(line_chars: List) -> List[Dict[str, Any]]:
2002
  """
2003
+ Generates word-level results for a single, pre-defined line of characters.
2004
 
2005
+ Word breaks are identified by:
2006
+ 1. Treating specific punctuation characters as standalone words.
2007
+ 2. Explicitly using space characters (' ') as a primary word separator.
2008
+ 3. Using a geometric gap between characters as a secondary, heuristic separator.
2009
 
2010
  Args:
2011
+ line_chars: A list of pdfminer.six LTChar/LTAnno objects for one line.
 
2012
 
2013
  Returns:
2014
+ A list of dictionaries, where each dictionary represents an individual word.
2015
  """
2016
+ # We only care about characters with coordinates and text for word building.
2017
+ text_chars = [c for c in line_chars if hasattr(c, 'bbox') and c.get_text()]
2018
 
2019
  if not text_chars:
2020
+ return []
2021
 
2022
+ # Sort characters by horizontal position for correct processing.
2023
+ text_chars.sort(key=lambda c: c.bbox[0])
2024
 
2025
+ # NEW: Define punctuation that should be split into separate words.
2026
+ # The hyphen '-' is intentionally excluded to keep words like 'high-tech' together.
2027
+ PUNCTUATION_TO_SPLIT = {'.', ',', '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'}
2028
 
2029
+ line_words = list()
2030
+ current_word_text = ""
2031
+ current_word_bbox = [float('inf'), float('inf'), -1, -1] # [x0, y0, x1, y1]
2032
  prev_char = None
2033
 
2034
  def finalize_word():
2035
  nonlocal current_word_text, current_word_bbox
2036
+ # Only add the word if it contains non-space text
2037
+ if current_word_text.strip():
2038
+ # Convert bbox from pdfminer's bottom-left-origin [x0, y0, x1, y1] to the top-left-origin [x0, y1, x1, y0] format used downstream
2039
+ final_bbox = [
2040
+ round(current_word_bbox[0], 2),
2041
+ round(current_word_bbox[3], 2), # Note: using y1 from pdfminer bbox
2042
+ round(current_word_bbox[2], 2),
2043
+ round(current_word_bbox[1], 2), # Note: using y0 from pdfminer bbox
2044
+ ]
2045
  line_words.append({
2046
+ "text": current_word_text.strip(),
2047
+ "bounding_box": final_bbox
2048
  })
2049
+ # Reset for the next word
2050
  current_word_text = ""
2051
  current_word_bbox = [float('inf'), float('inf'), -1, -1]
2052
 
2053
  for char in text_chars:
2054
  char_text = clean_unicode_text(char.get_text())
2055
 
2056
+ # 1. Check for splitting punctuation first.
2057
+ if char_text in PUNCTUATION_TO_SPLIT:
2058
+ # Finalize any word that came immediately before the punctuation.
2059
  finalize_word()
2060
+
2061
+ # Treat the punctuation itself as a separate word.
2062
+ px0, py0, px1, py1 = char.bbox
2063
+ punc_bbox = [round(px0, 2), round(py1, 2), round(px1, 2), round(py0, 2)]
2064
+ line_words.append({
2065
+ "text": char_text,
2066
+ "bounding_box": punc_bbox
2067
+ })
2068
+
2069
  prev_char = char
2070
+ continue # Skip to the next character
2071
 
2072
+ # 2. Primary Signal: Is the character a space?
2073
+ if char_text.isspace():
2074
+ finalize_word() # End the preceding word
2075
+ prev_char = char
2076
+ continue # Skip to the next character, do not add the space to any word
2077
+
2078
+ # 3. Secondary Signal: Is there a large geometric gap?
2079
+ if prev_char:
2080
+ # A gap is considered a word break if it's larger than a fraction of the font size.
2081
+ space_threshold = prev_char.size * 0.25 # 25% of the char size
2082
+ min_gap = 1.0 # Or at least 1.0 unit
2083
+ gap = char.bbox[0] - prev_char.bbox[2] # gap = current_char.x0 - prev_char.x1
2084
 
2085
+ if gap > max(space_threshold, min_gap):
2086
+ finalize_word() # Found a gap, so end the previous word.
 
 
 
2087
 
2088
+ # Append the character's text and update the bounding box for the current word
2089
+ current_word_text += char_text
2090
+
2091
+ x0, y0, x1, y1 = char.bbox
2092
+ current_word_bbox[0] = min(current_word_bbox[0], x0)
2093
+ current_word_bbox[1] = min(current_word_bbox[1], y0) # pdfminer y0 is bottom
2094
+ current_word_bbox[2] = max(current_word_bbox[2], x1)
2095
+ current_word_bbox[3] = max(current_word_bbox[3], y1) # pdfminer y1 is top
2096
 
2097
  prev_char = char
2098
 
2099
+ # After the loop, finalize the last word that was being built.
2100
+ finalize_word()
2101
+
2102
+ return line_words
2103
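The geometric word-break heuristic (signal 3 above) can be sketched on its own. The tuples below are a hypothetical simplification of `LTChar` objects, carrying only the fields the heuristic reads (`text`, `x0`, `x1`, `size`):

```python
def split_words_by_gap(chars, gap_fraction=0.25, min_gap=1.0):
    """chars: list of (text, x0, x1, size) tuples, sorted by x0.
    Start a new word when the horizontal gap to the previous character
    exceeds max(gap_fraction * previous_char_size, min_gap)."""
    words, current, prev = [], "", None
    for text, x0, x1, size in chars:
        if prev is not None:
            gap = x0 - prev[2]          # current x0 minus previous x1
            if gap > max(gap_fraction * prev[3], min_gap):
                words.append(current)   # gap found: end the previous word
                current = ""
        current += text
        prev = (text, x0, x1, size)
    if current:
        words.append(current)
    return words

chars = [('H', 0.0, 5.0, 10.0), ('i', 5.2, 8.0, 10.0),      # 0.2 gap: same word
         ('y', 12.0, 17.0, 10.0), ('o', 17.1, 22.0, 10.0)]  # 4.0 gap: new word
print(split_words_by_gap(chars))  # ['Hi', 'yo']
```

With `size=10`, the split threshold is `max(2.5, 1.0)`, so the 4.0-unit gap before `'y'` breaks the word while the 0.2-unit gaps do not.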
 
2104
+ def process_page_to_structured_ocr(
2105
+ all_char_objects: List,
2106
+ page_number: int,
2107
+ text_line_number: int, # This will now be treated as the STARTING line number
2108
+ ) -> Tuple[Dict[str, Any], List[OCRResult], List[List]]:
2109
+ """
2110
+ Orchestrates the OCR process, correctly handling multiple lines.
2111
+
2112
+ Returns:
2113
+ A tuple containing:
2114
+ 1. A dictionary with detailed line/word results for the page.
2115
+ 2. A list of the complete OCRResult objects for each line.
2116
+ 3. A list of lists, containing the character objects for each line.
2117
+ """
2118
+ page_data = {"page": str(page_number), "results": {}}
2119
+
2120
+ # Step 1: Get definitive lines and their character groups.
2121
+ # This function correctly returns all lines found in the input characters.
2122
+ line_results, lines_char_groups = create_line_level_ocr_results_from_characters(all_char_objects, text_line_number)
2123
+
2124
+ if not line_results:
2125
+ return {}, [], []
2126
+
2127
+ # Step 2: Iterate through each found line and generate its words.
2128
+ for i, (line_info, char_group) in enumerate(zip(line_results, lines_char_groups)):
2129
+
2130
+ current_line_number = line_info.line
2131
+
2132
+ word_level_results = generate_words_for_line(char_group)
2133
+
2134
+ # Build a unique dictionary key for this line.
2135
+
2136
+ line_key = f"text_line_{current_line_number}"
2137
+
2138
+ line_bbox = [line_info.left, line_info.top, line_info.left + line_info.width, line_info.top + line_info.height]
2139
+
2140
+ # Now, each line is added to the dictionary with its own unique key.
2141
+ page_data["results"][line_key] = {
2142
+ "line": current_line_number, # Use the unique line number
2143
+ "text": line_info.text,
2144
+ "bounding_box": line_bbox,
2145
+ "words": word_level_results
2146
+ }
2147
+
2148
+ # The list of OCRResult objects is already correct.
2149
+ line_level_ocr_results_list = line_results
2150
+
2151
+ # Return the structured dictionary, the list of OCRResult objects, and the character groups
2152
+ return page_data, line_level_ocr_results_list, lines_char_groups
2153
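For reference, a hypothetical `page_data` payload for a one-line page might look like the following; all coordinates are invented for illustration, and only the key names and nesting follow the code above:

```python
# Illustrative shape only -- coordinate values are made up.
page_data = {
    "page": "1",
    "results": {
        "text_line_1": {
            "line": 1,
            "text": "Hello world",
            "bounding_box": [72.0, 700.1, 250.4, 712.3],
            "words": [
                {"text": "Hello", "bounding_box": [72.0, 712.3, 140.2, 700.1]},
                {"text": "world", "bounding_box": [150.0, 712.3, 250.4, 700.1]},
            ],
        },
    },
}
print(list(page_data["results"].keys()))  # ['text_line_1']
```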
 
2154
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
2155
  decision_process_table = pd.DataFrame()
 
2175
  return decision_process_table
2176
 
2177
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
2178
+ pikepdf_redaction_annotations_on_page = list()
2179
  for analysed_bounding_box in analysed_bounding_boxes:
2180
 
2181
  bounding_box = analysed_bounding_box["boundingBox"]
 
2208
  page_max: int = 999, # Maximum page number to end redaction
2209
  current_loop_page: int = 0, # Current page being processed in the loop
2210
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
2211
+ annotations_all_pages: List[dict] = list(), # List of annotations across all pages
2212
+ all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"]), # DataFrame for OCR results
2213
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
2214
+ pymupdf_doc: List = list(), # List of PyMuPDF documents
2215
+ all_page_line_level_ocr_results_with_words: List = list(),
2216
+ pii_identification_method: str = "Local",
2217
  comprehend_query_number:int = 0,
2218
  comprehend_client="",
2219
+ custom_recogniser_word_list:List[str]=list(),
2220
+ redact_whole_page_list:List[str]=list(),
2221
  max_fuzzy_spelling_mistakes_num:int=1,
2222
  match_fuzzy_whole_phrase_bool:bool=True,
2223
  page_sizes_df:pd.DataFrame=pd.DataFrame(),
2224
+ original_cropboxes:List[dict]=list(),
2225
  text_extraction_only:bool=False,
2226
  output_folder:str=OUTPUT_FOLDER,
2227
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
2228
+ max_time: int = int(MAX_TIME_VALUE),
2229
+ nlp_analyser: AnalyzerEngine = nlp_analyser,
2230
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
2231
+ ):
 
2232
  '''
2233
  Redact chosen entities from a PDF that is made up of multiple pages that are not images.
2234
 
 
2254
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
2255
  - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
2256
  - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
2257
+ - page_sizes_df (pd.DataFrame, optional): A pandas dataframe of PDF page sizes in PDF or image format.
2258
  - original_cropboxes (List[dict], optional): A list of dictionaries containing pymupdf cropbox information.
2259
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
2260
+ - language (str, optional): The language to use for AWS Comprehend calls and local entity detection.
2261
  - output_folder (str, optional): The output folder for the function
2262
  - page_break_val: Value for page break
2263
+ - max_time (int, optional): The maximum time (in seconds) the function may run before breaking, to avoid timeout errors with some APIs.
2264
+ - nlp_analyser (AnalyzerEngine, optional): The AnalyzerEngine object to use for entity detection. Defaults to the module-level analyser.
2265
  - progress: Progress tracking object
2266
  '''
2267
 
2268
+ tic = time.perf_counter()
2269
 
2270
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2271
  all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
 
2277
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2278
  out_message = "Connection to AWS Comprehend service not found."
2279
  raise Exception(out_message)
2280
+
2281
+ # Try updating the supported languages for the spacy analyser
2282
+ try:
2283
+ nlp_analyser = create_nlp_analyser(language, existing_nlp_analyser=nlp_analyser)
2284
+ # Check list of nlp_analyser recognisers and languages
2285
+ if language != "en":
2286
+ gr.Info(f"Language: {language} only supports the following entity detection: {str(nlp_analyser.registry.get_supported_entities(languages=[language]))}")
2287
+
2288
+ except Exception as e:
2289
+ print(f"Error creating nlp_analyser for {language}: {e}")
2290
+ raise Exception(f"Error creating nlp_analyser for {language}: {e}")
2291
 
2292
  # Update custom word list analyser object with any new words that have been added to the custom deny list
2293
  if custom_recogniser_word_list:
 
2297
 
2298
  nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
2299
  new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
2300
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
2301
 
2302
  # Open with Pikepdf to get text lines
2303
  pikepdf_pdf = Pdf.open(file_path)
2304
  number_of_pages = len(pikepdf_pdf.pages)
2305
 
2306
2307
+
2308
+ if not all_page_line_level_ocr_results_with_words: all_page_line_level_ocr_results_with_words = list()
2309
 
 
 
 
2310
  # Check that page_min and page_max are within expected ranges
2311
+ if page_max > number_of_pages or page_max == 0: page_max = number_of_pages
 
2312
 
2313
  if page_min <= 0: page_min = 0
2314
  else: page_min = page_min - 1
 
2338
  # Go page by page
2339
  for page_layout in extract_pages(file_path, page_numbers = [page_no], maxpages=1):
2340
 
2341
+ all_page_line_text_extraction_characters = list()
2342
+ all_page_line_level_text_extraction_results_list = list()
2343
+ page_analyser_results = list()
2344
+ page_redaction_bounding_boxes = list()
2345
 
2346
+ characters = list()
2347
+ pikepdf_redaction_annotations_on_page = list()
2348
  page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2349
+ page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height", "line"])
2350
+ page_text_ocr_outputs_list = list()
2351
 
2352
+ text_line_no = 1
2353
  for n, text_container in enumerate(page_layout):
2354
+ characters = list()
2355
 
2356
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
2357
  characters = get_text_container_characters(text_container)
2358
2359
 
2360
  # Create dataframe for all the text on the page
2361
2365
+ line_level_ocr_results_with_words, line_level_text_results_list, line_characters = process_page_to_structured_ocr(characters, page_number=int(reported_page_number), text_line_number=text_line_no)
2366
 
2367
+ text_line_no += len(line_level_text_results_list)
2368
 
2369
  ### Create page_text_ocr_outputs (OCR format outputs)
2370
  if line_level_text_results_list:
 
2375
  'left': result.left,
2376
  'top': result.top,
2377
  'width': result.width,
2378
+ 'height': result.height,
2379
+ 'line': result.line
2380
  } for result in line_level_text_results_list])
2381
 
2382
+ page_text_ocr_outputs_list.append(line_level_text_results_df)
2383
 
2384
  all_page_line_level_text_extraction_results_list.extend(line_level_text_results_list)
2385
  all_page_line_text_extraction_characters.extend(line_characters)
2386
+ all_page_line_level_ocr_results_with_words.append(line_level_ocr_results_with_words)
2387
+
2388
2389
+ if page_text_ocr_outputs_list: page_text_ocr_outputs = pd.concat(page_text_ocr_outputs_list)
2390
2391
 
2392
  ### REDACTION
2393
  if pii_identification_method != NO_REDACTION_PII_OPTION:
 
2413
  # Annotate redactions on page
2414
  pikepdf_redaction_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_redaction_bounding_boxes)
2415
 
2416
+ else: pikepdf_redaction_annotations_on_page = list()
2417
 
2418
  # Make pymupdf page redactions
2419
  if redact_whole_page_list:
 
2437
 
2438
  # Join extracted text outputs for all lines together
2439
  if not page_text_ocr_outputs.empty:
2440
+ page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["line"]).reset_index(drop=True)
2441
+ page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height", "line"]]
2442
  all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2443
 
2444
  toc = time.perf_counter()
 
2464
  # Write logs
2465
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2466
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2467
+
2468
+ print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
2469
 
2470
  current_loop_page += 1
2471
 
 
2494
 
2495
  # Write all page outputs
2496
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
 
 
 
2497
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
 
 
2498
 
2499
  # Convert decision table to relative coordinates
2500
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
2501
 
2502
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
2503
+ all_pages_decision_process_table['ymin'] = reverse_y_coords(all_pages_decision_process_table,'ymin')
2504
+ all_pages_decision_process_table['ymax'] = reverse_y_coords(all_pages_decision_process_table,'ymax')
2505
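The y-axis flip applied by `reverse_y_coords` can be sketched for a single relative coordinate; this assumes the column values are already page-relative (0-1), as produced by `divide_coordinates_by_page_sizes` above. The helper name below is illustrative:

```python
def reverse_relative_y(y_rel: float) -> float:
    """Flip a page-relative y coordinate (0-1) between PDF's bottom-left
    origin and the top-left origin used by the image annotator."""
    return 1.0 - y_rel

print(reverse_relative_y(0.1))   # 10% from the bottom is 90% from the top
print(reverse_relative_y(reverse_relative_y(0.25)))  # the flip is its own inverse
```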
 
2506
  # Convert decision table to relative coordinates
2507
  all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
 
2510
 
2511
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
2512
  if not all_line_level_ocr_results_df.empty:
2513
+ all_line_level_ocr_results_df['top'] = reverse_y_coords(all_line_level_ocr_results_df,'top')
 
 
 
2514
 
2515
+ # Remove empty dictionary items from ocr results with words
2516
+ all_page_line_level_ocr_results_with_words = [d for d in all_page_line_level_ocr_results_with_words if d]
 
2517
 
2518
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
tools/find_duplicate_pages.py CHANGED
@@ -1,91 +1,482 @@
1
  import pandas as pd
2
  import os
3
  import re
4
- from tools.helper_functions import OUTPUT_FOLDER
 
 
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
- from typing import List, Tuple, Optional, Dict
8
  from collections import defaultdict
9
  import gradio as gr
10
  from gradio import Progress
11
  from pathlib import Path
12
- from pymupdf import Document
 
13
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
14
- import en_core_web_lg
15
-
16
- nlp = en_core_web_lg.load()
17
 
18
  similarity_threshold = 0.95
 
 
19
 
20
- def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, output_folder:str=OUTPUT_FOLDER):
21
  """
22
- Combines text from multiple CSV files containing page and text columns.
23
- Groups text by file and page number, concatenating text within these groups.
 
 
 
 
 
24
 
25
- Args:
26
- input_files (list): List of paths to CSV files
27
 
 
 
 
28
  Returns:
29
- pd.DataFrame: Combined dataframe with columns [file, page, text]
  """
31
- all_data = []
32
- output_files = []
33
 
34
- if isinstance(input_files, str):
35
- file_paths_list = [input_files]
36
- else:
37
- file_paths_list = input_files
38
 
39
- for file in file_paths_list:
 
 
40
 
41
- if isinstance(file, str):
42
- file_path = file
43
- else:
44
- file_path = file.name
45
 
46
- # Read CSV file
47
- df = pd.read_csv(file_path)
48
 
49
- # Ensure required columns exist
50
  if 'page' not in df.columns or 'text' not in df.columns:
51
- print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
52
  continue
53
 
 
54
  df['text'] = df['text'].fillna('').astype(str)
55
-
56
- # Group by page and concatenate text
57
- if combine_pages == True:
58
- grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
59
  else:
60
- df['line_number_by_page'] = df.groupby('page').cumcount() + 1
61
- df['original_page'] = df['page']
62
- df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
63
- df['page'] = df['page'].astype(int)
64
 
65
- grouped = df #.drop('line_number_by_page', axis=1)
66
-
67
- # Add filename column
68
- grouped['file'] = os.path.basename(file_path)
69
-
70
- all_data.append(grouped)
71
-
72
  if not all_data:
73
- raise ValueError("No valid CSV files were processed")
74
-
75
- # Combine all dataframes
76
  combined_df = pd.concat(all_data, ignore_index=True)
 
 
 
 
 
77
 
78
- # Reorder columns
79
- combined_df = combined_df[['file', 'page', 'text']]
80
 
81
- output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
82
- combined_df.to_csv(output_combined_file_path, index=None)
83
 
84
- output_files.append(output_combined_file_path)
85
-
86
- return combined_df, output_files
87
 
88
- def process_data(df:pd.DataFrame, column:str):
89
  '''
90
  Clean and stem text columns in a data frame
91
  '''
@@ -176,7 +567,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
176
 
177
  # 1. Save the main results DataFrame
178
  similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
179
- final_df.to_csv(similarity_file_output_path, index=False)
180
 
181
  output_paths.append(str(similarity_file_output_path))
182
  print(f"Main results saved to {similarity_file_output_path}")
@@ -213,156 +604,254 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
213
 
214
  return output_paths
215
 
216
- def identify_similar_pages(
217
  df_combined: pd.DataFrame,
218
- similarity_threshold: float = 0.9,
219
- min_word_count: int = 10,
220
  min_consecutive_pages: int = 1,
221
- greedy_match: bool = False,
222
- combine_pages:bool=True,
223
- output_folder: str = OUTPUT_FOLDER,
 
 
 
 
224
  progress=Progress(track_tqdm=True)
225
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
226
  """
227
- Identifies similar pages with three possible strategies:
228
- 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
229
- 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
230
- 3. Greedy Consecutive Match: If greedy_match=True.
231
  """
232
-
233
- output_paths = []
234
  progress(0.1, desc="Processing and filtering text")
235
- df = process_data(df_combined, 'text')
236
  df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
237
  original_row_count = len(df)
238
  df_filtered = df[df['word_count'] >= min_word_count].copy()
239
  df_filtered.reset_index(drop=True, inplace=True)
240
 
241
  print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
242
-
243
  if len(df_filtered) < 2:
244
  return pd.DataFrame(), [], df_combined
245
-
246
- vectorizer = TfidfVectorizer()
247
- tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
248
 
249
- progress(0.3, desc="Calculating text similarity")
250
- similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
251
- coo_matrix = similarity_matrix.tocoo()
252
 
253
- # Create a DataFrame of all individual page pairs above the threshold.
254
- # This is the base for all three matching strategies.
255
- similar_pages = [
256
- (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
257
- if r < c and v >= similarity_threshold
258
- ]
259
-
260
- if not similar_pages:
261
- return pd.DataFrame(), [], df_combined
262
 
263
- base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
 
264
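The pair-finding step scores every page against every other and keeps pairs above a threshold. A dependency-free sketch of the same idea, using raw term counts in place of the TF-IDF weighting used here:

```python
from collections import Counter
import math

def cosine_sim(a: str, b: str) -> float:
    """Cosine similarity over raw term counts (a simplified stand-in
    for the TF-IDF vectors used in the real pipeline)."""
    va, vb = Counter(a.split()), Counter(b.split())
    dot = sum(va[t] * vb[t] for t in va)
    na = math.sqrt(sum(v * v for v in va.values()))
    nb = math.sqrt(sum(v * v for v in vb.values()))
    return dot / (na * nb) if na and nb else 0.0

pages = ["the quick brown fox jumps over the lazy dog",
         "the quick brown fox jumps over the lazy dog",
         "completely unrelated text about spreadsheets"]

threshold = 0.95
# Keep only the upper triangle (i < j), as the coo-matrix filter above does.
pairs = [(i, j) for i in range(len(pages)) for j in range(i + 1, len(pages))
         if cosine_sim(pages[i], pages[j]) >= threshold]
print(pairs)  # [(0, 1)]
```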
 
265
- progress(0.6, desc="Aggregating results based on matching strategy")
266
-
267
- if greedy_match:
268
- print("Finding matches using greedy consecutive strategy.")
269
-
270
- # A set of pairs for fast lookups of (page1_idx, page2_idx)
271
- valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
272
 
273
- # Keep track of indices that have been used in a sequence
274
- consumed_indices_1 = set()
275
- consumed_indices_2 = set()
276
-
277
- all_sequences = []
278
-
279
- # Iterate through all potential starting pairs, sorted for consistent results
280
- sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
281
-
282
- for _, row in sorted_pairs.iterrows():
283
- start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
284
-
285
- # If this pair has already been consumed by a previous sequence, skip it
286
- if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
287
- continue
288
-
289
- # This is a new sequence, start expanding it
290
- current_sequence = [(start_idx1, start_idx2)]
291
- k = 1
292
- while True:
293
- next_idx1 = start_idx1 + k
294
- next_idx2 = start_idx2 + k
295
-
296
- # Check if the next pair in the sequence is a valid match
297
- if (next_idx1, next_idx2) in valid_pairs_set and \
298
- next_idx1 not in consumed_indices_1 and \
299
- next_idx2 not in consumed_indices_2:
300
- current_sequence.append((next_idx1, next_idx2))
301
- k += 1
302
- else:
303
- # The sequence has ended
304
- break
305
-
306
- # Record the found sequence and mark all its pages as consumed
307
- sequence_indices_1 = [p[0] for p in current_sequence]
308
- sequence_indices_2 = [p[1] for p in current_sequence]
309
 
310
- all_sequences.append({
311
- 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
312
- 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
313
- 'Match_Length': len(current_sequence)
314
- })
315
-
316
- consumed_indices_1.update(sequence_indices_1)
317
- consumed_indices_2.update(sequence_indices_2)
318
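The greedy strategy above expands each unconsumed starting pair `(i, j)` into the longest run of consecutive matches `(i+k, j+k)` and marks its pages as consumed so no page joins two sequences. A minimal sketch:

```python
def greedy_sequences(valid_pairs):
    """Greedily expand each unconsumed (i, j) pair into the longest run
    of consecutive matches, returning (start_pair, end_pair, length)."""
    valid = set(valid_pairs)
    used1, used2, sequences = set(), set(), []
    for i, j in sorted(valid):
        if i in used1 or j in used2:        # already part of a sequence
            continue
        run = [(i, j)]
        k = 1
        while ((i + k, j + k) in valid
               and (i + k) not in used1 and (j + k) not in used2):
            run.append((i + k, j + k))
            k += 1
        sequences.append((run[0], run[-1], len(run)))
        used1.update(p for p, _ in run)
        used2.update(q for _, q in run)
    return sequences

print(greedy_sequences([(1, 5), (2, 6), (3, 7), (9, 20)]))
# [((1, 5), (3, 7), 3), ((9, 20), (9, 20), 1)]
```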
-
319
320
  return pd.DataFrame(), [], df_combined
 
 
321
 
322
- subdocument_df = pd.DataFrame(all_sequences)
323
- # We can add back the average similarity if needed, but it requires more lookups.
324
- # For now, we'll omit it for simplicity in the greedy approach.
325
- # ... (The rest is metadata mapping, same as the subdocument case)
326
 
327
- elif min_consecutive_pages > 1:
328
- # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
329
- print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
330
- similarity_df = base_similarity_df.copy()
331
- similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
 
 
 
332
  is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
 
 
 
333
  block_id = is_consecutive.eq(False).cumsum()
 
 
334
  grouped = similarity_df.groupby(block_id)
 
 
335
  agg_results = grouped.agg(
336
- Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
337
- Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
338
- Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
 
 
 
339
  ).reset_index(drop=True)
340
- subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
341
- if subdocument_df.empty: return pd.DataFrame(), [], df_combined
342
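The `diff() == 1` / `cumsum()` trick in the fixed-length strategy groups pairs into runs where both page indices advance together. A plain-Python sketch of the same grouping:

```python
def group_consecutive_pairs(pairs):
    """Split a sorted list of (page1, page2) index pairs into runs where
    both indices advance by exactly 1 -- the same blocks the pandas
    block_id = is_consecutive.eq(False).cumsum() code identifies."""
    blocks, current = [], []
    for p1, p2 in pairs:
        if current and p1 == current[-1][0] + 1 and p2 == current[-1][1] + 1:
            current.append((p1, p2))        # extends the current run
        else:
            if current:
                blocks.append(current)      # close the previous run
            current = [(p1, p2)]
    if current:
        blocks.append(current)
    return blocks

pairs = [(3, 10), (4, 11), (5, 12), (9, 40)]
print(group_consecutive_pairs(pairs))
# [[(3, 10), (4, 11), (5, 12)], [(9, 40)]]
```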
343
  else:
344
- # --- STRATEGY 1: Single Page Matching ---
345
- print(f"Finding single page matches (min_consecutive_pages=1)")
346
  final_df = map_metadata_single_page(base_similarity_df, df_filtered)
347
- # The rest of the logic (saving files) is handled after this if/else block
348
- pass # The final_df is already prepared
349
 
350
- # --- Map metadata and format output ---
351
- # This block now handles the output for both subdocument strategies (2 and 3)
352
- if greedy_match or min_consecutive_pages > 1:
353
- final_df = map_metadata_subdocument(subdocument_df, df_filtered)
354
-
355
- progress(0.8, desc="Saving output files")
356
 
357
  output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
358
 
359
- return final_df, output_paths, df_combined
360
-
361
- # ==============================================================================
362
- # GRADIO HELPER FUNCTIONS
363
- # ==============================================================================
364
 
365
- # full_data:pd.DataFrame,
 
366
  def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
367
  """
368
  This single function handles a user selecting a row. It:
@@ -413,18 +902,16 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
      Wrapper function updated to include the 'greedy_match' boolean.
      """
      if not files:
-         gr.Warning("Please upload files to analyze.")
-         return None, None, None

      progress(0, desc="Combining input files...")
-     df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)

      if df_combined.empty:
-         gr.Warning("No data found in the uploaded files.")
-         return None, None, None

      # Call the main analysis function with the new parameter
-     results_df, output_paths, full_df = identify_similar_pages(
          df_combined=df_combined,
          similarity_threshold=threshold,
          min_word_count=min_words,
@@ -436,7 +923,6 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:

      # Clip text to the preview length
      full_df['text'] = full_df['text'].str[:preview_length]
-
      # Preprocess full_data (without preview text) for fast access (run once)
      full_data_by_file = {
          file: df.sort_values('page').set_index('page')
@@ -446,7 +932,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
      if results_df.empty:
          gr.Info(f"No duplicate pages found, no results returned.")

-     return results_df, output_paths, full_data_by_file # full_df,

  def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
      """
@@ -531,14 +1017,31 @@ def add_new_annotations_to_existing_page_annotations(

      return all_annotations, newly_added_annotation_group

- def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=[]):
      '''
-     Take a list of suggested whole pages to redact and apply it to review file data.
      '''
      all_annotations = all_existing_annotations.copy()

      if not pymupdf_doc:
-         message = "No document file currently under review."
          print(f"Warning: {message}")
          raise Warning(message)
@@ -667,131 +1170,27 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
      review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
      review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')

-     out_message = "Successfully created whole page redactions."
      print(out_message)
      gr.Info(out_message)

      return review_file_out, all_annotations

-
- # --- 1. Helper Function to Parse the Combined Page/Line ID ---
  def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
-     """
-     Parses a combined page and line number ID into a (page, line) tuple.
-     Assumes the ID is a 10-digit number where the first 5 digits are the page
-     and the last 5 are the line number.
-
-     Example: 100027 -> (1, 27)
-              200005 -> (2, 5)
-     """
-     # zfill pads the string with leading zeros to 10 characters
-     s_id = str(combined_id).zfill(10)
-     page = int(s_id[:5])
-     line = int(s_id[5:])
-     return page, line
-
- # def create_annotations_from_ocr_outputs(ocr_results_df_lines_to_annotate:pd.DataFrame):
- #     '''
- #     Create a set of annotation boxes based on selected ocr_results_df lines.
- #     '''
- #     annotations_by_page = []
-
- #     # --- Build Annotation Boxes for each selected line ---
- #     for _, line_row in ocr_results_df_lines_to_annotate.iterrows():
- #         # The coordinates are relative, so xmax = left + width and ymax = top + height
- #         box = {
- #             "label": "Similar Text", # Or any other label you prefer
- #             "xmin": line_row['left'],
- #             "ymin": line_row['top'] + line_row['height'],
- #             "xmax": line_row['left'] + line_row['width'],
- #             "ymax": line_row['top'],
- #             "text": line_row['text']
- #         }
- #         # --- 6. Group the box by its page number ---
- #         page_number = line_row['page']
- #         annotations_by_page[page_number].append(box)
-
- #     return annotations_by_page
-
- # def create_annotation_objects_from_duplicates(
- #     duplicates_df: pd.DataFrame,
- #     ocr_results_df: pd.DataFrame,
- #     combine_pages:bool=False
- # ) -> List[Dict]:
- #     """
- #     Creates structured annotation objects from selected ocr outputs.
-
- #     Args:
- #         duplicates_df (pd.DataFrame): DataFrame containing duplicate ranges with
- #             columns like 'Page2_Start_Page' and 'Page2_End_Page'.
- #         ocr_results_df (pd.DataFrame): DataFrame with OCR results, including columns
- #             'page', 'text', 'left', 'top', 'width', 'height'.
-
- #     Returns:
- #         List[Dict]: A list of dictionaries, where each dict represents a page and its
- #             list of annotation boxes, in the format:
- #             [{"page": 1, "boxes": [...]}, {"page": 2, "boxes": [...]}]
- #     """
- #     annotations_by_page = []
-
- #     if combine_pages == False:
-
- #         # --- 2. Prepare OCR Data: Add a line number column if it doesn't exist ---
- #         if 'line_number_by_page' not in ocr_results_df.columns:
- #             print("Generating 'line_number_by_page' for ocr_results_df...")
- #             # Sort by page and original position to ensure correct line numbering
- #             ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
- #             ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
-
- #         # Use defaultdict to easily append to lists for each page
- #         annotations_by_page = defaultdict(list)
-
- #         # --- 3. Iterate through each duplicate range ---
- #         for _, row in duplicates_df.iterrows():
- #             # Parse the start and end page/line numbers from the duplicate row
- #             start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
- #             end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
-
- #             # --- 4. Select OCR Lines based on the range ---
- #             # This logic correctly handles ranges within a single page and across multiple pages
- #             if start_page == end_page:
- #                 # Simple case: the range is on a single page
- #                 condition = (
- #                     (ocr_results_df['page'] == start_page) &
- #                     (ocr_results_df['line_number_by_page'].between(start_line, end_line))
- #                 )
- #             else:
- #                 # Complex case: the range spans multiple pages
- #                 # Condition for the first page in the range
- #                 cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
- #                 # Condition for all pages between the start and end
- #                 cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
- #                 # Condition for the last page in the range
- #                 cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
-
- #                 condition = cond_start | cond_middle | cond_end
-
- #             lines_to_annotate = ocr_results_df[condition]
-
- #             annotations_by_page = create_annotations_from_ocr_outputs(lines_to_annotate)
-
- #     # --- Format the final output list ---
- #     final_output = []
- #     # Sort by page number for a predictable order
- #     for page, boxes in sorted(annotations_by_page.items()):
- #         final_output.append({
- #             "page": page,
- #             "boxes": boxes
- #         })
-
- #     return final_output

  def create_annotation_objects_from_duplicates(
      duplicates_df: pd.DataFrame,
      ocr_results_df: pd.DataFrame,
      page_sizes: List[Dict],
-     combine_pages:bool=False
- ) -> List[Dict]:
      """
      Creates structured annotation objects from duplicate line ranges, mapping
      page numbers to image paths.
@@ -807,8 +1206,12 @@ def create_annotation_objects_from_duplicates(
      """
      final_output = []

      if combine_pages == False:
-         # --- NEW: Create an efficient lookup map from page number to image path ---
          page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

          # Prepare OCR Data: Add a line number column if it doesn't exist
@@ -850,11 +1253,8 @@ def create_annotation_objects_from_duplicates(
                  "id": "" # to be filled in after
              }
              page_number = line_row['page']
-
              annotations_by_page[page_number].append(box)
-
-             print("annotations_by_page:", annotations_by_page)

      # --- Format the final output list using the page-to-image map ---
      final_output = []
@@ -878,39 +1278,5 @@ def create_annotation_objects_from_duplicates(
              # Handle cases where a page might not have a corresponding image path
              print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
                    f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
-
-     print("final_output:", final_output)

-     return final_output
-
- # --- Example Usage ---
-
- # 1. Create your example DataFrames
- # duplicates_data = {
- #     'Page1_File': ['doc_a.csv'],
- #     'Page1_Start_Page': [100009],
- #     'Page1_End_Page': [100021],
- #     'Page2_File': ['doc_a.csv'],
- #     'Page2_Start_Page': [100027], # Page 1, Line 27
- #     'Page2_End_Page': [200005],   # Page 2, Line 5
- # }
- # duplicates_df = pd.DataFrame(duplicates_data)
-
- # ocr_data = {
- #     'page': [1]*30 + [2]*10, # 30 lines on page 1, 10 on page 2
- #     'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p==1 else 11))],
- #     # Example coordinates (using small, consistent values for demonstration)
- #     'left': [0.1] * 40,
- #     'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
- #     'width': [0.8] * 40,
- #     'height': [0.015] * 40,
- # }
- # ocr_results_df = pd.DataFrame(ocr_data)
-
- # # 2. Run the function
- # generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df)
-
- # # 3. Print the result
- # import json
- # print(json.dumps(generated_annotations, indent=2))
  import pandas as pd
  import os
  import re
+ import itertools
+ import numpy as np
+
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
+ from typing import List, Tuple, Optional, Dict, Union
  from collections import defaultdict
  import gradio as gr
  from gradio import Progress
  from pathlib import Path
+ from typing import List
+ from tools.helper_functions import OUTPUT_FOLDER
  from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
+ from tools.load_spacy_model_custom_recognisers import nlp

  similarity_threshold = 0.95
+ number_of_zeros_to_add_to_index = 7 # Number of zeros to add between the page number and line number to get a unique page/line index value
+ ID_MULTIPLIER = 100000

+ def split_text_with_punctuation(text: str) -> List[str]:
      """
+     A concise tokenization function using a single regex with re.findall.
+     """
+     # This single regex pattern finds either:
+     # 1. A sequence of one or more punctuation marks `[.,?!:;]+`
+     # 2. OR a sequence of one or more characters that are NOT punctuation or whitespace `[^.,?!:;\s]+`
+     pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")

+     final_list = []
+     # First split by whitespace to handle sentences correctly
+     for word in text.split():
+         # Then, for each whitespace-separated word, tokenize it further
+         final_list.extend(pattern.findall(word))
+
+     return final_list
+
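A quick standalone check of the tokenizer above (restated so the snippet runs on its own): trailing sentence punctuation comes out as its own token, which is what lets a plain search query line up with OCR word tokens.

```python
import re
from typing import List

def split_text_with_punctuation(text: str) -> List[str]:
    # Either a run of sentence punctuation, or a run of non-punctuation characters
    pattern = re.compile(r"([.,?!:;]+|[^.,?!:;\s]+)")
    final_list: List[str] = []
    for word in text.split():
        final_list.extend(pattern.findall(word))
    return final_list

print(split_text_with_punctuation("Hello, world!"))
# ['Hello', ',', 'world', '!']
```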
+ def extract_indices_from_page_ranges(
+     results_df: pd.DataFrame,
+     start_col: str = 'Page2_Start_Page',
+     end_col: str = 'Page2_End_Page',
+     modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index, # Number of added zeros in the combined page/line index
+     converted_index: bool = False # Has the index been converted to the page_no + 0000 + line number format that needs the modulo divisor to convert back?
+ ) -> List[int]:
+     all_indices = set()
+     modulo_divisor = int("1" + modulo_divisor_number_of_zeros*"0")
+
+     for _, row in results_df.iterrows():
+         start_page = row[start_col]
+         end_page = row[end_col]
+         for encoded_page_id in range(start_page, end_page + 1):
+             if converted_index == True:
+                 original_page, original_index = _parse_page_line_id(encoded_page_id)
+             else:
+                 original_index = encoded_page_id
+
+             all_indices.add(original_index)
+     return sorted(list(all_indices))
+
+ def punctuation_at_word_text_end(word_level_df_orig: pd.DataFrame) -> bool:
+     """
+     Check the first 1000 rows of word_level_df_orig to see if any of the strings
+     in 'word_text' end with a full stop '.', exclamation mark '!', or question mark '?',
+     for strings that do not consist of these characters alone.
+
+     Args:
+         word_level_df_orig (pd.DataFrame): DataFrame containing word-level OCR data with a 'word_text' column
+
+     Returns:
+         bool: True if any strings end with punctuation marks, False otherwise
+     """
+     # Get the first 1000 rows, or all rows if there are fewer than 1000
+     sample_df = word_level_df_orig.head(1000)
+
+     # Check if the 'word_text' column exists
+     if 'word_text' not in sample_df.columns:
+         return False
+
+     # Define punctuation marks to check for
+     punctuation_marks = ['.', '!', '?']
+
+     # Check each word_text string
+     for word_text in sample_df['word_text']:
+         if pd.isna(word_text) or not isinstance(word_text, str):
+             continue
+
+         # Skip strings that consist only of punctuation marks
+         if word_text.strip() in punctuation_marks:
+             continue
+
+         # Check if the string ends with any of the punctuation marks
+         if any(word_text.rstrip().endswith(punct) for punct in punctuation_marks):
+             return True
+
+     return False
+
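The combined page/line IDs used by the index remake are built arithmetically with `ID_MULTIPLIER` (page times 100000, plus the line number). A minimal round-trip sketch, valid while a page has fewer lines than the multiplier; `encode_page_line` and `decode_page_line` are illustrative names, not functions from the module:

```python
ID_MULTIPLIER = 100000  # as defined at the top of the module

def encode_page_line(page: int, line: int) -> int:
    # Hypothetical helper: valid while line < ID_MULTIPLIER
    return page * ID_MULTIPLIER + line

def decode_page_line(combined_id: int) -> tuple:
    # divmod inverts the encoding: (page, line)
    return divmod(combined_id, ID_MULTIPLIER)

print(encode_page_line(2, 5))    # 200005
print(decode_page_line(200005))  # (2, 5)
```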
+ def run_full_search_and_analysis(
+     search_query_text: str,
+     word_level_df_orig: pd.DataFrame,
+     similarity_threshold: float = 1,
+     combine_pages: bool = False,
+     min_word_count: int = 1,
+     min_consecutive_pages: int = 1,
+     greedy_match: bool = True,
+     remake_index: bool = False,
+     progress=gr.Progress(track_tqdm=True)
+ ):
+     """
+     Orchestrates the full pipeline for finding duplicate pages based on a user's search query. It:
+
+     1. Converts the user's search query into a DataFrame format suitable for analysis.
+     2. Prepares the main word-level OCR data by converting it into the required format.
+     3. Combines the search query DataFrame with the prepared OCR data DataFrame.
+     4. Runs the similarity analysis on the combined data using the specified similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
+
+     Parameters:
+     - search_query_text (str): The text entered by the user to search for in the OCR data.
+     - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
+     - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
+     - combine_pages (bool, optional): Whether to combine text from the same page number within a file. Defaults to False.
+     - min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
+     - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
+     - greedy_match (bool, optional): Whether to use a greedy strategy for matching consecutive pages. Defaults to True.
+     - remake_index (bool, optional): Whether to remake the index of the DataFrame during processing. Defaults to False.
+     - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
+     """
+
+     if len(search_query_text) < 3:
+         raise Warning("Please use a search query with at least three letters.")
+     if len(search_query_text) > 100:
+         raise Warning("Please use a search query with fewer than 100 characters.")
+
+     if punctuation_at_word_text_end(word_level_df_orig) == True:
+         do_punctuation_split = False
+     else:
+         do_punctuation_split = True
+
+     # Step 1: Process the user's search query string
+     search_query_data, query_word_length = create_dataframe_from_string(search_query_text, file_name="user_search_query", split_words=True, split_punctuation=do_punctuation_split)
+     if not search_query_data:
+         # Handle case where user submits an empty search string
+         raise Warning("Could not convert search string to required format")
+
+     if query_word_length > 25:
+         # Handle case where the search query is too long
+         raise Warning("Please use a query with no more than 25 words")
+
+     # Overwrite min_consecutive_pages with the search string length
+     min_consecutive_pages = query_word_length
+
+     # Create word index from reference table
+     word_level_df_orig["index"] = word_level_df_orig.index
+     word_level_df = word_level_df_orig.copy()
+
+     # Step 2: Process the main word-level OCR DataFrame
+     word_level_data = convert_word_level_df(word_level_df, file_name="source_document")
+
+     # Step 3: Combine both data sources into one list
+     all_data_to_process = search_query_data + word_level_data
+     if not all_data_to_process:
+         raise gr.Error("No data to process. Please check your inputs.")
+
+     # Step 4: Run the combination logic
+     combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
+         input_data=all_data_to_process,
+         combine_pages=combine_pages,
+         output_folder=None, # No need to save this intermediate file
+         remake_index=remake_index
+     )
+
+     # Step 5: Run the final similarity analysis on the combined data
+     results_df, duplicate_files, full_data = identify_similar_text_sequences(
+         df_combined=combined_df,
+         similarity_threshold=similarity_threshold,
+         min_word_count=min_word_count,
+         min_consecutive_pages=min_consecutive_pages,
+         greedy_match=greedy_match,
+         combine_pages=combine_pages,
+         inter_file_only=True,
+         do_text_clean=False,
+         file1_name="user_search_query",
+         file2_name="source_document",
+         progress=progress
+     )
+
+     print("Finished text search")
+
+     # Map the results back to the reference data file
+     if remake_index == True:
+         results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=True)
+     else:
+         results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=False)
+
+     word_level_df_out = word_level_df_orig.loc[word_level_df_orig["index"].isin(results_df_index_list)]
+
+     return word_level_df_out, duplicate_files, full_data
+
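The punctuation heuristic above decides how the query is tokenized: if the reference OCR words already carry their trailing punctuation (e.g. `"end."`), the query is not split on punctuation either. A standalone illustration of that check (restated so it runs alone):

```python
import pandas as pd

punctuation_marks = ['.', '!', '?']

def punctuation_at_word_text_end(df: pd.DataFrame) -> bool:
    # Sample the first 1000 rows, as in the module's version
    sample = df.head(1000)
    if 'word_text' not in sample.columns:
        return False
    for word in sample['word_text']:
        if not isinstance(word, str):
            continue
        if word.strip() in punctuation_marks:
            continue  # tokens that are pure punctuation don't count
        if any(word.rstrip().endswith(p) for p in punctuation_marks):
            return True
    return False

print(punctuation_at_word_text_end(pd.DataFrame({'word_text': ['Hello', 'world.']})))  # True
print(punctuation_at_word_text_end(pd.DataFrame({'word_text': ['Hello', 'world', '!']})))  # False
```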
+ def create_all_data_to_process(converted_data:pd.DataFrame, other_data_list:List[Tuple]):
+     all_data_to_process = converted_data + other_data_list
+     return all_data_to_process
+
+ def convert_word_level_df(
+     word_level_df: pd.DataFrame,
+     file_name: str = "converted_dataframe"
+ ) -> List[Tuple[str, pd.DataFrame]]:
+     """
+     Converts a word-level OCR DataFrame to the format expected by
+     combine_ocr_dataframes: a simple renaming and selection of the relevant columns.
+
+     Args:
+         word_level_df (pd.DataFrame):
+             A DataFrame containing detailed OCR output. Must include at least
+             the columns: 'page', 'line', and 'word_text'.
+         file_name (str, optional):
+             A unique identifier or "dummy" filename to assign to the resulting
+             data. Defaults to "converted_dataframe".
+
+     Returns:
+         List[Tuple[str, pd.DataFrame]]:
+             A list containing a single tuple of (file_name, DataFrame), ready
+             to be used as input for the combine_ocr_dataframes function. The
+             DataFrame will have 'page' and 'text' columns.
+     """
+     # --- 1. Validate Input ---
+     required_columns = ['page', 'line', 'word_text']
+     if not all(col in word_level_df.columns for col in required_columns):
+         raise ValueError(f"Input DataFrame must contain all of the following columns: {required_columns}")
+
+     df = word_level_df.copy()
+
+     # --- 2. Process the DataFrame ---
+     # Ensure word_text is a string to allow for joining
+     df['word_text'] = df['word_text'].astype(str)
+
+     # Grouping by page and line to join words is not needed for word-level search;
+     # each word stays on its own row.
+
+     # Rename the column from 'word_text' to the required 'text'
+     df = df.rename(columns={'word_text': 'text'})
+
+     # --- 3. Finalise the structure ---
+     final_df = df[['page', 'text']]
+
+     # --- 4. Package for output ---
+     # Return in the required List[Tuple[str, DataFrame]] format
+     return [(file_name, final_df)]
+
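A standalone sketch of the conversion above (a minimal re-implementation of just the validate/rename/select steps, so the snippet runs alone):

```python
import pandas as pd

def convert_word_level_df(word_level_df, file_name="converted_dataframe"):
    required = ['page', 'line', 'word_text']
    if not all(c in word_level_df.columns for c in required):
        raise ValueError(f"Missing required columns: {required}")
    df = word_level_df.copy()
    df['word_text'] = df['word_text'].astype(str)
    df = df.rename(columns={'word_text': 'text'})
    # One (identifier, DataFrame) tuple, as combine_ocr_dataframes expects
    return [(file_name, df[['page', 'text']])]

words = pd.DataFrame({'page': [1, 1], 'line': [1, 1], 'word_text': ['Hello', 'world']})
(name, out), = convert_word_level_df(words, file_name="doc")
print(name, out['text'].tolist())  # doc ['Hello', 'world']
```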
+ def create_dataframe_from_string(
+     text_string: str,
+     file_name: str = "user_search_query",
+     page_number: int = 1,
+     split_words: bool = False,
+     split_punctuation: bool = True,
+ ) -> Tuple[List[Tuple[str, pd.DataFrame]], int]:
+     """
+     Converts a string into a DataFrame compatible with combine_ocr_dataframes.
+
+     Can operate in two modes:
+     1. As a single-line document (default).
+     2. As a multi-line document where each word from the string is a separate line.
+
+     Args:
+         text_string (str): The input text to be placed in the DataFrame.
+         file_name (str, optional): A dummy filename to assign to this text.
+             Defaults to "user_search_query".
+         page_number (int, optional): A dummy page number to assign. Defaults to 1.
+         split_words (bool, optional): If True, splits the input string by
+             whitespace and creates a row for each word. If False (default),
+             the entire string is treated as a single text entry.
+         split_punctuation (bool, optional): If True, splits 'end of sentence'
+             punctuation off the end of the search query to match the reference data.
+
+     Returns:
+         Tuple[List[Tuple[str, pd.DataFrame]], int]:
+             A list containing a single tuple: (file_name, DataFrame). The
+             DataFrame has 'page' and 'text' columns. Also an integer giving the
+             number of words in the search string. Returns an empty list if the
+             input string is empty or whitespace.
+     """
+     # Handle empty input gracefully; this works for both modes.
+     if not text_string or not text_string.strip():
+         print("Warning: Input string is empty. Returning an empty list.")
+         return [], 0
+
+     if split_words:
+         # --- Split the string into words, one per row, based on the same punctuation
+         # split technique used to create ocr_results_with_words objects ---
+         if split_punctuation == True:
+             words = split_text_with_punctuation(text_string)
+         else:
+             words = text_string.split()
+
+         len_words = len(words)
+         data = {
+             'page': [page_number] * len_words, # Assign the same page number to every word
+             'text': words # The list of words becomes the text column
+         }
+     else:
+         # --- Entire string in one row ---
+         len_words = 1
+         data = {
+             'page': [page_number],
+             'text': [text_string]
+         }
+
+     # Create the DataFrame from the prepared data
+     df = pd.DataFrame(data)
+
+     df["line"] = df.index + 1
+
+     # Return it in the required format: a list containing one (name, df) tuple
+     return [(file_name, df)], len_words
+
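A standalone sketch of the split-words path above (tokenization simplified to plain whitespace splitting, i.e. the `split_punctuation=False` branch):

```python
import pandas as pd

def create_dataframe_from_string(text, file_name="user_search_query", page_number=1):
    words = text.split()  # simplified: no punctuation handling
    if not words:
        return [], 0
    # One row per word, all on the same dummy page, with a 1-based line number
    df = pd.DataFrame({'page': [page_number] * len(words), 'text': words})
    df['line'] = df.index + 1
    return [(file_name, df)], len(words)

result, n = create_dataframe_from_string("find this phrase")
print(n, result[0][1]['text'].tolist())  # 3 ['find', 'this', 'phrase']
```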
+ def combine_ocr_dataframes(
+     input_data: List[Tuple[str, pd.DataFrame]],
+     combine_pages: bool = True,
+     output_folder: str = OUTPUT_FOLDER,
+     output_filename: str = "combined_ocr_output.csv",
+     number_of_added_zeros: int = number_of_zeros_to_add_to_index,
+     remake_index:bool = True
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+     """
+     Combines text from multiple pandas DataFrames containing page and text columns.
+
+     This function takes a list of (name, DataFrame) tuples, processes each DataFrame
+     by grouping and concatenating text, and then combines them into a single DataFrame.
+
+     Args:
+         input_data (List[Tuple[str, pd.DataFrame]]):
+             A list of tuples, where each tuple contains a unique identifier (like a filename)
+             and a pandas DataFrame. Each DataFrame must have 'page' and 'text' columns.
+         combine_pages (bool, optional):
+             If True, text from the same page number within a file is joined into a
+             single row. If False, each line of text gets its own row with a unique
+             page identifier. Defaults to True.
+         output_folder (str, optional):
+             The folder where the combined CSV file will be saved. Defaults to OUTPUT_FOLDER.
+         output_filename (str, optional):
+             The name of the output CSV file. Defaults to "combined_ocr_output.csv".
+
+     Returns:
+         Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+             A tuple containing:
+             - The final combined and processed DataFrame.
+             - A list containing the path to the saved output CSV file.
+             - The full combined DataFrame before column selection.
+     """
+     all_data = []
+
+     for file_identifier, df_initial in input_data:
+         df = df_initial.copy() # Work on a copy to avoid side effects
+
+         # --- Validation ---
          if 'page' not in df.columns or 'text' not in df.columns:
+             print(f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'.")
              continue

+         # --- Processing ---
          df['text'] = df['text'].fillna('').astype(str)
+
+         if combine_pages:
+             # Group by page and concatenate text into a single string
+             processed_df = df.groupby('page')['text'].apply(' '.join).reset_index()
          else:
+             if remake_index == True:
+                 # Create a unique, sortable ID for each line without combining pages.
+                 # The multiplier reflects the maximum expected lines per page:
+                 # with ID_MULTIPLIER = 100000, up to 99,999 lines per page are supported.
+                 df['line_number_by_page'] = df.groupby('page').cumcount() + 1
+                 df['original_page'] = df['page']
+
+                 # Create the new combined ID using arithmetic
+                 df['page'] = (df['original_page'] * ID_MULTIPLIER) + df['line_number_by_page']
+             else:
+                 if 'index' not in df.columns:
+                     df['index'] = df.index
+                 df['page'] = df['index']
+
+             processed_df = df
+
+         # Add the file identifier column
+         processed_df['file'] = file_identifier
+         all_data.append(processed_df)

      if not all_data:
+         raise ValueError("No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns.")
+
+     # --- Final Combination ---
      combined_df = pd.concat(all_data, ignore_index=True)
+
+     # Reorder columns to a standard format, dropping intermediate columns
+     final_columns = ['file', 'page', 'text']
+     if 'original_page' in combined_df.columns:
+         final_columns.append('original_page') # Keep for context if created
+
+     # Ensure all final columns exist before trying to select them
+     existing_final_columns = [col for col in final_columns if col in combined_df.columns]
+
+     full_out_ocr_df = combined_df
+     combined_df = combined_df.copy()[existing_final_columns]
+
+     # --- Save Output ---
+     output_files = []
+     if output_folder and output_filename:
+         os.makedirs(output_folder, exist_ok=True)
+         output_path = os.path.join(output_folder, output_filename)
+         combined_df.to_csv(output_path, index=False)
+         output_files.append(output_path)
+         print(f"Successfully combined data and saved to: {output_path}")
+
+     return combined_df, output_files, full_out_ocr_df
+
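The two branches of `combine_ocr_dataframes` reduce to a groupby-join and an arithmetic ID remake. A minimal sketch of both on a three-row frame:

```python
import pandas as pd

ID_MULTIPLIER = 100000  # as defined in the module

df = pd.DataFrame({'page': [1, 1, 2], 'text': ['Hello', 'world', 'Bye']})

# combine_pages=True: one row per page, text joined with spaces
per_page = df.groupby('page')['text'].apply(' '.join).reset_index()
print(per_page['text'].tolist())  # ['Hello world', 'Bye']

# combine_pages=False with remake_index=True: unique combined page/line IDs
df['line_number_by_page'] = df.groupby('page').cumcount() + 1
combined_ids = df['page'] * ID_MULTIPLIER + df['line_number_by_page']
print(combined_ids.tolist())  # [100001, 100002, 200001]
```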
+ def combine_ocr_output_text(
+     input_files: Union[str, List[str]],
+     combine_pages: bool = True,
+     remake_index: bool = True,
+     output_folder: str = OUTPUT_FOLDER
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+     """
+     Reads multiple OCR CSV files, combines them, and saves the result.
+
+     This function serves as a wrapper that reads CSV files from paths and then
+     uses the `combine_ocr_dataframes` function to perform the combination logic.
+
+     Args:
+         input_files (Union[str, List[str]]): A single file path or a list of file paths.
+         combine_pages (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
+         output_folder (str, optional): See `combine_ocr_dataframes`. Defaults to OUTPUT_FOLDER.
+
+     Returns:
+         Tuple[pd.DataFrame, List[str], pd.DataFrame]: The combined DataFrame, the path to
+         the output file, and the full combined DataFrame before column selection.
+     """
+     if isinstance(input_files, str):
+         file_paths_list = [input_files]
+     else:
+         file_paths_list = input_files
+
+     data_to_process = []
+     for file_path in file_paths_list:
+         try:
+             df = pd.read_csv(file_path)
+             # Use the base filename as the identifier
+             file_identifier = os.path.basename(file_path)
+             data_to_process.append((file_identifier, df))
+         except FileNotFoundError:
+             print(f"Warning: File not found, skipping: {file_path}")
+         except Exception as e:
+             print(f"Warning: Failed to read or process {file_path}. Error: {e}")

+     if not data_to_process:
+         raise ValueError("No valid CSV files could be read or processed.")
+
+     # Call the core function with the loaded data
+     return combine_ocr_dataframes(
+         input_data=data_to_process,
+         combine_pages=combine_pages,
+         output_folder=output_folder,
+         output_filename="combined_ocr_from_files.csv", # Specific name for this path
+         remake_index=remake_index
+     )
+
+ def clean_and_stem_text_series(df:pd.DataFrame, column:str):
      '''
      Clean and stem text columns in a data frame
      '''
567
 
568
  # 1. Save the main results DataFrame
569
  similarity_file_output_path = output_folder_path / 'page_similarity_results.csv'
570
+ final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
571
 
572
  output_paths.append(str(similarity_file_output_path))
573
  print(f"Main results saved to {similarity_file_output_path}")
 
604
 
605
  return output_paths
606
 
+ # Define the set of punctuation characters for efficient lookup
+ PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}
+
+ def _sequences_match(query_seq: List[str], ref_seq: List[str]) -> bool:
+     """
+     Helper function to compare two sequences of tokens with punctuation flexibility.
+
+     Returns True if the sequences match according to the rules:
+     1. An exact match is a match.
+     2. A reference token also matches a query token if it is the query token
+        followed by a single character from PUNCTUATION_TO_STRIP. This rule does not
+        apply if the reference token consists only of punctuation.
+     """
+     if len(query_seq) != len(ref_seq):
+         return False
+
+     for query_token, ref_token in zip(query_seq, ref_seq):
+         # Rule 1: Check for a direct, exact match first (most common case)
+         if query_token == ref_token:
+             continue
+
+         # Rule 2: Check for the flexible punctuation match
+         # - The reference token must be longer than 1 character
+         # - Its last character must be in our punctuation set
+         # - The token without its last character must match the query token
+         if (
+             len(ref_token) > 1 and
+             ref_token[-1] in PUNCTUATION_TO_STRIP and
+             ref_token[:-1] == query_token
+         ):
+             continue
+
+         # If neither rule applies, the tokens don't match, so the sequence doesn't match.
+         return False
+
+     # If the loop completes, every token has matched.
+     return True
+
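The punctuation-tolerant comparison above can be sketched as a small self-contained example. Names mirror the diff, but this is an illustrative re-implementation, not the module itself:

```python
# Punctuation-tolerant token-sequence comparison: a reference token may
# carry exactly one trailing punctuation character and still match.
PUNCTUATION_TO_STRIP = {'.', ',', '?', '!', ':', ';'}

def sequences_match(query_seq, ref_seq):
    """Return True if ref_seq equals query_seq, allowing each reference
    token one trailing punctuation character from PUNCTUATION_TO_STRIP."""
    if len(query_seq) != len(ref_seq):
        return False
    for q, r in zip(query_seq, ref_seq):
        if q == r:
            continue
        # Flexible match: strip one trailing punctuation character
        if len(r) > 1 and r[-1] in PUNCTUATION_TO_STRIP and r[:-1] == q:
            continue
        return False
    return True

print(sequences_match(["hello", "world"], ["hello", "world!"]))  # True
print(sequences_match(["hello"], ["hello?!"]))                   # False: two trailing characters
```

Note that only a single trailing punctuation character is forgiven; `"hello?!"` fails because stripping one character still leaves `"hello?"`.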
+ def find_consecutive_sequence_matches(
+     df_filtered: pd.DataFrame,
+     search_file_name: str,
+     reference_file_name: str
+ ) -> pd.DataFrame:
+     """
+     Finds all occurrences of a consecutive sequence of tokens from a search file
+     within a larger reference file.
+
+     This function is designed for order-dependent matching, not "bag-of-words" similarity.
+
+     Args:
+         df_filtered: The DataFrame containing all tokens, with 'file' and 'text_clean' columns.
+         search_file_name: The name of the file containing the search query sequence.
+         reference_file_name: The name of the file to search within.
+
+     Returns:
+         A DataFrame with columns ('Page1_Index', 'Page2_Index', 'Similarity_Score') mapping
+         the consecutive match, or an empty DataFrame if no match is found.
+     """
+     print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
+
+     # Step 1: Isolate the data for each file
+     search_df = df_filtered[df_filtered['file'] == search_file_name]
+     reference_df = df_filtered[df_filtered['file'] == reference_file_name]
+
+     if search_df.empty or reference_df.empty:
+         print("Error: One or both files not found or are empty.")
+         return pd.DataFrame(columns=['Page1_Index', 'Page2_Index'])
+
+     # Step 2: Convert the token data into lists for easy comparison.
+     # We need both the text tokens and their original global indices.
+     query_tokens = search_df['text_clean'].tolist()
+     query_indices = search_df.index.tolist()
+
+     reference_tokens = reference_df['text_clean'].tolist()
+     reference_indices = reference_df.index.tolist()
+
+     query_len = len(query_tokens)
+     all_found_matches = []
+
+     print(f"Searching for a sequence of {query_len} tokens...")
+
+     # Step 3: Use a "sliding window" to search for the query sequence in the reference list.
+     for i in range(len(reference_tokens) - query_len + 1):
+         # The "window" is a slice of the reference list that is the same size as the query
+         window = reference_tokens[i : i + query_len]
+
+         # Step 4: Check whether the window matches the query, with or without trailing punctuation
+         if _sequences_match(query_tokens, window):
+             print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
+
+             # Get the global indices for this entire matching block
+             matching_reference_indices = reference_indices[i : i + query_len]
+
+             # Create the mapping between query indices and the found reference indices
+             for j in range(query_len):
+                 all_found_matches.append(
+                     (query_indices[j], matching_reference_indices[j], 1)
+                 )
+
+             # If you only want the *first* match, uncomment the next line:
+             # break
+
+     if not all_found_matches:
+         print("No matches found")
+         gr.Info("No matches found")
+         return pd.DataFrame(columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
+
+     # Step 5: Create the final DataFrame in the desired format
+     result_df = pd.DataFrame(all_found_matches, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
+     return result_df
+
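The sliding-window scan above can be reduced to a short sketch. This simplified version uses exact token equality only (no punctuation flexibility) and returns (query index, reference index) pairs for every matching window:

```python
# Slide a window of the query's length across the reference token list
# and record index pairs for each window that matches exactly.
def find_window_matches(query_tokens, reference_tokens):
    n = len(query_tokens)
    matches = []
    for i in range(len(reference_tokens) - n + 1):
        if reference_tokens[i:i + n] == query_tokens:
            # Map each query position j to its reference position i + j
            matches.extend((j, i + j) for j in range(n))
    return matches

ref = ["the", "cat", "sat", "on", "the", "cat"]
print(find_window_matches(["the", "cat"], ref))
# matches at reference offsets 0 and 4
```

The module's version additionally carries the tokens' original DataFrame indices and a constant similarity score of 1, but the windowing logic is the same.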
+ def identify_similar_text_sequences(
      df_combined: pd.DataFrame,
+     similarity_threshold: float = 1,
+     min_word_count: int = 1,
      min_consecutive_pages: int = 1,
+     greedy_match: bool = True,
+     combine_pages: bool = False,
+     inter_file_only: bool = False,
+     do_text_clean: bool = True,
+     file1_name: str = '',
+     file2_name: str = '',
+     output_folder: str = "output/",
      progress=Progress(track_tqdm=True)
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
      """
+     Identifies similar pages. Uses a highly optimised path for inter_file_only=True.
      """
      progress(0.1, desc="Processing and filtering text")
+
+     if do_text_clean:
+         df = clean_and_stem_text_series(df_combined, 'text')  # Will produce the column 'text_clean'
+     else:
+         df = df_combined.copy()
+         df['text_clean'] = df['text'].str.lower()
+
      df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
+
+     # Ensure min_word_count is an int (e.g. when it arrives from a Gradio text input)
+     try:
+         min_word_count = int(min_word_count)
+     except (TypeError, ValueError):
+         min_word_count = 0
+
      original_row_count = len(df)
      df_filtered = df[df['word_count'] >= min_word_count].copy()
      df_filtered.reset_index(drop=True, inplace=True)

      print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")

      if len(df_filtered) < 2:
          return pd.DataFrame(), [], df_combined

+     # Similarity is calculated differently when comparing between files only (inter_file_only=True) than within the same file
+     if inter_file_only:
+         progress(0.2, desc="Finding direct text matches...")
+
+         base_similarity_df = find_consecutive_sequence_matches(df_filtered, file1_name, file2_name)
+         if base_similarity_df.empty:
+             return pd.DataFrame(), [], df_combined
+
+     else:
+         # Use the original, simpler path for all-to-all comparisons (including intra-file).
+         vectorizer = TfidfVectorizer()
+         print("Standard Path: Calculating all-to-all similarity.")
+         progress(0.2, desc="Vectorizing text...")
+         tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
+
+         progress(0.3, desc="Calculating similarity matrix...")
+         similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
+         coo_matrix = similarity_matrix.tocoo()
+
+         similar_pages = [
+             (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
+             if r < c and v >= similarity_threshold
+         ]
+
+         if not similar_pages:
              return pd.DataFrame(), [], df_combined
+
+         base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
795
+ progress(0.7, desc="Aggregating results based on matching strategy")
 
 
 
796
 
797
+ if greedy_match or min_consecutive_pages > 1:
798
+ print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
799
+
800
+ # Sort the dataframe to ensure consecutive pages are adjacent
801
+ similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
802
+
803
+ # A new sequence starts if the difference from the previous row is not (1, 1)
804
+ # is_consecutive will be True if a row continues the sequence, False if it's a new one.
805
  is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
806
+
807
+ # Use cumsum() on the inverted boolean series to create a unique ID for each block.
808
+ # Every time a 'False' appears (a new block starts), the sum increases.
809
  block_id = is_consecutive.eq(False).cumsum()
810
+
811
+ # Group by this block ID
812
  grouped = similarity_df.groupby(block_id)
813
+
814
+ # Aggregate each group to get the start, end, and length of the match
815
  agg_results = grouped.agg(
816
+ Page1_Start_Index=('Page1_Index', 'first'),
817
+ Page2_Start_Index=('Page2_Index', 'first'),
818
+ Page1_End_Index=('Page1_Index', 'last'),
819
+ Page2_End_Index=('Page2_Index', 'last'),
820
+ Match_Length=('Page1_Index', 'size'),
821
+ Avg_Similarity=('Similarity_Score', 'mean')
822
  ).reset_index(drop=True)
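The diff/cumsum trick above is worth seeing in isolation: rows whose `(Page1_Index, Page2_Index)` both increase by exactly 1 continue a consecutive block, and a cumulative sum over the block starts assigns each block a unique ID to group by:

```python
import pandas as pd

df = pd.DataFrame({
    'Page1_Index': [1, 2, 3, 10, 11],
    'Page2_Index': [5, 6, 7, 20, 21],
    'Similarity_Score': [1.0, 1.0, 1.0, 0.9, 0.9],
})

# True where a row continues the previous row's run (both indices +1)
is_consecutive = (df['Page1_Index'].diff() == 1) & (df['Page2_Index'].diff() == 1)

# Each False marks a new block; cumsum turns block starts into block IDs
block_id = is_consecutive.eq(False).cumsum()

agg = df.groupby(block_id).agg(
    Page1_Start_Index=('Page1_Index', 'first'),
    Page1_End_Index=('Page1_Index', 'last'),
    Match_Length=('Page1_Index', 'size'),
    Avg_Similarity=('Similarity_Score', 'mean'),
).reset_index(drop=True)
print(agg)
# two blocks: pages 1-3 (length 3) and pages 10-11 (length 2)
```

Because `diff()` yields `NaN` on the first row, the comparison with 1 is False there, which correctly starts the first block.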
 
 
+         # If greedy_match=True, keep all matches. If min_consecutive_pages > 1, filter.
+         if greedy_match and min_consecutive_pages <= 1:
+             subdocument_df = agg_results
+         else:
+             # This handles the case for min_consecutive_pages > 1
+             subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
+
+         if subdocument_df.empty:
+             gr.Info("No matches found")
+             return pd.DataFrame(), [], df_combined
+
+         final_df = map_metadata_subdocument(subdocument_df, df_filtered)
      else:
+         print("Finding single page matches, not greedy (min_consecutive_pages=1)")
+         # Handle the non-sequential case
          final_df = map_metadata_single_page(base_similarity_df, df_filtered)

+     if final_df.empty:
+         gr.Info("No matches found")
+         return pd.DataFrame(), [], df_combined
+
+     progress(0.9, desc="Saving output files")

      output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)

+     gr.Info(f"Found {final_df.shape[0]} match(es)")
+     print(f"Found {final_df.shape[0]} match(es)")

+     return final_df, output_paths, df_combined
+
  def handle_selection_and_preview(evt: gr.SelectData, results_df:pd.DataFrame, full_duplicate_data_by_file: dict):
      """
      This single function handles a user selecting a row. It:

      Wrapper function updated to include the 'greedy_match' boolean.
      """
      if not files:
+         raise Warning("Please upload files to analyse.")

      progress(0, desc="Combining input files...")
+     df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)

      if df_combined.empty:
+         raise Warning("No data found in the uploaded files.")

      # Call the main analysis function with the new parameter
+     results_df, output_paths, full_df = identify_similar_text_sequences(
          df_combined=df_combined,
          similarity_threshold=threshold,
          min_word_count=min_words,

      # Clip text to the first preview_length characters
      full_df['text'] = full_df['text'].str[:preview_length]

      # Preprocess full_data (without preview text) for fast access (run once)
      full_data_by_file = {
          file: df.sort_values('page').set_index('page')

      if results_df.empty:
          gr.Info("No duplicate pages found, no results returned.")

+     return results_df, output_paths, full_data_by_file

  def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
      """
      return all_annotations, newly_added_annotation_group

+ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=list()):
      '''
+     Applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes: when combine_pages is enabled, it identifies duplicate pages across different files and applies redactions accordingly; when disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function uses a PyMuPDF document object to manipulate the PDF file, and takes page sizes into account to apply redactions accurately.
+
+     Args:
+         duplicate_page_numbers_df (pd.DataFrame): A DataFrame containing page numbers identified as duplicates.
+         doc_file_name_with_extension_textbox (str): The name of the document file with its extension.
+         review_file_state (pd.DataFrame): The current state of the review file.
+         duplicate_output_paths (list[str]): A list of paths to files containing duplicate page information.
+         pymupdf_doc (object): A PyMuPDF document object representing the PDF file.
+         page_sizes (list[dict]): A list of dictionaries containing page size information.
+         all_existing_annotations (list[dict]): A list of all existing annotations in the document.
+         combine_pages (bool, optional): Whether to combine pages for redaction. Defaults to True.
+         new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
      '''
+     if all_existing_annotations is None:
+         all_existing_annotations = []
+
+     if new_annotations_with_bounding_boxes is None:
+         new_annotations_with_bounding_boxes = []
+
      all_annotations = all_existing_annotations.copy()

      if not pymupdf_doc:
+         message = "No document file currently under review"
          print(f"Warning: {message}")
          raise Warning(message)

      review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
      review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')

+     out_message = "Successfully created duplicate text redactions."
      print(out_message)
      gr.Info(out_message)

      return review_file_out, all_annotations

  def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
+     """Parses a combined ID into (page, line) using modular arithmetic."""
+     if int(combined_id) < ID_MULTIPLIER:
+         # Handle cases where the page component is 0 (an edge case)
+         return 0, combined_id
+
+     page = combined_id // ID_MULTIPLIER
+     line = combined_id % ID_MULTIPLIER
+     return page, line
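The combined page/line ID scheme can be sketched as follows. The value of `ID_MULTIPLIER` here is an assumption for illustration; the module defines its own constant:

```python
# Pack a (page, line) pair into one integer as page*ID_MULTIPLIER + line,
# then unpack it with floor division and modulo.
ID_MULTIPLIER = 10000  # assumed value for illustration

def make_id(page: int, line: int) -> int:
    return page * ID_MULTIPLIER + line

def parse_id(combined_id: int) -> tuple:
    if combined_id < ID_MULTIPLIER:
        # IDs below the multiplier belong to page 0
        return 0, combined_id
    return divmod(combined_id, ID_MULTIPLIER)

print(parse_id(make_id(12, 345)))  # (12, 345)
print(parse_id(42))                # (0, 42)
```

The scheme is lossless as long as line numbers stay below `ID_MULTIPLIER`.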
 
  def create_annotation_objects_from_duplicates(
      duplicates_df: pd.DataFrame,
      ocr_results_df: pd.DataFrame,
      page_sizes: List[Dict],
+     combine_pages:bool=False) -> List[Dict]:
      """
      Creates structured annotation objects from duplicate line ranges, mapping
      page numbers to image paths.
      """
      final_output = []

+     if duplicates_df.empty:
+         raise Warning("No duplicates found")
+     if ocr_results_df.empty:
+         raise Warning("No OCR results found for the file under review. Please upload the relevant OCR_output file and the original PDF document on the review tab.")
+
      if combine_pages == False:
          page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

          # Prepare OCR Data: Add a line number column if it doesn't exist

          "id": ""  # to be filled in after
          }
          page_number = line_row['page']

          annotations_by_page[page_number].append(box)

      # --- Format the final output list using the page-to-image map ---
      final_output = []

      # Handle cases where a page might not have a corresponding image path
      print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
            f"entry in the 'page_sizes' object. This page's annotations will be skipped.")

+     return final_output

tools/helper_functions.py CHANGED
@@ -9,7 +9,24 @@ import unicodedata
  from typing import List
  from math import ceil
  from gradio_image_annotation import image_annotator
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION

  def reset_state_vars():
      return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -22,7 +39,7 @@ def reset_state_vars():
      show_share_button=False,
      show_remove_button=False,
      interactive=False
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0

  def reset_ocr_results_state():
      return pd.DataFrame(), pd.DataFrame(), []
@@ -85,8 +102,7 @@ def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost
      return

  def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
-     cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :
-     ]
      return cost_code_df

  def ensure_folder_exists(output_folder:str):
@@ -114,7 +130,7 @@ def get_file_name_without_type(file_path):

      return filename_without_extension

- def detect_file_type(filename):
      """Detect the file type based on its extension."""
      if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
          return 'csv'
@@ -132,10 +148,12 @@ def detect_file_type(filename):
          return 'png'
      elif filename.endswith('.xfdf'):
          return 'xfdf'
      else:
          raise ValueError("Unsupported file type.")

- def read_file(filename):
      """Read the file based on its detected type."""
      file_type = detect_file_type(filename)
@@ -156,13 +174,7 @@ def ensure_output_folder_exists(output_folder:str):
      else:
          print(f"The {output_folder} folder already exists.")

- def _get_env_list(env_var_name: str) -> List[str]:
-     """Parses a comma-separated environment variable into a list of strings."""
-     value = env_var_name[1:-1].strip().replace('\"', '').replace("\'","")
-     if not value:
-         return []
-     # Split by comma and filter out any empty strings that might result from extra commas
-     return [s.strip() for s in value.split(',') if s.strip()]

  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
      '''
@@ -188,7 +200,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
      print(output_text)
      else:
          output_text = "No file provided."
-         print(output_text)
          return output_text, custom_regex_df

      return output_text, custom_regex_df
@@ -204,7 +216,7 @@ def put_columns_in_df(in_file:List[str]):
      file_type = detect_file_type(file_name)
      print("File type is:", file_type)

-     if file_type == 'xlsx':
          number_of_excel_files += 1
          new_choices = []
          print("Running through all xlsx sheets")
@@ -220,10 +232,13 @@

          all_sheet_names.extend(new_sheet_names)

-     else:
          df = read_file(file_name)
          new_choices = list(df.columns)

      concat_choices.extend(new_choices)

      # Drop duplicate columns
@@ -262,7 +277,6 @@ def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:
      else:
          return False

- #
  def add_folder_to_path(folder_path: str):
      '''
      Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
@@ -325,10 +339,8 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
      merged_csv_path = output_folder + file_out_name + "_merged.csv"

      # Save the merged DataFrame to a CSV file
-     #merged_csv = StringIO()
-     merged_df.to_csv(merged_csv_path, index=False)
      output_files.append(merged_csv_path)
-     #merged_csv.seek(0) # Move to the beginning of the StringIO object

      return output_files

@@ -575,5 +587,39 @@ def reset_base_dataframe(df:pd.DataFrame):
      return df

  def reset_ocr_base_dataframe(df:pd.DataFrame):
-     return df.iloc[:, [0,1]]

  from typing import List
  from math import ceil
  from gradio_image_annotation import image_annotator
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION, MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES, textract_language_choices, aws_comprehend_language_choices, DEFAULT_LANGUAGE
+ # from tools.load_spacy_model_custom_recognisers import nlp_analyser
+
+ def _get_env_list(env_var_value: str) -> List[str]:
+     """Parses a comma-separated environment variable value into a list of strings."""
+     value = env_var_value[1:-1].strip().replace('\"', '').replace("\'", "")
+     if not value:
+         return []
+     # Split by comma and filter out any empty strings that might result from extra commas
+     return [s.strip() for s in value.split(',') if s.strip()]
+
+ if textract_language_choices: textract_language_choices = _get_env_list(textract_language_choices)
+ if aws_comprehend_language_choices: aws_comprehend_language_choices = _get_env_list(aws_comprehend_language_choices)
+
+ if MAPPED_LANGUAGE_CHOICES: MAPPED_LANGUAGE_CHOICES = _get_env_list(MAPPED_LANGUAGE_CHOICES)
+ if LANGUAGE_CHOICES: LANGUAGE_CHOICES = _get_env_list(LANGUAGE_CHOICES)
+
+ LANGUAGE_MAP = dict(zip(MAPPED_LANGUAGE_CHOICES, LANGUAGE_CHOICES))
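The env-list parsing and language-map construction above can be sketched in isolation. This is an illustrative stand-alone version: the input strings below are assumed example values, not the app's real configuration:

```python
from typing import List

def get_env_list(value: str) -> List[str]:
    """Parse a bracketed, comma-separated config value into a clean list."""
    # Strip surrounding brackets, then stray quote characters
    value = value[1:-1].strip().replace('"', '').replace("'", "")
    if not value:
        return []
    # Split by comma and drop empty fragments from extra commas
    return [s.strip() for s in value.split(',') if s.strip()]

codes = get_env_list('["en", "fr", "de"]')
names = get_env_list('["English", "French", "German"]')

# Zip the two parallel lists into a name -> short-code lookup map
language_map = dict(zip(names, codes))
print(codes)                   # ['en', 'fr', 'de']
print(language_map['French'])  # 'fr'
```

Note the function expects the bracketed form (it unconditionally strips the first and last characters), so callers should guard against empty strings, as the module does with its `if VAR:` checks.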
  def reset_state_vars():
      return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(

      show_share_button=False,
      show_remove_button=False,
      interactive=False
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0, []

  def reset_ocr_results_state():
      return pd.DataFrame(), pd.DataFrame(), []

      return

  def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
+     cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :]
      return cost_code_df

  def ensure_folder_exists(output_folder:str):

      return filename_without_extension

+ def detect_file_type(filename:str):
      """Detect the file type based on its extension."""
      if (filename.endswith('.csv')) | (filename.endswith('.csv.gz')) | (filename.endswith('.zip')):
          return 'csv'

          return 'png'
      elif filename.endswith('.xfdf'):
          return 'xfdf'
+     elif filename.endswith('.docx'):
+         return 'docx'
      else:
          raise ValueError("Unsupported file type.")

+ def read_file(filename:str):
      """Read the file based on its detected type."""
      file_type = detect_file_type(filename)

      else:
          print(f"The {output_folder} folder already exists.")

  def custom_regex_load(in_file:List[str], file_type:str = "allow_list"):
      '''

      print(output_text)
      else:
          output_text = "No file provided."
+         #print(output_text)
          return output_text, custom_regex_df

      return output_text, custom_regex_df

      file_type = detect_file_type(file_name)
      print("File type is:", file_type)

+     if (file_type == 'xlsx') | (file_type == 'xls'):
          number_of_excel_files += 1
          new_choices = []
          print("Running through all xlsx sheets")

          all_sheet_names.extend(new_sheet_names)

+     elif (file_type == "csv") | (file_type == "parquet"):
          df = read_file(file_name)
          new_choices = list(df.columns)

+     else:
+         new_choices = []
+
      concat_choices.extend(new_choices)

      # Drop duplicate columns

      else:
          return False

  def add_folder_to_path(folder_path: str):
      '''
      Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)

      merged_csv_path = output_folder + file_out_name + "_merged.csv"

      # Save the merged DataFrame to a CSV file
+     merged_df.to_csv(merged_csv_path, index=False, encoding="utf-8-sig")
      output_files.append(merged_csv_path)

      return output_files

      return df

  def reset_ocr_base_dataframe(df:pd.DataFrame):
+     if df.empty:
+         return pd.DataFrame(columns=["page", "line", "text"])
+     else:
+         return df.loc[:, ["page", "line", "text"]]
+
+ def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_redaction_value:str):
+     df["index"] = df.index
+     output_df = df.copy()
+
+     df["page"] = df["page"].astype(str)
+
+     output_df_filtered = df.loc[df["page"] == str(page_entity_dropdown_redaction_value), ["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]]
+     return output_df_filtered, output_df
+
+ def update_language_dropdown(chosen_language_full_name_drop, textract_language_choices=textract_language_choices, aws_comprehend_language_choices=aws_comprehend_language_choices, LANGUAGE_MAP=LANGUAGE_MAP):
+     try:
+         full_language_name = chosen_language_full_name_drop.lower()
+         matched_language = LANGUAGE_MAP[full_language_name]
+
+         chosen_language_drop = gr.Dropdown(value=matched_language, choices=LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False, visible=True)
+
+         if matched_language not in aws_comprehend_language_choices and matched_language not in textract_language_choices:
+             gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend or AWS Textract")
+         elif matched_language not in aws_comprehend_language_choices:
+             gr.Info(f"Note that {full_language_name} is not supported by AWS Comprehend")
+         elif matched_language not in textract_language_choices:
+             gr.Info(f"Note that {full_language_name} is not supported by AWS Textract")
+     except Exception as e:
+         print(e)
+         gr.Info("Could not find language in list")
+         chosen_language_drop = gr.Dropdown(value=DEFAULT_LANGUAGE, choices=LANGUAGE_CHOICES, label="Chosen language short code", multiselect=False)
+
+     return chosen_language_drop
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -1,31 +1,255 @@
  from typing import List
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
- from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
  import spacy
- from spacy.matcher import Matcher, PhraseMatcher
  from spaczz.matcher import FuzzyMatcher
  spacy.prefer_gpu()
  from spacy.cli.download import download
  import Levenshtein
  import re
  import gradio as gr

- model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
  score_threshold = 0.001
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]

- #Load spacy model
- try:
-     import en_core_web_lg #en_core_web_sm
-     nlp = en_core_web_lg.load() #en_core_web_sm.load()
-     print("Successfully imported spaCy model")
- except:
-     download(model_name)
-     nlp = spacy.load(model_name)
-     print("Successfully downloaded and imported spaCy model", model_name)

- # #### Custom recognisers
  def custom_word_list_recogniser(custom_list:List[str]=[]):
      # Create regex pattern, handling quotes carefully

@@ -172,6 +396,118 @@ def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):

      return start_positions, end_positions

  def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
      ''' Conduct fuzzy match on a list of text data.'''

@@ -189,9 +525,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
      for string_query in custom_query_list:

-         #print("text:", text)
-         #print("string_query:", string_query)
-
          query = nlp(string_query)

      if search_whole_phrase == False:
@@ -200,8 +533,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
      spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)

-     #print("token_query:", token_query)
-
      if len(token_query) > 1:
          #pattern_lemma = [{"LEMMA": {"IN": query}}]
          pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
@@ -215,7 +546,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
      else:
          # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
-         #tokenised_query = [string_query.lower()]
          # If you want to match the whole phrase, use phrase matcher
          matcher = FuzzyMatcher(nlp.vocab)
          patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
@@ -236,9 +566,7 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
      for match_id, start, end in matches:
          span = str(doc[start:end]).strip()
          query_search = str(query).strip()
-         #print("doc:", doc)
-         #print("span:", span)
-         #print("query_search:", query_search)

          # Convert word positions to character positions
          start_char = doc[start].idx # Start character position
@@ -253,9 +581,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
      for match_id, start, end, ratio, pattern in matches:
          span = str(doc[start:end]).strip()
          query_search = str(query).strip()
-         #print("doc:", doc)
-         #print("span:", span)
-         #print("query_search:", query_search)

          # Calculate Levenshtein distance. Only keep matches with fewer than the specified number of spelling mistakes
          distance = Levenshtein.distance(query_search.lower(), span.lower())
@@ -269,9 +594,6 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista
          start_char = doc[start].idx # Start character position
          end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position

-         #print("start_char:", start_char)
-         #print("end_char:", end_char)
-
          all_matches.append(match_count)
          all_start_positions.append(start_char)
          all_end_positions.append(end_char)
@@ -281,59 +603,4 @@ def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mista

      return all_start_positions, all_end_positions
 
283
 
284
- class CustomWordFuzzyRecognizer(EntityRecognizer):
285
- def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
286
- super().__init__(supported_entities=supported_entities)
287
- self.custom_list = custom_list # Store the custom_list as an instance attribute
288
- self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
289
- self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
290
-
291
- def load(self) -> None:
292
- """No loading is required."""
293
- pass
294
-
295
- def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
296
- """
297
- Logic for detecting a specific PII
298
- """
299
- start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
300
-
301
- results = []
302
-
303
- for i in range(0, len(start_pos)):
304
- result = RecognizerResult(
305
- entity_type="CUSTOM_FUZZY",
306
- start=start_pos[i],
307
- end=end_pos[i],
308
- score=1
309
- )
310
- results.append(result)
311
-
312
- return results
313
-
314
- custom_list_default = []
315
- custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
316
-
317
- # Create a class inheriting from SpacyNlpEngine
318
- class LoadedSpacyNlpEngine(SpacyNlpEngine):
319
- def __init__(self, loaded_spacy_model):
320
- super().__init__()
321
- self.nlp = {"en": loaded_spacy_model}
322
-
323
- # Pass the loaded model to the new LoadedSpacyNlpEngine
324
- loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
325
-
326
-
327
- nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
328
- default_score_threshold=score_threshold,
329
- supported_languages=["en"],
330
- log_decision_process=False,
331
- )
332
-
333
- # Add custom recognisers to nlp_analyser
334
- nlp_analyser.registry.add_recognizer(street_recogniser)
335
- nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
336
- nlp_analyser.registry.add_recognizer(titles_recogniser)
337
- nlp_analyser.registry.add_recognizer(custom_recogniser)
338
- nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
339
 
 
1
  from typing import List
2
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
3
+ from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts, NerModelConfiguration
4
  import spacy
5
+ from spacy.matcher import Matcher
6
  from spaczz.matcher import FuzzyMatcher
7
  spacy.prefer_gpu()
8
  from spacy.cli.download import download
9
  import Levenshtein
10
  import re
11
+ import os
12
+ import requests
13
  import gradio as gr
14
+ from tools.config import DEFAULT_LANGUAGE, TESSERACT_DATA_FOLDER
15
 
 
16
  score_threshold = 0.001
17
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
18
 
19
+ # Create a class inheriting from SpacyNlpEngine
20
+ class LoadedSpacyNlpEngine(SpacyNlpEngine):
21
+ def __init__(self, loaded_spacy_model, language_code: str):
22
+ super().__init__(ner_model_configuration=NerModelConfiguration(labels_to_ignore=["CARDINAL", "ORDINAL"])) # Ignore non-relevant labels
23
+ self.nlp = {language_code: loaded_spacy_model}
24
+
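`NerModelConfiguration(labels_to_ignore=[...])` above tells Presidio to discard spaCy NER labels such as `CARDINAL` and `ORDINAL` before they become PII results. A Presidio-free sketch of that filtering step (the `spans` data and `filter_ner_spans` name are illustrative, not from the repo):

```python
# Minimal stand-in for labels_to_ignore: drop NER spans with ignored labels.
LABELS_TO_IGNORE = {"CARDINAL", "ORDINAL"}

def filter_ner_spans(spans):
    """Keep only (text, label) spans whose label is not ignored."""
    return [(text, label) for text, label in spans if label not in LABELS_TO_IGNORE]

spans = [("John Smith", "PERSON"), ("three", "CARDINAL"),
         ("1st", "ORDINAL"), ("London", "GPE")]
print(filter_ner_spans(spans))  # [('John Smith', 'PERSON'), ('London', 'GPE')]
```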
25
+ def _base_language_code(language: str) -> str:
26
+ lang = _normalize_language_input(language)
27
+ if "_" in lang:
28
+ return lang.split("_")[0]
29
+ return lang
30
+
31
+ def load_spacy_model(language: str = DEFAULT_LANGUAGE):
32
+ """
33
+ Load a spaCy model for the requested language and return it as `nlp`.
34
+
35
+ Accepts common inputs like: "en", "en_lg", "en_sm", "de", "fr", "es", "it", "nl", "pt", "zh", "ja", "xx".
36
+ Falls back through sensible candidates and will download if missing.
37
+ """
38
+
39
+ synonyms = {
40
+ "english": "en",
41
+ "catalan": "ca",
42
+ "danish": "da",
43
+ "german": "de",
44
+ "french": "fr",
45
+ "greek": "el",
46
+ "finnish": "fi",
47
+ "croatian": "hr",
48
+ "lithuanian": "lt",
49
+ "macedonian": "mk",
50
+ "norwegian_bokmaal": "nb",
51
+ "polish": "pl",
52
+ "russian": "ru",
53
+ "slovenian": "sl",
54
+ "swedish": "sv",
55
+ "dutch": "nl",
56
+ "portuguese": "pt",
57
+ "chinese": "zh",
58
+ "japanese": "ja",
59
+ "multilingual": "xx",
60
+ }
61
+
62
+ lang_norm = _normalize_language_input(language)
63
+ lang_norm = synonyms.get(lang_norm, lang_norm)
64
+ base_lang = _base_language_code(lang_norm)
65
+
66
+ candidates_by_lang = {
67
+ # English
68
+ "en": [
69
+ "en_core_web_lg",
70
+ "en_core_web_trf",
71
+ "en_core_web_md",
72
+ "en_core_web_sm",
73
+ ],
74
+ "en_lg": ["en_core_web_lg"],
75
+ "en_trf": ["en_core_web_trf"],
76
+ "en_md": ["en_core_web_md"],
77
+ "en_sm": ["en_core_web_sm"],
78
+
79
+ # Major languages (news pipelines)
80
+ "ca": ["ca_core_news_lg", "ca_core_news_md", "ca_core_news_sm"], # Catalan
81
+ "da": ["da_core_news_lg", "da_core_news_md", "da_core_news_sm"], # Danish
82
+ "de": ["de_core_news_lg", "de_core_news_md", "de_core_news_sm"], # German
83
+ "el": ["el_core_news_lg", "el_core_news_md", "el_core_news_sm"], # Greek
84
+ "es": ["es_core_news_lg", "es_core_news_md", "es_core_news_sm"], # Spanish
85
+ "fi": ["fi_core_news_lg", "fi_core_news_md", "fi_core_news_sm"], # Finnish
86
+ "fr": ["fr_core_news_lg", "fr_core_news_md", "fr_core_news_sm"], # French
87
+ "hr": ["hr_core_news_lg", "hr_core_news_md", "hr_core_news_sm"], # Croatian
88
+ "it": ["it_core_news_lg", "it_core_news_md", "it_core_news_sm"], # Italian
89
+ "ja": ["ja_core_news_lg", "ja_core_news_md", "ja_core_news_sm"], # Japanese
90
+ "ko": ["ko_core_news_lg", "ko_core_news_md", "ko_core_news_sm"], # Korean
91
+ "lt": ["lt_core_news_lg", "lt_core_news_md", "lt_core_news_sm"], # Lithuanian
92
+ "mk": ["mk_core_news_lg", "mk_core_news_md", "mk_core_news_sm"], # Macedonian
93
+ "nb": ["nb_core_news_lg", "nb_core_news_md", "nb_core_news_sm"], # Norwegian Bokmål
94
+ "nl": ["nl_core_news_lg", "nl_core_news_md", "nl_core_news_sm"], # Dutch
95
+ "pl": ["pl_core_news_lg", "pl_core_news_md", "pl_core_news_sm"], # Polish
96
+ "pt": ["pt_core_news_lg", "pt_core_news_md", "pt_core_news_sm"], # Portuguese
97
+ "ro": ["ro_core_news_lg", "ro_core_news_md", "ro_core_news_sm"], # Romanian
98
+ "ru": ["ru_core_news_lg", "ru_core_news_md", "ru_core_news_sm"], # Russian
99
+ "sl": ["sl_core_news_lg", "sl_core_news_md", "sl_core_news_sm"], # Slovenian
100
+ "sv": ["sv_core_news_lg", "sv_core_news_md", "sv_core_news_sm"], # Swedish
101
+ "uk": ["uk_core_news_lg", "uk_core_news_md", "uk_core_news_sm"], # Ukrainian
102
+ "zh": ["zh_core_web_lg", "zh_core_web_mod", "zh_core_web_sm", "zh_core_web_trf"], # Chinese
103
+
104
+ # Multilingual NER
105
+ "xx": ["xx_ent_wiki_sm"],
106
+ }
107
+
108
+ if lang_norm in candidates_by_lang:
109
+ candidates = candidates_by_lang[lang_norm]
110
+ elif base_lang in candidates_by_lang:
111
+ candidates = candidates_by_lang[base_lang]
112
+ else:
113
+ # Fallback to multilingual if unknown
114
+ candidates = candidates_by_lang["xx"]
115
+
116
+ last_error = None
117
+ for candidate in candidates:
118
+ # Try importable package first (fast-path when installed as a package)
119
+ try:
120
+ module = __import__(candidate)
121
+ print(f"Successfully imported spaCy model: {candidate}")
122
+ return module.load()
123
+ except Exception as e:
124
+ last_error = e
125
+
126
+ # Try spacy.load if package is linked/installed
127
+ try:
128
+ nlp = spacy.load(candidate)
129
+ print(f"Successfully loaded spaCy model via spacy.load: {candidate}")
130
+ return nlp
131
+ except Exception as e:
132
+ last_error = e
133
+
134
+ # Model is neither importable nor installed; fall through to a download attempt
146
+
147
+ # Attempt to download then load
148
+ try:
149
+ print(f"Downloading spaCy model: {candidate}")
150
+ download(candidate)
151
+ nlp = spacy.load(candidate)
152
+ print(f"Successfully downloaded and loaded spaCy model: {candidate}")
153
+ return nlp
154
+ except Exception as e:
155
+ last_error = e
156
+ continue
157
+
158
+ raise RuntimeError(f"Failed to load spaCy model for language '{language}'. Last error: {last_error}")
159
+
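The lookup order above is: exact key (e.g. `en_sm`), then the base language, then the multilingual `xx` fallback. That resolution logic can be exercised without spaCy; a sketch with a trimmed candidates table (`resolve_candidates` is an illustrative name, not a repo function):

```python
# Sketch of the model-candidate fallback, with a trimmed candidates table.
CANDIDATES = {
    "en": ["en_core_web_lg", "en_core_web_sm"],
    "en_sm": ["en_core_web_sm"],
    "de": ["de_core_news_lg", "de_core_news_sm"],
    "xx": ["xx_ent_wiki_sm"],
}

def resolve_candidates(lang_norm: str) -> list:
    base = lang_norm.split("_")[0]
    if lang_norm in CANDIDATES:
        return CANDIDATES[lang_norm]   # exact variant, e.g. "en_sm"
    if base in CANDIDATES:
        return CANDIDATES[base]        # fall back to the base language
    return CANDIDATES["xx"]            # unknown language -> multilingual NER

print(resolve_candidates("en_sm"))  # ['en_core_web_sm']
print(resolve_candidates("de_lg"))  # ['de_core_news_lg', 'de_core_news_sm']
print(resolve_candidates("tlh"))    # ['xx_ent_wiki_sm']
```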
160
+ # Normalise language input, e.g. "en-GB" -> "en_gb"
161
+ def _normalize_language_input(language: str) -> str:
162
+ return language.strip().lower().replace("-", "_")
163
+
164
+ # Update the global variables to use the new function
165
+ ACTIVE_LANGUAGE_CODE = _base_language_code(DEFAULT_LANGUAGE)
166
+ nlp = None # Placeholder; loaded by create_nlp_analyser below
167
+
168
+ def get_tesseract_lang_code(short_code:str):
169
+ """
170
+ Maps a two-letter language code to the corresponding Tesseract OCR code.
171
+
172
+ Args:
173
+ short_code (str): The two-letter language code (e.g., "en", "de").
174
+
175
+ Returns:
176
+ str or None: The Tesseract language code (e.g., "eng", "deu"),
177
+ or None if no mapping is found.
178
+ """
179
+ # Mapping from 2-letter codes to Tesseract 3-letter codes
180
+ # Based on ISO 639-2/T codes.
181
+ lang_map = {
182
+ "en": "eng",
183
+ "de": "deu",
184
+ "fr": "fra",
185
+ "es": "spa",
186
+ "it": "ita",
187
+ "nl": "nld",
188
+ "pt": "por",
189
+ "zh": "chi_sim", # Mapping to Simplified Chinese by default
190
+ "ja": "jpn",
191
+ "ko": "kor",
192
+ "lt": "lit",
193
+ "mk": "mkd",
194
+ "nb": "nor",
195
+ "pl": "pol",
196
+ "ro": "ron",
197
+ "ru": "rus",
198
+ "sl": "slv",
199
+ "sv": "swe",
200
+ "uk": "ukr"
201
+ }
202
+
203
+ return lang_map.get(short_code)
204
+
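Usage of this mapping is a plain dict lookup: unmapped codes return `None`, which the downloader below converts into a `ValueError`. A trimmed sketch:

```python
# Trimmed sketch of the 2-letter -> Tesseract traineddata code lookup.
LANG_MAP = {"en": "eng", "de": "deu", "zh": "chi_sim"}

def get_tesseract_lang_code(short_code: str):
    return LANG_MAP.get(short_code)  # None when no mapping exists

print(get_tesseract_lang_code("de"))  # deu
print(get_tesseract_lang_code("xx"))  # None -> caller raises ValueError
```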
205
+ def download_tesseract_lang_pack(short_lang_code:str, tessdata_dir=TESSERACT_DATA_FOLDER):
206
+ """
207
+ Downloads a Tesseract language pack to a local directory.
208
+
209
+ Args:
210
+ short_lang_code (str): The two-letter language code (e.g., "en", "fr").
212
+ tessdata_dir (str, optional): The directory to save the language pack.
213
+ Defaults to TESSERACT_DATA_FOLDER.
213
+ """
214
+
215
+ # Create the directory if it doesn't exist
216
+ if not os.path.exists(tessdata_dir):
217
+ os.makedirs(tessdata_dir)
218
 
219
+ # Get the Tesseract language code
220
+ lang_code = get_tesseract_lang_code(short_lang_code)
 
 
221
 
222
+ if lang_code is None:
223
+ raise ValueError(f"Language code {short_lang_code} not found in Tesseract language map")
224
+
225
+ # Set the local file path
226
+ file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
227
+
228
+ # Check if the file already exists
229
+ if os.path.exists(file_path):
230
+ print(f"Language pack {lang_code}.traineddata already exists at {file_path}")
231
+ return file_path
232
+
233
+ # Construct the URL for the language pack
234
+ url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
235
+
236
+ # Download the file
237
+ try:
238
+ response = requests.get(url, stream=True)
239
+ response.raise_for_status() # Raise an exception for bad status codes
240
+
241
+ with open(file_path, "wb") as f:
242
+ for chunk in response.iter_content(chunk_size=8192):
243
+ f.write(chunk)
244
+
245
+ print(f"Successfully downloaded {lang_code}.traineddata to {file_path}")
246
+ return file_path
247
+
248
+ except requests.exceptions.RequestException as e:
249
+ print(f"Error downloading {lang_code}.traineddata: {e}")
250
+ return None
251
+
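The download target is derived purely from the Tesseract code: a local `<lang>.traineddata` path plus the raw-GitHub URL used above. A sketch of just that string construction (`tessdata_target` is an illustrative helper, not in the repo):

```python
import os

def tessdata_target(lang_code: str, tessdata_dir: str = "tessdata"):
    """Return (local file path, download URL) for a Tesseract language pack."""
    file_path = os.path.join(tessdata_dir, f"{lang_code}.traineddata")
    url = f"https://raw.githubusercontent.com/tesseract-ocr/tessdata/main/{lang_code}.traineddata"
    return file_path, url

path, url = tessdata_target("deu")
print(path)
print(url)
```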
252
+ #### Custom recognisers
253
  def custom_word_list_recogniser(custom_list:List[str]=[]):
254
  # Create regex pattern, handling quotes carefully
255
 
 
396
 
397
  return start_positions, end_positions
398
 
399
+
400
+ class CustomWordFuzzyRecognizer(EntityRecognizer):
401
+ def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
402
+ super().__init__(supported_entities=supported_entities)
403
+ self.custom_list = custom_list # Store the custom_list as an instance attribute
404
+ self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
405
+ self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
406
+
407
+ def load(self) -> None:
408
+ """No loading is required."""
409
+ pass
410
+
411
+ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
412
+ """
413
+ Logic for detecting a specific PII
414
+ """
415
+ start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
416
+
417
+ results = []
418
+
419
+ for i in range(0, len(start_pos)):
420
+ result = RecognizerResult(
421
+ entity_type="CUSTOM_FUZZY",
422
+ start=start_pos[i],
423
+ end=end_pos[i],
424
+ score=1
425
+ )
426
+ results.append(result)
427
+
428
+ return results
429
+
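`analyze` above just zips the two position lists returned by `spacy_fuzzy_search` into `RecognizerResult` objects. The same shape without Presidio, with plain dicts standing in for `RecognizerResult`:

```python
def positions_to_results(start_pos, end_pos):
    """Pair up parallel start/end lists into CUSTOM_FUZZY result records."""
    return [{"entity_type": "CUSTOM_FUZZY", "start": s, "end": e, "score": 1}
            for s, e in zip(start_pos, end_pos)]

results = positions_to_results([0, 10], [4, 15])
print(results)
```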
430
+ custom_list_default = []
431
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
432
+
433
+ # Placeholder engine; create_nlp_analyser below builds the real engine once a model is loaded
434
+ loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp, language_code = ACTIVE_LANGUAGE_CODE)
435
+
436
+ def create_nlp_analyser(language: str = DEFAULT_LANGUAGE, custom_list: List[str] = None,
437
+ spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, existing_nlp_analyser: AnalyzerEngine = None, return_also_model: bool = False):
438
+ """
439
+ Create an nlp_analyser object based on the specified language input.
440
+
441
+ Args:
442
+ language (str): Language code (e.g., "en", "de", "fr", "es", etc.)
443
+ custom_list (List[str], optional): List of custom words to recognize. Defaults to None.
444
+ spelling_mistakes_max (int, optional): Maximum number of spelling mistakes for fuzzy matching. Defaults to 1.
445
+ search_whole_phrase (bool, optional): Whether to search for whole phrases or individual words. Defaults to True.
446
+ existing_nlp_analyser (AnalyzerEngine, optional): Existing nlp_analyser object to use. Defaults to None.
447
+ return_also_model (bool, optional): Whether to return the nlp_model object as well. Defaults to False.
448
+
449
+ Returns:
450
+ AnalyzerEngine: Configured nlp_analyser object with custom recognizers
451
+ """
452
+
453
+ if existing_nlp_analyser is not None and existing_nlp_analyser.supported_languages[0] == language:
454
+ print(f"Using existing nlp_analyser for {language}")
455
+ return existing_nlp_analyser
460
+
461
+ # Load spaCy model for the specified language
462
+ nlp_model = load_spacy_model(language)
463
+
464
+ # Get base language code
465
+ base_lang_code = _base_language_code(language)
466
+
467
+ # Create custom recognizers
468
+ if custom_list is None:
469
+ custom_list = []
470
+
471
+ custom_recogniser = custom_word_list_recogniser(custom_list)
472
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(
473
+ supported_entities=["CUSTOM_FUZZY"],
474
+ custom_list=custom_list,
475
+ spelling_mistakes_max=spelling_mistakes_max,
476
+ search_whole_phrase=search_whole_phrase
477
+ )
478
+
479
+ # Create NLP engine with loaded model
480
+ loaded_nlp_engine = LoadedSpacyNlpEngine(
481
+ loaded_spacy_model=nlp_model,
482
+ language_code=base_lang_code
483
+ )
484
+
485
+ # Create analyzer engine
486
+ nlp_analyser = AnalyzerEngine(
487
+ nlp_engine=loaded_nlp_engine,
488
+ default_score_threshold=score_threshold,
489
+ supported_languages=[base_lang_code],
490
+ log_decision_process=False,
491
+ )
492
+
493
+ # Add custom recognizers to nlp_analyser
494
+ nlp_analyser.registry.add_recognizer(custom_recogniser)
495
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
496
+
497
+ # Add language-specific recognizers for English
498
+ if base_lang_code == "en":
499
+ nlp_analyser.registry.add_recognizer(street_recogniser)
500
+ nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
501
+ nlp_analyser.registry.add_recognizer(titles_recogniser)
502
+
503
+ if return_also_model:
504
+ return nlp_analyser, nlp_model
505
+
506
+ return nlp_analyser
507
+
508
+ # Create the default nlp_analyser using the new function
509
+ nlp_analyser, nlp = create_nlp_analyser(DEFAULT_LANGUAGE, return_also_model=True)
510
+
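`create_nlp_analyser` short-circuits when the passed-in analyser already supports the requested language, avoiding a model reload. The reuse check in isolation (`Analyser` and `get_analyser` are stand-ins for illustration, not Presidio classes):

```python
class Analyser:
    def __init__(self, language):
        self.supported_languages = [language]

def get_analyser(language, existing=None):
    # Reuse the existing analyser when its language already matches.
    if existing is not None and existing.supported_languages[0] == language:
        return existing
    return Analyser(language)  # otherwise build a fresh one

en = get_analyser("en")
print(get_analyser("en", existing=en) is en)           # True: reused
print(get_analyser("de", existing=en).supported_languages)  # ['de']: rebuilt
```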
511
  def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
512
  ''' Conduct fuzzy match on a list of text data.'''
513
 
 
525
 
526
  for string_query in custom_query_list:
527
 
 
 
 
528
  query = nlp(string_query)
529
 
530
  if search_whole_phrase == False:
 
533
 
534
  spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
535
 
 
 
536
  if len(token_query) > 1:
537
  #pattern_lemma = [{"LEMMA": {"IN": query}}]
538
  pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
 
546
 
547
  else:
548
  # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
 
549
  # If you want to match the whole phrase, use phrase matcher
550
  matcher = FuzzyMatcher(nlp.vocab)
551
  patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
 
566
  for match_id, start, end in matches:
567
  span = str(doc[start:end]).strip()
568
  query_search = str(query).strip()
569
+
 
 
570
 
571
  # Convert word positions to character positions
572
  start_char = doc[start].idx # Start character position
 
581
  for match_id, start, end, ratio, pattern in matches:
582
  span = str(doc[start:end]).strip()
583
  query_search = str(query).strip()
 
 
 
584
 
585
  # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
586
  distance = Levenshtein.distance(query_search.lower(), span.lower())
 
594
  start_char = doc[start].idx # Start character position
595
  end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
596
 
 
 
 
597
  all_matches.append(match_count)
598
  all_start_positions.append(start_char)
599
  all_end_positions.append(end_char)
 
603
  return all_start_positions, all_end_positions
604
 
605
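The whole-phrase branch above keeps a match only if `Levenshtein.distance(query, span)` is within `spelling_mistakes_max`. A dependency-free Levenshtein implementation to illustrate that filter (the real code uses the `Levenshtein` package):

```python
# Minimal Levenshtein distance, sketching the spelling-mistake filter.
def levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

max_mistakes = 1
for span in ["colour", "color", "colonel"]:
    keep = levenshtein("colour", span.lower()) <= max_mistakes
    print(span, keep)
```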
606
 
tools/redaction_review.py CHANGED
@@ -1,49 +1,62 @@
1
  import os
2
  import re
3
- import gradio as gr
4
  import pandas as pd
5
  import numpy as np
 
 
 
6
  from xml.etree.ElementTree import Element, SubElement, tostring, parse
7
  from xml.dom import minidom
8
  import uuid
9
- from typing import List, Tuple
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
  from pymupdf import Document, Rect
13
  import pymupdf
14
  from PIL import ImageDraw, Image
15
  from datetime import datetime, timezone, timedelta
 
 
16
 
17
  from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
18
- from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21
 
22
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
23
 
24
- def decrease_page(number:int):
25
  '''
26
  Decrease page number for review redactions page.
27
  '''
 
 
 
28
  if number > 1:
29
  return number - 1, number - 1
 
 
 
30
  else:
31
- return 1, 1
32
 
33
- def increase_page(number:int, page_image_annotator_object:AnnotatedImageData):
34
  '''
35
  Increase page number for review redactions page.
36
  '''
37
 
38
- if not page_image_annotator_object:
39
- return 1, 1
 
40
 
41
- max_pages = len(page_image_annotator_object)
42
 
43
  if number < max_pages:
44
  return number + 1, number + 1
 
 
45
  else:
46
- return max_pages, max_pages
47
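The `decrease_page`/`increase_page` helpers clamp the page counter to `[1, max_pages]` and return the value twice (once per bound Gradio output). A sketch, with `max_pages` standing in for `len(page_image_annotator_object)`:

```python
# Sketch of the page-navigation clamping in decrease_page / increase_page.
def decrease_page(number: int):
    new = max(1, number - 1)          # never go below page 1
    return new, new

def increase_page(number: int, max_pages: int):
    new = min(max_pages, number + 1)  # never go past the last page
    return new, new

print(decrease_page(1))     # (1, 1)
print(increase_page(5, 5))  # (5, 5)
print(increase_page(2, 5))  # (3, 3)
```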
 
48
  def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True):
49
  if decrease == False:
@@ -86,8 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
86
  recogniser_dropdown_value:str,
87
  text_dropdown_value:str,
88
  page_dropdown_value:str,
89
- review_df:pd.DataFrame=[],
90
- page_sizes:List[str]=[]):
91
  '''
92
  Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
93
  '''
@@ -134,7 +147,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
134
 
135
  return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
136
 
137
- def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
138
  '''
139
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
140
  '''
@@ -166,7 +179,11 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
166
  return recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, text_entities_drop, page_entities_drop
167
 
168
  def undo_last_removal(backup_review_state:pd.DataFrame, backup_image_annotations_state:list[dict], backup_recogniser_entity_dataframe_base:pd.DataFrame):
169
- return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
 
 
 
 
170
 
171
  def update_annotator_page_from_review_df(
172
  review_df: pd.DataFrame,
@@ -188,23 +205,24 @@ def update_annotator_page_from_review_df(
188
 
189
  # Get the target page number from the selected row
190
  # Safely access the page number, handling potential errors or empty DataFrame
191
- gradio_annotator_current_page_number: int = 0
192
  annotate_previous_page: int = 0 # Renaming for clarity if needed, matches original output
 
193
  if not selected_recogniser_entity_df_row.empty and 'page' in selected_recogniser_entity_df_row.columns:
194
  try:
195
- # Use .iloc[0] and .item() for robust scalar extraction
196
- gradio_annotator_current_page_number = int(selected_recogniser_entity_df_row['page'].iloc[0])
197
  annotate_previous_page = gradio_annotator_current_page_number # Store original page number
198
  except (IndexError, ValueError, TypeError):
199
- print("Warning: Could not extract valid page number from selected_recogniser_entity_df_row. Defaulting to page 0 (or 1).")
200
  gradio_annotator_current_page_number = 1 # Or 0 depending on 1-based vs 0-based indexing elsewhere
201
 
202
  # Ensure page number is valid and 1-based for external display/logic
203
- if gradio_annotator_current_page_number <= 0:
204
- gradio_annotator_current_page_number = 1
205
 
206
- page_max_reported = len(out_image_annotations_state)
207
  if gradio_annotator_current_page_number > page_max_reported:
 
208
  gradio_annotator_current_page_number = page_max_reported # Cap at max pages
209
 
210
  page_num_reported_zero_indexed = gradio_annotator_current_page_number - 1
@@ -247,7 +265,7 @@ def update_annotator_page_from_review_df(
247
  if not current_page_review_df.empty:
248
  # Convert the current page's review data to annotation list format for *this page*
249
 
250
- current_page_annotations_list = []
251
  # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
252
  # Assuming review_df has compatible columns
253
  expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed
@@ -267,9 +285,6 @@ def update_annotator_page_from_review_df(
267
  current_page_annotations_list = current_page_annotations_list_raw
268
 
269
  # Update the annotations state for the current page
270
- # Each entry in out_image_annotations_state seems to be a dict containing keys like 'image', 'page', 'annotations' (List[dict])
271
- # Need to update the 'annotations' list for the specific page.
272
- # Find the entry for the current page in the state
273
  page_state_entry_found = False
274
  for i, page_state_entry in enumerate(out_image_annotations_state):
275
  # Assuming page_state_entry has a 'page' key (1-based)
@@ -291,16 +306,10 @@ def update_annotator_page_from_review_df(
291
  break
292
 
293
  if not page_state_entry_found:
294
- # This scenario might happen if the current_image_annotations_state didn't initially contain
295
- # an entry for this page number. Depending on the application logic, you might need to
296
- # add a new entry here, but based on the original code's structure, it seems
297
- # out_image_annotations_state is pre-populated for all pages.
298
  print(f"Warning: Entry for page {gradio_annotator_current_page_number} not found in current_image_annotations_state. Cannot update page annotations.")
299
 
300
-
301
- # --- Image Path and Page Size Handling (already seems focused on current page, keep similar logic) ---
302
  # Get the image path for the current page from the updated state
303
- # Ensure the entry exists before accessing
304
  current_image_path = None
305
  if len(out_image_annotations_state) > page_num_reported_zero_indexed and 'image' in out_image_annotations_state[page_num_reported_zero_indexed]:
306
  current_image_path = out_image_annotations_state[page_num_reported_zero_indexed]['image']
@@ -331,13 +340,9 @@ def update_annotator_page_from_review_df(
331
  if not page_sizes_df.empty:
332
  page_sizes = page_sizes_df.to_dict(orient='records')
333
  else:
334
- page_sizes = [] # Ensure page_sizes is a list if df is empty
335
 
336
  # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
337
- # The original code multiplied coordinates for the *entire* document and removed duplicates
338
- # across the *entire* document *after* converting the full review_df to state.
339
- # With the optimized approach, we updated only one page's annotations in the state.
340
-
341
  # Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:
342
  try:
343
  out_image_annotations_state = remove_duplicate_images_with_blank_boxes(out_image_annotations_state)
@@ -352,9 +357,7 @@ def update_annotator_page_from_review_df(
352
  print(f"Warning: Cannot select current page annotator object for index {page_num_reported_zero_indexed}.")
353
  out_current_page_annotator = {} # Or None, depending on expected output type
354
 
355
-
356
- # The original code returns gradio_annotator_current_page_number as the 3rd value,
357
- # which was potentially updated by bounding checks. Keep this.
358
  final_page_number_returned = gradio_annotator_current_page_number
359
 
360
  return (out_current_page_annotator,
@@ -364,6 +367,277 @@ def update_annotator_page_from_review_df(
364
  review_df, # review_df might have its 'page' column type changed, keep it as is or revert if necessary
365
  annotate_previous_page) # The original page number from selected_recogniser_entity_df_row
366
367
  def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
368
  selected_rows_df: pd.DataFrame,
369
  image_file_paths:List[str],
@@ -437,6 +711,7 @@ def replace_annotator_object_img_np_array_with_page_sizes_image_path(
437
 
438
  def replace_placeholder_image_with_real_image(doc_full_file_name_textbox:str, current_image_path:str, page_sizes_df:pd.DataFrame, page_num_reported:int, input_folder:str):
439
  ''' If image path is still not valid, load in a new image an overwrite it. Then replace all items in the image annotation object for all pages based on the updated information.'''
 
440
  page_num_reported_zero_indexed = page_num_reported - 1
441
 
442
  if not os.path.exists(current_image_path):
@@ -471,11 +746,12 @@ def update_annotator_object_and_filter_df(
471
  gradio_annotator_current_page_number:int,
472
  recogniser_entities_dropdown_value:str="ALL",
473
  page_dropdown_value:str="ALL",
 
474
  text_dropdown_value:str="ALL",
475
- recogniser_dataframe_base:gr.Dataframe=None, # Simplified default
476
  zoom:int=100,
477
  review_df:pd.DataFrame=None, # Use None for default empty DataFrame
478
- page_sizes:List[dict]=[],
479
  doc_full_file_name_textbox:str='',
480
  input_folder:str=INPUT_FOLDER
481
  ) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:
@@ -483,6 +759,7 @@ def update_annotator_object_and_filter_df(
483
  Update a gradio_image_annotation object with new annotation data for the current page
484
  and update filter dataframes, optimizing by processing only the current page's data for display.
485
  '''
 
486
  zoom_str = str(zoom) + '%'
487
 
488
  # Handle default empty review_df and recogniser_dataframe_base
@@ -496,8 +773,9 @@ def update_annotator_object_and_filter_df(
496
  if not all_image_annotations:
497
  print("No all_image_annotation object found")
498
  # Return blank/default outputs
499
- blank_annotator = gr.ImageAnnotator(
500
- value = None, boxes_alpha=0.1, box_thickness=1, label_list=[], label_colors=[],
 
501
  show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
502
  box_selected_thickness=2, handle_size=4, sources=None,
503
  show_clear_button=False, show_share_button=False, show_remove_button=False,
@@ -508,7 +786,7 @@ def update_annotator_object_and_filter_df(
508
 
509
  return (blank_annotator, gr.Number(value=1), gr.Number(value=1), 1,
510
  recogniser_entities_dropdown_value, blank_df_out_gr, blank_df_modified,
511
- [], [], [], []) # Return empty lists/defaults for other outputs
512
 
513
  # Validate and bound the current page number (1-based logic)
514
  page_num_reported = max(1, gradio_annotator_current_page_number) # Minimum page is 1
@@ -519,6 +797,10 @@ def update_annotator_object_and_filter_df(
519
  page_num_reported_zero_indexed = page_num_reported - 1
520
  annotate_previous_page = page_num_reported # Store the determined page number
521
 
 
 
 
 
522
  # --- Process page sizes DataFrame ---
523
  page_sizes_df = pd.DataFrame(page_sizes)
524
  if not page_sizes_df.empty:
@@ -530,29 +812,17 @@ def update_annotator_object_and_filter_df(
530
  print("Warning: Page sizes DataFrame became empty after processing.")
531
 
532
  # --- Handle Image Path Replacement for the Current Page ---
533
- # This modifies the specific page entry within all_image_annotations list
534
- # Assuming replace_annotator_object_img_np_array_with_page_sizes_image_path
535
- # correctly updates the image path within the list element.
536
  if len(all_image_annotations) > page_num_reported_zero_indexed:
537
- # Make a shallow copy of the list and deep copy the specific page dict before modification
538
- # to avoid modifying the input list unexpectedly if it's used elsewhere.
539
- # However, the original code modified the list in place, so we'll stick to that
540
- # pattern but acknowledge it.
541
  page_object_to_update = all_image_annotations[page_num_reported_zero_indexed]
542
 
543
  # Use the helper function to replace the image path within the page object
544
- # Note: This helper returns the potentially modified page_object and the full state.
545
- # The full state return seems redundant if only page_object_to_update is modified.
546
- # Let's call it and assume it correctly updates the item in the list.
547
  updated_page_object, all_image_annotations_after_img_replace = replace_annotator_object_img_np_array_with_page_sizes_image_path(
548
  all_image_annotations, page_object_to_update, page_sizes, page_num_reported)
549
 
550
- # The original code immediately re-assigns all_image_annotations.
551
- # We'll rely on the function modifying the list element in place or returning the updated list.
552
- # Assuming it returns the updated list for robustness:
553
  all_image_annotations = all_image_annotations_after_img_replace
554
 
555
-
556
  # Now handle the actual image file path replacement using replace_placeholder_image_with_real_image
557
  current_image_path = updated_page_object.get('image') # Get potentially updated image path
558
 
@@ -585,7 +855,7 @@ def update_annotator_object_and_filter_df(
585
  if not page_sizes_df.empty:
586
  page_sizes = page_sizes_df.to_dict(orient='records')
587
  else:
588
- page_sizes = [] # Ensure page_sizes is a list if df is empty
589
 
590
  # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
591
  current_page_image_annotator_object = None
@@ -596,7 +866,6 @@ def update_annotator_object_and_filter_df(
596
  # Assuming coordinate multiplication IS needed for display if state stores relative coords
597
  current_page_annotations_df = convert_annotation_data_to_dataframe([page_data_for_display])
598
 
599
-
600
  if not current_page_annotations_df.empty and not page_sizes_df.empty:
601
  # Multiply coordinates *only* for this page's DataFrame
602
  try:
@@ -642,18 +911,19 @@ def update_annotator_object_and_filter_df(
642
 
643
  except Exception as e:
644
  print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
645
- recogniser_entities_list = []
646
- recogniser_colour_list = []
647
  recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
648
  recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
649
- text_entities_drop = []
650
- page_entities_drop = []
651
 
652
 
653
  # --- Final Output Components ---
654
- page_number_reported_gradio_comp = gr.Number(label = "Current page", value=page_num_reported, precision=0)
655
-
656
-
 
657
 
658
  ### Present image_annotator outputs
659
  # Handle the case where current_page_image_annotator_object couldn't be prepared
@@ -683,9 +953,12 @@ def update_annotator_object_and_filter_df(
683
  interactive=True # Keep interactive if data is present
684
  )
685
 
686
- # The original code returned page_number_reported_gradio twice;
687
- # returning the Gradio component and the plain integer value.
688
- # Let's match the output signature.
689
  return (out_image_annotator,
690
  page_number_reported_gradio_comp,
691
  page_number_reported_gradio_comp, # Redundant, but matches original return signature
@@ -695,6 +968,7 @@ def update_annotator_object_and_filter_df(
695
  recogniser_dataframe_modified,
696
  text_entities_drop, # List of text entities for dropdown
697
  page_entities_drop, # List of page numbers for dropdown
 
698
  page_sizes, # Updated page_sizes list
699
  all_image_annotations) # Return the updated full state
700
 
@@ -703,13 +977,19 @@ def update_all_page_annotation_object_based_on_previous_page(
703
  current_page:int,
704
  previous_page:int,
705
  all_image_annotations:List[AnnotatedImageData],
706
- page_sizes:List[dict]=[],
707
  clear_all:bool=False
708
  ):
709
  '''
710
  Overwrite image annotations on the page we are moving from with modifications.
711
  '''
712
 
 
713
  previous_page_zero_index = previous_page -1
714
 
715
  if not current_page: current_page = 1
@@ -718,7 +998,7 @@ def update_all_page_annotation_object_based_on_previous_page(
718
  page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)
719
 
720
  if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
721
- else: all_image_annotations[previous_page_zero_index]["boxes"] = []
722
 
723
  return all_image_annotations, current_page, current_page
724
 
@@ -730,16 +1010,16 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
730
  review_file_state:pd.DataFrame,
731
  output_folder:str = OUTPUT_FOLDER,
732
  save_pdf:bool=True,
733
- page_sizes:List[dict]=[],
734
  COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
735
  progress=gr.Progress(track_tqdm=True)):
736
  '''
737
  Apply modified redactions to a pymupdf and export review files.
738
  '''
739
 
740
- output_files = []
741
- output_log_files = []
742
- pdf_doc = []
743
  review_df = review_file_state
744
 
745
  page_image_annotator_object = all_image_annotations[current_page - 1]
@@ -805,7 +1085,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
805
  doc = [image]
806
 
807
  elif file_extension in '.csv':
808
- pdf_doc = []
809
 
810
  # If working with pdfs
811
  elif is_pdf(file_path) == True:
@@ -815,7 +1095,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
815
  output_files.append(orig_pdf_file_path)
816
 
817
  number_of_pages = pdf_doc.page_count
818
- original_cropboxes = []
819
 
820
  page_sizes_df = pd.DataFrame(page_sizes)
821
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
@@ -961,10 +1241,16 @@ def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:st
961
  '''
962
  if isinstance(choice, str):
963
  choice = [choice]
 
 
964
  if isinstance(label_dropdown_value, str):
965
  label_dropdown_value = [label_dropdown_value]
 
 
966
  if isinstance(text_dropdown_value, str):
967
  text_dropdown_value = [text_dropdown_value]
 
 
968
 
969
  filtered_df = df.copy()
970
 
@@ -989,6 +1275,29 @@ def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:st
989
 
990
  return filtered_df, recogniser_entities_drop, text_entities_drop
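The scalar-to-list coercion above lets a single dropdown string and a multi-select list share one `.isin()` filtering path. A minimal sketch of that pattern (hypothetical helper and column names, not the app's exact schema):

```python
import pandas as pd

def filter_by_dropdowns(df: pd.DataFrame, pages, labels) -> pd.DataFrame:
    # Coerce scalar selections to single-element lists so .isin() works uniformly
    if isinstance(pages, (str, int)):
        pages = [pages]
    if isinstance(labels, str):
        labels = [labels]

    filtered = df.copy()
    # "ALL" acts as a wildcard, mirroring the dropdown defaults
    if pages and "ALL" not in pages:
        filtered = filtered[filtered["page"].isin(pages)]
    if labels and "ALL" not in labels:
        filtered = filtered[filtered["label"].isin(labels)]
    return filtered

demo = pd.DataFrame({"page": [1, 1, 2],
                     "label": ["NAME", "EMAIL", "NAME"],
                     "text": ["a", "b", "c"]})
```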
991
 
992
  def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
993
  '''
994
  Update the rows in a dataframe depending on the user choice from a dropdown
@@ -1042,6 +1351,24 @@ def reset_dropdowns(df:pd.DataFrame):
1042
  def increase_bottom_page_count_based_on_top(page_number:int):
1043
  return int(page_number)
1044
 
1045
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1046
 
1047
  row_value_page = int(evt.row_value[0]) # This is the page number value
@@ -1096,9 +1423,22 @@ def get_all_rows_with_same_text(df: pd.DataFrame, text: str):
1096
  '''
1097
  if text:
1098
  # Get all rows with the same text as the selected row
1099
- return df[df["text"] == text]
1100
  else:
1101
  return pd.DataFrame(columns=["page", "label", "text", "id"])
1102
 
1103
  def update_selected_review_df_row_colour(
1104
  redaction_row_selection: pd.DataFrame,
@@ -1286,7 +1626,7 @@ def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float,
1286
 
1287
  return x1, adobe_y1, x2, adobe_y2
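PyMuPDF measures y from the top-left of the page while Adobe's annotation space measures from the bottom-left, so the y values are mirrored against the page height (and y1/y2 swap so the smaller value stays first). A sketch of the flip as I understand it, not the exact helper above:

```python
def pymupdf_to_adobe_coords(x1: float, y1: float, x2: float, y2: float,
                            page_height: float):
    # The y axis flips: a PyMuPDF y becomes (page_height - y) in Adobe space.
    # y1 and y2 swap so that adobe_y1 remains the lower of the two values.
    adobe_y1 = page_height - y2
    adobe_y2 = page_height - y1
    return x1, adobe_y1, x2, adobe_y2
```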
1288
 
1289
- def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=[], document_cropboxes:List=[], page_sizes:List[dict]=[]):
1290
  '''
1291
  Create an xfdf file from a review csv file and a pdf
1292
  '''
@@ -1378,11 +1718,11 @@ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, i
1378
  reparsed = minidom.parseString(rough_string)
1379
  return reparsed.toxml() #.toprettyxml(indent=" ")
1380
 
1381
- def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=[], page_sizes:List[dict]=[]):
1382
  '''
1383
  Load in files to convert a review file into an Adobe comment file format
1384
  '''
1385
- output_paths = []
1386
  pdf_name = ""
1387
  file_path_name = ""
1388
 
@@ -1481,7 +1821,7 @@ def parse_xfdf(xfdf_path:str):
1481
  # Define the namespace
1482
  namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
1483
 
1484
- redactions = []
1485
 
1486
  # Find all redact elements using the namespace
1487
  for redact in root.findall('.//xfdf:redact', namespaces=namespace):
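Because XFDF elements live in the `http://ns.adobe.com/xfdf/` namespace, every `findall` must be given the namespace map, as above. A self-contained sketch parsing one redact annotation from a synthetic XFDF snippet (attribute names kept minimal for illustration):

```python
import xml.etree.ElementTree as ET

XFDF = """<?xml version="1.0"?>
<xfdf xmlns="http://ns.adobe.com/xfdf/">
  <annots>
    <redact page="0" rect="10,20,110,40"/>
  </annots>
</xfdf>"""

ns = {"xfdf": "http://ns.adobe.com/xfdf/"}
root = ET.fromstring(XFDF)

redactions = []
# Without the namespaces= argument this findall would match nothing
for redact in root.findall(".//xfdf:redact", namespaces=ns):
    page = int(redact.get("page"))
    rect = [float(v) for v in redact.get("rect").split(",")]
    redactions.append({"page": page, "rect": rect})
```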
@@ -1513,8 +1853,8 @@ def convert_xfdf_to_dataframe(file_paths_list:List[str], pymupdf_doc, image_path
1513
  Returns:
1514
  - DataFrame containing redaction information
1515
  '''
1516
- output_paths = []
1517
- xfdf_paths = []
1518
  df = pd.DataFrame()
1519
 
1520
  # Sort the file paths so that the pdfs come first
 
1
  import os
2
  import re
 
3
  import pandas as pd
4
  import numpy as np
6
+ import string
7
+ import random
8
  from xml.etree.ElementTree import Element, SubElement, tostring, parse
9
  from xml.dom import minidom
10
  import uuid
11
+ from typing import List, Tuple, Dict, Set
12
  from gradio_image_annotation import image_annotator
13
  from gradio_image_annotation.image_annotator import AnnotatedImageData
14
  from pymupdf import Document, Rect
15
  import pymupdf
16
  from PIL import ImageDraw, Image
17
  from datetime import datetime, timezone, timedelta
18
+ from collections import defaultdict
19
+ import gradio as gr
20
 
21
  from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
22
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression, fill_missing_ids_in_list
23
  from tools.helper_functions import get_file_name_without_type, detect_file_type
24
  from tools.file_redaction import redact_page_with_pymupdf
25
 
26
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
27
 
28
+ def decrease_page(number:int, all_annotations:List[dict]):
29
  '''
30
  Decrease page number for review redactions page.
31
  '''
32
+ if not all_annotations:
33
+ raise Warning("No annotator object loaded")
34
+
35
  if number > 1:
36
  return number - 1, number - 1
37
+ elif number <= 1:
38
39
+ raise Warning("At first page")
40
  else:
41
+ raise Warning("At first page")
42
 
43
+ def increase_page(number:int, all_annotations:List[dict]):
44
  '''
45
  Increase page number for review redactions page.
46
  '''
47
 
48
+ if not all_annotations:
49
+ raise Warning("No annotator object loaded")
51
 
52
+ max_pages = len(all_annotations)
53
 
54
  if number < max_pages:
55
  return number + 1, number + 1
56
58
  else:
59
+ raise Warning("At last page")
60
 
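The two navigation helpers above step one page at a time and raise a `Warning` at the document bounds instead of wrapping around. A condensed sketch of that clamping behaviour (my own combined helper, not the app's API):

```python
def step_page(number: int, total_pages: int, forward: bool) -> int:
    # Move one page in either direction, refusing to step past the bounds
    if forward:
        if number < total_pages:
            return number + 1
        raise Warning("At last page")
    if number > 1:
        return number - 1
    raise Warning("At first page")
```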
61
  def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True):
62
  if decrease == False:
 
99
  recogniser_dropdown_value:str,
100
  text_dropdown_value:str,
101
  page_dropdown_value:str,
102
+ review_df:pd.DataFrame=list(),
103
+ page_sizes:List[dict]=list()):
104
  '''
105
  Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.
106
  '''
 
147
 
148
  return recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
149
 
150
+ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData, recogniser_dataframe_base:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=list(), page_sizes:List[dict]=list()):
151
  '''
152
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
153
  '''
 
179
  return recogniser_entities_list, recogniser_dataframe_out_gr, recogniser_dataframe_out, recogniser_entities_drop, text_entities_drop, page_entities_drop
180
 
181
  def undo_last_removal(backup_review_state:pd.DataFrame, backup_image_annotations_state:list[dict], backup_recogniser_entity_dataframe_base:pd.DataFrame):
182
+
183
+ if backup_image_annotations_state:
184
+ return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
185
+ else:
186
+ raise Warning("No actions have been taken to undo")
187
 
188
  def update_annotator_page_from_review_df(
189
  review_df: pd.DataFrame,
 
205
 
206
  # Get the target page number from the selected row
207
  # Safely access the page number, handling potential errors or empty DataFrame
208
+ gradio_annotator_current_page_number: int = 1
209
  annotate_previous_page: int = 0 # Renaming for clarity if needed, matches original output
210
+
211
  if not selected_recogniser_entity_df_row.empty and 'page' in selected_recogniser_entity_df_row.columns:
212
  try:
213
+ selected_page = selected_recogniser_entity_df_row['page'].iloc[0]
214
+ gradio_annotator_current_page_number = int(selected_page)
215
  annotate_previous_page = gradio_annotator_current_page_number # Store original page number
216
  except (IndexError, ValueError, TypeError):
217
+ print("Warning: Could not extract valid page number from selected_recogniser_entity_df_row. Defaulting to page 1.")
218
  gradio_annotator_current_page_number = 1 # Or 0 depending on 1-based vs 0-based indexing elsewhere
219
 
220
  # Ensure page number is valid and 1-based for external display/logic
221
+ if gradio_annotator_current_page_number <= 0: gradio_annotator_current_page_number = 1
 
222
 
223
+ page_max_reported = len(page_sizes) #len(out_image_annotations_state)
224
  if gradio_annotator_current_page_number > page_max_reported:
225
+ print("current page is greater than highest page:", page_max_reported)
226
  gradio_annotator_current_page_number = page_max_reported # Cap at max pages
227
 
228
  page_num_reported_zero_indexed = gradio_annotator_current_page_number - 1
 
265
  if not current_page_review_df.empty:
266
  # Convert the current page's review data to annotation list format for *this page*
267
 
268
+ current_page_annotations_list = list()
269
  # Define expected annotation dict keys, including 'image', 'page', coords, 'label', 'text', 'color' etc.
270
  # Assuming review_df has compatible columns
271
  expected_annotation_keys = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id'] # Add/remove as needed
 
285
  current_page_annotations_list = current_page_annotations_list_raw
286
 
287
  # Update the annotations state for the current page
288
  page_state_entry_found = False
289
  for i, page_state_entry in enumerate(out_image_annotations_state):
290
  # Assuming page_state_entry has a 'page' key (1-based)
 
306
  break
307
 
308
  if not page_state_entry_found:
309
  print(f"Warning: Entry for page {gradio_annotator_current_page_number} not found in current_image_annotations_state. Cannot update page annotations.")
310
 
311
+ # --- Image Path and Page Size Handling ---
 
312
  # Get the image path for the current page from the updated state
 
313
  current_image_path = None
314
  if len(out_image_annotations_state) > page_num_reported_zero_indexed and 'image' in out_image_annotations_state[page_num_reported_zero_indexed]:
315
  current_image_path = out_image_annotations_state[page_num_reported_zero_indexed]['image']
 
340
  if not page_sizes_df.empty:
341
  page_sizes = page_sizes_df.to_dict(orient='records')
342
  else:
343
+ page_sizes = list() # Ensure page_sizes is a list if df is empty
344
 
345
  # --- Re-evaluate Coordinate Multiplication and Duplicate Removal ---
346
  # Let's assume remove_duplicate_images_with_blank_boxes expects the raw list of dicts state format:
347
  try:
348
  out_image_annotations_state = remove_duplicate_images_with_blank_boxes(out_image_annotations_state)
 
357
  print(f"Warning: Cannot select current page annotator object for index {page_num_reported_zero_indexed}.")
358
  out_current_page_annotator = {} # Or None, depending on expected output type
359
 
360
+ # Return final page number
361
  final_page_number_returned = gradio_annotator_current_page_number
362
 
363
  return (out_current_page_annotator,
 
367
  review_df, # review_df might have its 'page' column type changed, keep it as is or revert if necessary
368
  annotate_previous_page) # The original page number from selected_recogniser_entity_df_row
369
 
370
+ # --- Helper Function for ID Generation ---
371
+ # This function encapsulates your ID logic in a performant, batch-oriented way.
372
+ def _generate_unique_ids(
373
+ num_ids_to_generate: int,
374
+ existing_ids_set: Set[str]
375
+ ) -> List[str]:
376
+ """
377
+ Generates a specified number of unique, 12-character alphanumeric IDs.
378
+
379
+ This is a batch-oriented, performant version of the original
380
+ `fill_missing_ids_in_list` logic, designed to work efficiently
381
+ with DataFrames.
382
+
383
+ Args:
384
+ num_ids_to_generate (int): The number of unique IDs to create.
385
+ existing_ids_set (Set[str]): A set of IDs that are already in use and
386
+ should be avoided.
387
+
388
+ Returns:
389
+ List[str]: A list of newly generated unique IDs.
390
+ """
391
+ id_length = 12
392
+ character_set = string.ascii_letters + string.digits
393
+
394
+ newly_generated_ids = set()
395
+
396
+ # The while loop ensures we generate exactly the number of IDs required,
397
+ # automatically handling the astronomically rare case of a collision.
398
+ while len(newly_generated_ids) < num_ids_to_generate:
399
+ candidate_id = ''.join(random.choices(character_set, k=id_length))
400
+
401
+ # Check against both pre-existing IDs and IDs generated in this batch
402
+ if candidate_id not in existing_ids_set and candidate_id not in newly_generated_ids:
403
+ newly_generated_ids.add(candidate_id)
404
+
405
+ return list(newly_generated_ids)
406
+
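A usage sketch of the batch ID helper above, restated compactly so the snippet runs standalone:

```python
import random
import string
from typing import List, Set

def generate_unique_ids(n: int, existing: Set[str]) -> List[str]:
    # Draw 12-character alphanumeric IDs until n distinct, unused ones are found;
    # the loop transparently retries on the (astronomically rare) collision
    chars = string.ascii_letters + string.digits
    new_ids: set = set()
    while len(new_ids) < n:
        candidate = ''.join(random.choices(chars, k=12))
        if candidate not in existing and candidate not in new_ids:
            new_ids.add(candidate)
    return list(new_ids)

ids = generate_unique_ids(5, {"aaaaaaaaaaaa"})
```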
407
+ def _merge_horizontally_adjacent_boxes(
408
+ df: pd.DataFrame,
409
+ x_merge_threshold: float = 0.02
410
+ ) -> pd.DataFrame:
411
+ """
412
+ Merges horizontally adjacent bounding boxes within the same line.
413
+
414
+ Args:
415
+ df (pd.DataFrame): DataFrame containing annotation boxes with columns
416
+ like 'page', 'line', 'xmin', 'xmax', etc.
417
+ x_merge_threshold (float): The maximum x-axis gap (in the same units
418
+ as 'xmin'/'xmax') to consider two boxes as adjacent.
419
+
420
+ Returns:
421
+ pd.DataFrame: A new DataFrame with adjacent boxes merged.
422
+ """
423
+ if df.empty:
424
+ return df
425
+
426
+ # 1. Sort values to ensure we are comparing adjacent boxes
427
+ df_sorted = df.sort_values(by=['page', 'line', 'xmin']).copy()
428
+
429
+ # 2. Identify groups of boxes to merge using shift() and cumsum()
430
+ # Get properties of the 'previous' box in the sorted list
431
+ prev_xmax = df_sorted['xmax'].shift(1)
432
+ prev_page = df_sorted['page'].shift(1)
433
+ prev_line = df_sorted['line'].shift(1)
434
+
435
+ # A box should be merged with the previous one if it's on the same page/line
436
+ # and the horizontal gap is within the threshold.
437
+ is_adjacent = (
438
+ (df_sorted['page'] == prev_page) &
439
+ (df_sorted['line'] == prev_line) &
440
+ (df_sorted['xmin'] - prev_xmax <= x_merge_threshold)
441
+ )
442
+
443
+ # A new group starts wherever a box is NOT adjacent to the previous one.
444
+ # cumsum() on this boolean series creates a unique ID for each group.
445
+ df_sorted['merge_group'] = (~is_adjacent).cumsum()
446
+
447
+ # 3. Aggregate each group into a single bounding box
448
+ # Define how to aggregate each column
449
+ agg_funcs = {
450
+ 'xmin': 'min',
451
+ 'ymin': 'min', # To get the highest point of the combined box
452
+ 'xmax': 'max',
453
+ 'ymax': 'max', # To get the lowest point of the combined box
454
+ 'text': lambda s: ' '.join(s.astype(str)), # Join the text
455
+ # Carry over the first value for columns that are constant within a group
456
+ 'page': 'first',
457
+ 'line': 'first',
458
+ 'image': 'first',
459
+ 'label': 'first',
460
+ 'color': 'first',
461
+ }
462
+
463
+ merged_df = df_sorted.groupby('merge_group').agg(agg_funcs).reset_index(drop=True)
464
+
465
+ print(f"Merged {len(df)} annotations into {len(merged_df)}.")
466
+
467
+ return merged_df
468
+
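To see the shift/cumsum grouping above in action, here is a reduced version run on two words that sit 0.01 apart on the same line (same core column names as the function, dropping the columns it only carries through):

```python
import pandas as pd

def merge_adjacent(df: pd.DataFrame, gap: float = 0.02) -> pd.DataFrame:
    df = df.sort_values(["page", "line", "xmin"]).copy()
    # A box is adjacent if it shares page/line with the previous box and
    # the horizontal gap to it is within the threshold
    adjacent = (
        (df["page"] == df["page"].shift(1))
        & (df["line"] == df["line"].shift(1))
        & (df["xmin"] - df["xmax"].shift(1) <= gap)
    )
    # Each run of adjacent boxes shares one cumsum group id
    df["grp"] = (~adjacent).cumsum()
    return df.groupby("grp").agg(
        xmin=("xmin", "min"), ymin=("ymin", "min"),
        xmax=("xmax", "max"), ymax=("ymax", "max"),
        text=("text", " ".join),
        page=("page", "first"), line=("line", "first"),
    ).reset_index(drop=True)

boxes = pd.DataFrame({
    "page": [1, 1], "line": [1, 1],
    "xmin": [0.10, 0.21], "xmax": [0.20, 0.30],
    "ymin": [0.5, 0.5], "ymax": [0.52, 0.52],
    "text": ["hello", "world"],
})
merged = merge_adjacent(boxes)
```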
469
+ def create_annotation_objects_from_filtered_ocr_results_with_words(
470
+ filtered_ocr_results_with_words_df: pd.DataFrame,
471
+ ocr_results_with_words_df_base: pd.DataFrame,
472
+ page_sizes: List[Dict],
473
+ existing_annotations_df: pd.DataFrame,
474
+ existing_annotations_list: List[Dict],
475
+ existing_recogniser_entity_df: pd.DataFrame,
476
+ redaction_label:str = "Redaction",
477
+ colour_label:str = '(0, 0, 0)',
478
+ progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
479
+ """
480
+ This function processes filtered OCR results with words to create new annotation objects.
+ It merges these new annotations with existing ones, combining horizontally adjacent boxes
+ for cleaner redactions. It also updates the existing recogniser entity DataFrame and
+ returns the updated annotations in both DataFrame and list-of-dicts formats.
481
+
482
+ Args:
483
+ filtered_ocr_results_with_words_df (pd.DataFrame): A DataFrame containing filtered OCR results with words.
484
+ ocr_results_with_words_df_base (pd.DataFrame): The base DataFrame of OCR results with words.
485
+ page_sizes (List[Dict]): A list of dictionaries containing page sizes.
486
+ existing_annotations_df (pd.DataFrame): A DataFrame of existing annotations.
487
+ existing_annotations_list (List[Dict]): A list of dictionaries representing existing annotations.
488
+ existing_recogniser_entity_df (pd.DataFrame): A DataFrame of existing recogniser entities.
489
+ redaction_label (str, optional): Label applied to new annotations. Defaults to "Redaction".
+ colour_label (str, optional): Colour applied to new annotations. Defaults to '(0, 0, 0)'.
+ progress (gr.Progress, optional): A progress tracker. Defaults to gr.Progress().
490
+
491
+ Returns:
492
+ Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
493
+ """
494
+
495
+ # Validate colour_label: must be a 3-number tuple with each value in [0, 255]
496
+ # If invalid, fallback to '(0, 0, 0,)' as requested
497
+ fallback_colour = '(0, 0, 0,)'
498
+ try:
499
+ valid = False
500
+ if isinstance(colour_label, str):
501
+ label_str = colour_label.strip()
502
+ match = re.match(r"^\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,?\s*\)$", label_str)
503
+ if match:
504
+ r_val, g_val, b_val = (int(match.group(1)), int(match.group(2)), int(match.group(3)))
505
+ if 0 <= r_val <= 255 and 0 <= g_val <= 255 and 0 <= b_val <= 255:
506
+ valid = True
507
+ elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
508
+ r_val, g_val, b_val = colour_label
509
+ if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
510
+ colour_label = f'({r_val}, {g_val}, {b_val},)'
511
+ valid = True
512
+ if not valid:
513
+ colour_label = fallback_colour
514
+ except Exception:
515
+ colour_label = fallback_colour
516
+
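The colour validation above, restated as a standalone sketch: accept only `'(r, g, b)'` strings (an optional trailing comma is tolerated) whose components fall in 0–255, and fall back otherwise:

```python
import re

FALLBACK = "(0, 0, 0,)"

def validate_colour_label(label) -> str:
    # Accept "(r, g, b)" or "(r, g, b,)" with each channel in 0..255
    if isinstance(label, str):
        m = re.match(
            r"^\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,?\s*\)$",
            label.strip(),
        )
        if m and all(0 <= int(g) <= 255 for g in m.groups()):
            return label
    return FALLBACK
```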
517
+ progress(0.2, desc="Identifying new redactions to add")
518
+ print("Identifying new redactions to add")
519
+ if filtered_ocr_results_with_words_df.empty:
520
+ print("No new annotations to add.")
521
+ updated_annotations_df = existing_annotations_df.copy()
522
+ else:
523
+ # Assuming index relationship holds for fast lookup
524
+ filtered_ocr_results_with_words_df.index = filtered_ocr_results_with_words_df["index"]
525
+ new_annotations_df = ocr_results_with_words_df_base.loc[filtered_ocr_results_with_words_df.index].copy()
526
+
527
+ if new_annotations_df.empty:
528
+ print("No new annotations to add.")
529
+ updated_annotations_df = existing_annotations_df.copy()
530
+ else:
531
+ page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
532
+
533
+ # Prepare the initial new annotations DataFrame
534
+ new_annotations_df = new_annotations_df.assign(
535
+ image=lambda df: df['page'].map(page_to_image_map),
536
+ label= redaction_label,
537
+ color= colour_label
538
+ ).rename(columns={
539
+ 'word_x0': 'xmin',
540
+ 'word_y0': 'ymin',
541
+ 'word_x1': 'xmax',
542
+ 'word_y1': 'ymax',
543
+ 'word_text': 'text'
544
+ })
545
+
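The assign/rename chain above maps each OCR word row onto the annotation schema in one pass. A reduced sketch with a hypothetical two-page image map:

```python
import pandas as pd

page_to_image = {1: "page_1.png", 2: "page_2.png"}  # hypothetical paths

words = pd.DataFrame({
    "page": [1, 2],
    "word_x0": [0.1, 0.2], "word_y0": [0.3, 0.4],
    "word_x1": [0.15, 0.25], "word_y1": [0.35, 0.45],
    "word_text": ["alpha", "beta"],
})

annotations = words.assign(
    image=lambda df: df["page"].map(page_to_image),  # attach each page's image path
    label="Redaction",
    color="(0, 0, 0)",
).rename(columns={
    "word_x0": "xmin", "word_y0": "ymin",
    "word_x1": "xmax", "word_y1": "ymax",
    "word_text": "text",
})
```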
546
+ progress(0.3, desc="Checking for adjacent annotations to merge...")
547
+ print("Checking for adjacent annotations to merge...")
548
+ new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
549
+
550
+ progress(0.4, desc="Creating new redaction IDs...")
551
+ print("Creating new redaction IDs...")
552
+ existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
553
+ num_new_ids = len(new_annotations_df)
554
+ new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
555
+ new_annotations_df['id'] = new_id_list
556
+
557
+ annotation_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
558
+ new_annotations_df = new_annotations_df[annotation_cols]
559
+
560
+ key_cols = ['page', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'text']
561
+
562
+ progress(0.5, desc="Checking for duplicate redactions")
563
+
564
+ if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
565
+ unique_new_df = new_annotations_df
566
+ else:
567
+ # Do not add duplicate redactions
568
+ merged = pd.merge(
569
+ new_annotations_df,
570
+ existing_annotations_df[key_cols].drop_duplicates(),
571
+ on=key_cols,
572
+ how='left',
573
+ indicator=True
574
+ )
575
+ unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
576
+ #unique_new_df = new_annotations_df
577
+
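The duplicate suppression above is a left anti-join: merge with `indicator=True` and keep only the `'left_only'` rows. A minimal sketch:

```python
import pandas as pd

new = pd.DataFrame({"page": [1, 1], "text": ["alpha", "beta"]})
existing = pd.DataFrame({"page": [1], "text": ["alpha"]})
keys = ["page", "text"]

merged = pd.merge(new, existing[keys].drop_duplicates(),
                  on=keys, how="left", indicator=True)
# Rows found only on the left side are genuinely new
unique_new = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
```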
578
+ print(f"Found {len(unique_new_df)} new unique annotations to add.")
579
+ gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
580
+ updated_annotations_df = pd.concat([existing_annotations_df, unique_new_df], ignore_index=True)
581
+
582
+ # --- Part 4: Convert final DataFrame to list-of-dicts ---
583
+ updated_recogniser_entity_df = pd.DataFrame()
584
+ if not updated_annotations_df.empty:
585
+ updated_recogniser_entity_df = updated_annotations_df[["page", "label", "text", "id"]]
586
+
587
+ if not page_sizes:
588
+ print("Warning: page_sizes is empty. No pages to process.")
589
+ return [], existing_annotations_list, pd.DataFrame(), existing_annotations_df, pd.DataFrame(), existing_recogniser_entity_df
590
+
591
+ all_pages_df = pd.DataFrame(page_sizes).rename(columns={'image_path': 'image'})
592
+
593
+ if not updated_annotations_df.empty:
594
+ page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
595
+ updated_annotations_df['image'] = updated_annotations_df['page'].map(page_to_image_map)
596
+ merged_df = pd.merge(all_pages_df[['image']], updated_annotations_df, on='image', how='left')
597
+ else:
598
+ merged_df = all_pages_df[['image']]
599
+
600
+ # 1. Get the list of image paths in the exact order they appear in page_sizes.
601
+ # all_pages_df was created from page_sizes, so it preserves this order.
602
+ image_order = all_pages_df['image'].tolist()
603
+
604
+ # 2. Convert the 'image' column to a special 'Categorical' type.
605
+ # This tells pandas that this column has a custom, non-alphabetical order.
606
+ merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
607
+
608
+ # 3. Sort the DataFrame based on this new custom order.
609
+ merged_df = merged_df.sort_values('image')
610
+
611
+
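The `pd.Categorical` trick above pins the sort to page order rather than alphabetical image-path order (which would put `p_10` before `p_2`). A sketch:

```python
import pandas as pd

# Deliberately non-alphabetical order, as page order would be
image_order = ["p_10.png", "p_2.png", "p_1.png"]
df = pd.DataFrame({"image": ["p_1.png", "p_10.png", "p_2.png"],
                   "n": [1, 10, 2]})

# Declare the custom order, then sort by it
df["image"] = pd.Categorical(df["image"], categories=image_order, ordered=True)
df = df.sort_values("image").reset_index(drop=True)
```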
612
+ final_annotations_list = list()
613
+ box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
614
+
615
+ # Now, when we group, we use `sort=False`. This tells groupby to respect the
616
+ # DataFrame's current order, which we have just manually set. This is slightly
617
+ # more efficient than letting it sort again.
618
+ for image_path, group in merged_df.groupby('image', sort=False, observed=False):
619
+ # The progress.tqdm wrapper can be added back around the groupby object as you had it.
620
+ # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
621
+
622
+ # Check if the group has actual annotations. iloc[0] is safe because even pages
623
+ # without annotations will have one row with NaN values from the merge.
624
+ if pd.isna(group.iloc[0].get('id')):
625
+ boxes = list()
626
+ else:
627
+ valid_box_cols = [col for col in box_cols if col in group.columns]
628
+ # We should also sort the boxes within a page for consistency (e.g., left-to-right)
629
+ sorted_group = group.sort_values(by=['ymin', 'xmin'])
630
+ boxes = sorted_group[valid_box_cols].to_dict('records')
631
+
632
+ final_annotations_list.append({
633
+ "image": image_path,
634
+ "boxes": boxes
635
+ })
636
+
637
+ progress(1.0, desc="Completed annotation processing")
638
+
639
+ return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
640
+
641
  def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
642
  selected_rows_df: pd.DataFrame,
643
  image_file_paths:List[str],
 
711
 
712
  def replace_placeholder_image_with_real_image(doc_full_file_name_textbox:str, current_image_path:str, page_sizes_df:pd.DataFrame, page_num_reported:int, input_folder:str):
713
''' If image path is still not valid, load in a new image and overwrite it. Then replace all items in the image annotation object for all pages based on the updated information.'''
714
+
715
  page_num_reported_zero_indexed = page_num_reported - 1
716
 
717
  if not os.path.exists(current_image_path):
 
     gradio_annotator_current_page_number:int,
     recogniser_entities_dropdown_value:str="ALL",
     page_dropdown_value:str="ALL",
+    page_dropdown_redaction_value:str="1",
     text_dropdown_value:str="ALL",
+    recogniser_dataframe_base:pd.DataFrame=None, # Simplified default
     zoom:int=100,
     review_df:pd.DataFrame=None, # Use None for default empty DataFrame
+    page_sizes:List[dict]=list(),
     doc_full_file_name_textbox:str='',
     input_folder:str=INPUT_FOLDER
     ) -> Tuple[image_annotator, gr.Number, gr.Number, int, str, gr.Dataframe, pd.DataFrame, List[str], List[str], List[dict], List[AnnotatedImageData]]:

     Update a gradio_image_annotation object with new annotation data for the current page
     and update filter dataframes, optimizing by processing only the current page's data for display.
     '''
+
     zoom_str = str(zoom) + '%'

     # Handle default empty review_df and recogniser_dataframe_base

     if not all_image_annotations:
         print("No all_image_annotation object found")
         # Return blank/default outputs
+
+        blank_annotator = image_annotator(
+            value = None, boxes_alpha=0.1, box_thickness=1, label_list=list(), label_colors=list(),
             show_label=False, height=zoom_str, width=zoom_str, box_min_size=1,
             box_selected_thickness=2, handle_size=4, sources=None,
             show_clear_button=False, show_share_button=False, show_remove_button=False,

         return (blank_annotator, gr.Number(value=1), gr.Number(value=1), 1,
                 recogniser_entities_dropdown_value, blank_df_out_gr, blank_df_modified,
+                [], [], [], [], []) # Return empty lists/defaults for other outputs

     # Validate and bound the current page number (1-based logic)
     page_num_reported = max(1, gradio_annotator_current_page_number) # Minimum page is 1
     page_num_reported_zero_indexed = page_num_reported - 1
     annotate_previous_page = page_num_reported # Store the determined page number

+    if not page_sizes:
+        page_num_reported = 0
+        annotate_previous_page = 0
+
     # --- Process page sizes DataFrame ---
     page_sizes_df = pd.DataFrame(page_sizes)
     if not page_sizes_df.empty:

         print("Warning: Page sizes DataFrame became empty after processing.")

     # --- Handle Image Path Replacement for the Current Page ---
+
     if len(all_image_annotations) > page_num_reported_zero_indexed:
+
         page_object_to_update = all_image_annotations[page_num_reported_zero_indexed]

         # Use the helper function to replace the image path within the page object
         updated_page_object, all_image_annotations_after_img_replace = replace_annotator_object_img_np_array_with_page_sizes_image_path(
             all_image_annotations, page_object_to_update, page_sizes, page_num_reported)

         all_image_annotations = all_image_annotations_after_img_replace

         # Now handle the actual image file path replacement using replace_placeholder_image_with_real_image
         current_image_path = updated_page_object.get('image') # Get potentially updated image path
 
 
     if not page_sizes_df.empty:
         page_sizes = page_sizes_df.to_dict(orient='records')
     else:
+        page_sizes = list() # Ensure page_sizes is a list if df is empty

     # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
     current_page_image_annotator_object = None

     # Assuming coordinate multiplication IS needed for display if state stores relative coords
     current_page_annotations_df = convert_annotation_data_to_dataframe([page_data_for_display])

     if not current_page_annotations_df.empty and not page_sizes_df.empty:
         # Multiply coordinates *only* for this page's DataFrame
         try:
 
     except Exception as e:
         print(f"Error calling update_recogniser_dataframes: {e}. Returning empty/default filter data.")
+        recogniser_entities_list = list()
+        recogniser_colour_list = list()
         recogniser_dataframe_out_gr = gr.Dataframe(pd.DataFrame(columns=["page", "label", "text", "id"]))
         recogniser_dataframe_modified = pd.DataFrame(columns=["page", "label", "text", "id"])
+        text_entities_drop = list()
+        page_entities_drop = list()

     # --- Final Output Components ---
+    if page_sizes:
+        page_number_reported_gradio_comp = gr.Number(label = "Current page", value=page_num_reported, precision=0, maximum=len(page_sizes), minimum=1)
+    else:
+        page_number_reported_gradio_comp = gr.Number(label = "Current page", value=0, precision=0, maximum=9999, minimum=0)

     ### Present image_annotator outputs
     # Handle the case where current_page_image_annotator_object couldn't be prepared
 
         interactive=True # Keep interactive if data is present
     )

+    page_entities_drop_redaction_list = list()
+    all_pages_in_doc_list = [str(i) for i in range(1, len(page_sizes) + 1)]
+    page_entities_drop_redaction_list.extend(all_pages_in_doc_list)
+
+    page_entities_drop_redaction = gr.Dropdown(value = page_dropdown_redaction_value, choices=page_entities_drop_redaction_list, label="Page", allow_custom_value=True)
+
     return (out_image_annotator,
             page_number_reported_gradio_comp,
             page_number_reported_gradio_comp, # Redundant, but matches original return signature

             recogniser_dataframe_modified,
             text_entities_drop, # List of text entities for dropdown
             page_entities_drop, # List of page numbers for dropdown
+            page_entities_drop_redaction,
             page_sizes, # Updated page_sizes list
             all_image_annotations) # Return the updated full state
 
 
     current_page:int,
     previous_page:int,
     all_image_annotations:List[AnnotatedImageData],
+    page_sizes:List[dict]=list(),
     clear_all:bool=False
     ):
     '''
     Overwrite image annotations on the page we are moving from with modifications.
     '''

+    if current_page > len(page_sizes):
+        raise Warning("Selected page is higher than the last page number")
+    elif current_page <= 0:
+        raise Warning("Selected page is lower than the first page number")
+
     previous_page_zero_index = previous_page - 1

     if not current_page: current_page = 1

     page_image_annotator_object, all_image_annotations = replace_annotator_object_img_np_array_with_page_sizes_image_path(all_image_annotations, page_image_annotator_object, page_sizes, previous_page)

     if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
+    else: all_image_annotations[previous_page_zero_index]["boxes"] = list()

     return all_image_annotations, current_page, current_page
 
 
     review_file_state:pd.DataFrame,
     output_folder:str = OUTPUT_FOLDER,
     save_pdf:bool=True,
+    page_sizes:List[dict]=list(),
     COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
     progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf document and export review files.
     '''

+    output_files = list()
+    output_log_files = list()
+    pdf_doc = list()
     review_df = review_file_state

     page_image_annotator_object = all_image_annotations[current_page - 1]
 
         doc = [image]

     elif file_extension in '.csv':
+        pdf_doc = list()

     # If working with pdfs
     elif is_pdf(file_path) == True:

         output_files.append(orig_pdf_file_path)

         number_of_pages = pdf_doc.page_count
+        original_cropboxes = list()

         page_sizes_df = pd.DataFrame(page_sizes)
         page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
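Note: `pd.to_numeric` with `errors="coerce"` turns unparseable page values into `NaN` rather than raising, which is why the code above can safely coerce a mixed-type `page` column. A quick standalone illustration (the sample values are hypothetical):

```python
import pandas as pd

# Hypothetical page_sizes payload with one malformed page value
page_sizes_df = pd.DataFrame({"page": ["1", "2", "not_a_page"]})
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")

print(page_sizes_df["page"].tolist())  # [1.0, 2.0, nan]
```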
 
     '''
     if isinstance(choice, str):
         choice = [choice]
+    elif not isinstance(choice, list):
+        choice = [str(choice)]
     if isinstance(label_dropdown_value, str):
         label_dropdown_value = [label_dropdown_value]
+    elif not isinstance(label_dropdown_value, list):
+        label_dropdown_value = [str(label_dropdown_value)]
     if isinstance(text_dropdown_value, str):
         text_dropdown_value = [text_dropdown_value]
+    elif not isinstance(text_dropdown_value, list):
+        text_dropdown_value = [str(text_dropdown_value)]

     filtered_df = df.copy()
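The str/list normalisation added above can be exercised in isolation; a minimal sketch (the helper name `normalise_choice` is illustrative, not from the codebase):

```python
def normalise_choice(choice):
    # Wrap a bare string in a list; coerce any other non-list value (e.g. an int) to [str(...)]
    if isinstance(choice, str):
        choice = [choice]
    elif not isinstance(choice, list):
        choice = [str(choice)]
    return choice

print(normalise_choice("ALL"))       # ['ALL']
print(normalise_choice(3))           # ['3']
print(normalise_choice(["1", "2"]))  # ['1', '2']
```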
 
 
     return filtered_df, recogniser_entities_drop, text_entities_drop

+def update_redact_choice_df_from_page_dropdown(choice:str, df:pd.DataFrame):
+    '''
+    Filter the rows of the word-level OCR dataframe to the page(s) chosen by the user from a dropdown.
+    '''
+    if isinstance(choice, str):
+        choice = [choice]
+    elif not isinstance(choice, list):
+        choice = [str(choice)]
+
+    if "index" not in df.columns:
+        df["index"] = df.index
+
+    filtered_df = df[["page", "line", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"]].copy()
+
+    # Apply filtering based on dropdown selections
+    if "ALL" not in choice:
+        filtered_df = filtered_df.loc[filtered_df["page"].astype(str).isin(choice)]
+
+    page_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "page")
+    page_entities_drop = gr.Dropdown(value=choice[0], choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
+
+    return filtered_df
+
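The page filter above compares the numeric `page` column as strings, since dropdown values arrive as strings. A standalone sketch with hypothetical data:

```python
import pandas as pd

# Hypothetical word-level OCR rows
df = pd.DataFrame({
    "page": [1, 1, 2],
    "line": [1, 2, 1],
    "word_text": ["alpha", "beta", "gamma"],
})

choice = ["2"]  # dropdown selections are strings
# Cast the page column to str so it matches the string choices
filtered = df.loc[df["page"].astype(str).isin(choice)]
print(filtered["word_text"].tolist())  # ['gamma']
```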
 def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
     '''
     Update the rows in a dataframe depending on the user choice from a dropdown
 
 def increase_bottom_page_count_based_on_top(page_number:int):
     return int(page_number)

+def df_select_callback_dataframe_row_ocr_with_words(df: pd.DataFrame, evt: gr.SelectData):
+
+    row_value_page = int(evt.row_value[0]) # This is the page number value
+    row_value_line = int(evt.row_value[1]) # This is the line number value
+    row_value_text = evt.row_value[2] # This is the word text value
+
+    row_value_x0 = evt.row_value[3] # This is the x0 value
+    row_value_y0 = evt.row_value[4] # This is the y0 value
+    row_value_x1 = evt.row_value[5] # This is the x1 value
+    row_value_y1 = evt.row_value[6] # This is the y1 value
+    row_value_index = evt.row_value[7] # This is the index value
+
+    row_value_df = pd.DataFrame(data={"page":[row_value_page], "line":[row_value_line], "word_text":[row_value_text],
+                                      "word_x0":[row_value_x0], "word_y0":[row_value_y0], "word_x1":[row_value_x1], "word_y1":[row_value_y1], "index":[row_value_index]
+                                      })
+
+    return row_value_df, row_value_text
+
 def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):

     row_value_page = int(evt.row_value[0]) # This is the page number value
 
     '''
     if text:
         # Get all rows with the same text as the selected row
+        return df.loc[df["text"] == text]
     else:
         return pd.DataFrame(columns=["page", "label", "text", "id"])
+
+def get_all_rows_with_same_text_redact(df: pd.DataFrame, text: str):
+    '''
+    Get all rows with the same word text as the selected row, for redaction tasks.
+    '''
+    if "index" not in df.columns:
+        df["index"] = df.index
+
+    if text and not df.empty:
+        # Get all rows with the same text as the selected row
+        return df.loc[df["word_text"] == text]
+    else:
+        return pd.DataFrame(columns=["page", "line", "label", "word_text", "word_x0", "word_y0", "word_x1", "word_y1", "index"])
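The exact-match lookup on `word_text` used by the new helper above can be sketched with hypothetical data:

```python
import pandas as pd

# Hypothetical word-level OCR rows; the same word appears on two pages
df = pd.DataFrame({
    "page": [1, 1, 2],
    "word_text": ["John", "Smith", "John"],
})
df["index"] = df.index  # mirror the helper's index column

# All occurrences of the selected word across the document
matches = df.loc[df["word_text"] == "John"]
print(matches["page"].tolist())  # [1, 2]
```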
 
 def update_selected_review_df_row_colour(
     redaction_row_selection: pd.DataFrame,
 
     return x1, adobe_y1, x2, adobe_y2

+def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str]=list(), document_cropboxes:List=list(), page_sizes:List[dict]=list()):
     '''
     Create an xfdf file from a review csv file and a pdf
     '''
 
     reparsed = minidom.parseString(rough_string)
     return reparsed.toxml() #.toprettyxml(indent=" ")

+def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = OUTPUT_FOLDER, document_cropboxes:List=list(), page_sizes:List[dict]=list()):
     '''
     Load in files to convert a review file into an Adobe comment file format
     '''
+    output_paths = list()
     pdf_name = ""
     file_path_name = ""
 
 
     # Define the namespace
     namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}

+    redactions = list()

     # Find all redact elements using the namespace
     for redact in root.findall('.//xfdf:redact', namespaces=namespace):
 
     Returns:
     - DataFrame containing redaction information
     '''
+    output_paths = list()
+    xfdf_paths = list()
     df = pd.DataFrame()

     # Sort the file paths so that the pdfs come first