In [3]:
import os
import numpy as np
import pandas as pd
import pdf2image as p2i
from os import path
from pathlib import Path
from PIL import Image

# Allow for unlimited image size, some documents are pretty big...
Image.MAX_IMAGE_PIXELS = None


def process_pdf_document(filepath):
    if path.getsize(filepath) == 0:
        # TODO: substitute for logging
        print(f'{filepath} is empty, skipping')
        return []

    pages = p2i.convert_from_path(filepath)
    processed_pages: list[dict] = []

    label = 'other'
    root_dir, doctype = Path(filepath).parts[:2]
    for page_i, page in enumerate(pages):
        if page_i == 0:
            label = doctype
        elif page_i == len(pages) - 1:
            label = f'{label}-last'

        processed_pages.append({
            'filepath': filepath,
            'width': page.width,
            'height': page.height,
            'bytes': page.tobytes(),
            'label': label,
        })

    return processed_pages


def process_training_data():
    data_dir = Path('./data')

    for dirname, _, files in os.walk(data_dir):
        print(f'Processing folder {dirname}')
        doctype = path.basename(dirname)
        df = pd.DataFrame()

        for filename in files:
            print(f'Processing file {filename}')
            filepath = path.join(dirname, filename)
            _, ext = path.splitext(filepath)

            if ext.lower() == '.pdf':
                processed_pages = process_pdf_document(filepath)
                df = pd.concat([df, pd.DataFrame(processed_pages)], ignore_index=True)

        parquet_filepath = path.join(data_dir, f'{doctype}.parquet')
        print(f'Saving data for {doctype} in {parquet_filepath}')
        print(df)
        df.to_parquet(parquet_filepath)

In [None]:
process_training_data()

Processing folder data
Processing file data.parquet
Processing file .DS_Store
Saving data for data in data/data.parquet
Empty DataFrame
Columns: []
Index: []
Processing folder data/form
Processing file 1384753-request-form.pdf
Processing file 772791-12_sipc1301f_peer-individual-evaluation-form.pdf
Processing file 6428556-Cassatt-Form-D-2008.pdf
Processing file 3895717-INFORMATION-DISCLOSURE-REQUEST-FORM.pdf
Processing file 4321053-document-21556514.pdf
Processing file 4575531-REDACTED-PIW-CR285646.pdf
Processing file 6155510-Form-I.pdf
Processing file 3885938-JayaServicesLtd-Appointment-15112002.pdf
Processing file 4950830-Cook1995.pdf
Processing file 6368627-2013-Southern-Company-990.pdf
Processing file 4436484-Columbia-Armed-Guard-application-form.pdf
Processing file 21074299-company-application-form-for-dominicana-acquisition-sa.pdf
Processing file 746365-wa-madsen-barbara-2012.pdf
Processing file 5770184-CAMCF-RFYL-Registration-Form.pdf
Processing file 1683073-medication-info-form-