Spaces:
Sleeping
Sleeping
João Pedro
committed on
Commit
·
861182c
1
Parent(s):
b7d38ff
adding files from github
Browse files- constants.py +29 -0
- environment.yml +20 -0
- pre_processing.ipynb +470 -0
- pre_processing.py +111 -0
- requirements.txt +26 -0
- training.py +214 -0
constants.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import TypedDict, Union, TypeAlias, Tuple
|
3 |
+
|
4 |
+
# Constants
|
5 |
+
DATA_DIR = Path('./data')
|
6 |
+
RAW_DATA_DIR = DATA_DIR / 'raw'
|
7 |
+
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
|
8 |
+
METADATA_FILEPATH = DATA_DIR / 'metadata.csv'
|
9 |
+
|
10 |
+
BATCH_SIZE = 8
|
11 |
+
EPOCHS = 1
|
12 |
+
BERT_BASE = 'bert-base-uncased'
|
13 |
+
MAX_SEQUENCE_LENGHT = 512
|
14 |
+
MODEL_DIR = Path('./model')
|
15 |
+
|
16 |
+
# Types
|
17 |
+
FilePath: TypeAlias = Union[str, Path]
|
18 |
+
|
19 |
+
|
20 |
+
class PageMetadata(TypedDict):
    """Metadata record for a single rendered page of a source PDF.

    NOTE(review): field semantics inferred from names only — confirm
    against the pre-processing code that builds these records.
    """

    # Index of the page within its source document — base (0 or 1) not
    # established here; TODO confirm.
    page_number: int
    # Path to the page's file, relative to the data directory (presumably
    # PROCESSED_DATA_DIR — verify against the writer).
    file_relpath: FilePath
    # Page image dimensions in pixels.
    width: int
    height: int
    # Document-type label, used as the classification target.
    label: str
|
26 |
+
|
27 |
+
|
28 |
+
# (width, height) of a page image, in pixels — TODO confirm ordering matches PIL's (width, height).
ImageSize: TypeAlias = Tuple[int, int]
# Model input shape with a channel dimension — presumably (width, height, channels); verify against the training code.
ImageInputShape: TypeAlias = Tuple[int, int, int]
|
environment.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: hydra
|
2 |
+
channels:
|
3 |
+
- conda-forge
|
4 |
+
dependencies:
|
5 |
+
- ipykernel
|
6 |
+
- ipython
|
7 |
+
- ipython_genutils
|
8 |
+
- jupyter_client
|
9 |
+
- jupyter_core
|
10 |
+
- pandas
|
11 |
+
- pandera
|
12 |
+
- pdf2image
|
13 |
+
- pytables
|
14 |
+
- pytesseract
|
15 |
+
- python=3.12
|
16 |
+
- python-lsp-server
|
17 |
+
- scikit-learn
|
18 |
+
- tesseract
|
19 |
+
- traitlets
|
20 |
+
|
pre_processing.ipynb
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 3,
|
6 |
+
"id": "63d95b8f-7559-4a3d-b025-5bb788914dc9",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import os\n",
|
11 |
+
"import numpy as np\n",
|
12 |
+
"import pandas as pd\n",
|
13 |
+
"import pdf2image as p2i\n",
|
14 |
+
"from os import path\n",
|
15 |
+
"from pathlib import Path\n",
|
16 |
+
"from PIL import Image\n",
|
17 |
+
"\n",
|
18 |
+
"# Allow for unlimited image size, some documents are pretty big...\n",
|
19 |
+
"Image.MAX_IMAGE_PIXELS = None\n",
|
20 |
+
"\n",
|
21 |
+
"\n",
|
22 |
+
"def process_pdf_document(filepath):\n",
|
23 |
+
" if path.getsize(filepath) == 0:\n",
|
24 |
+
" # TODO: substitute for logging\n",
|
25 |
+
" print(f'{filepath} is empty, skipping')\n",
|
26 |
+
" return []\n",
|
27 |
+
"\n",
|
28 |
+
" pages = p2i.convert_from_path(filepath)\n",
|
29 |
+
" processed_pages: list[dict] = []\n",
|
30 |
+
"\n",
|
31 |
+
" label = 'other'\n",
|
32 |
+
" root_dir, doctype = Path(filepath).parts[:2]\n",
|
33 |
+
" for page_i, page in enumerate(pages):\n",
|
34 |
+
" if page_i == 0:\n",
|
35 |
+
" label = doctype\n",
|
36 |
+
" elif page_i == len(pages) - 1:\n",
|
37 |
+
" label = f'{label}-last'\n",
|
38 |
+
"\n",
|
39 |
+
" processed_pages.append({\n",
|
40 |
+
" 'filepath': filepath,\n",
|
41 |
+
" 'width': page.width,\n",
|
42 |
+
" 'height': page.height,\n",
|
43 |
+
" 'bytes': page.tobytes(),\n",
|
44 |
+
" 'label': label,\n",
|
45 |
+
" })\n",
|
46 |
+
"\n",
|
47 |
+
" return processed_pages\n",
|
48 |
+
"\n",
|
49 |
+
"\n",
|
50 |
+
"def process_training_data():\n",
|
51 |
+
" data_dir = Path('./data')\n",
|
52 |
+
"\n",
|
53 |
+
" for dirname, _, files in os.walk(data_dir):\n",
|
54 |
+
" print(f'Processing folder {dirname}')\n",
|
55 |
+
" doctype = path.basename(dirname)\n",
|
56 |
+
" df = pd.DataFrame()\n",
|
57 |
+
"\n",
|
58 |
+
" for filename in files:\n",
|
59 |
+
"print(f'Processing file {filename}')\n",
|
60 |
+
" filepath = path.join(dirname, filename)\n",
|
61 |
+
" _, ext = path.splitext(filepath)\n",
|
62 |
+
"\n",
|
63 |
+
" if ext.lower() == '.pdf':\n",
|
64 |
+
" processed_pages = process_pdf_document(filepath)\n",
|
65 |
+
" df = pd.concat([df, pd.DataFrame(processed_pages)], ignore_index=True)\n",
|
66 |
+
"\n",
|
67 |
+
" parquet_filepath = path.join(data_dir, f'{doctype}.parquet')\n",
|
68 |
+
" print(f'Saving data for {doctype} in {parquet_filepath}')\n",
|
69 |
+
" print(df)\n",
|
70 |
+
" df.to_parquet(parquet_filepath)"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": null,
|
76 |
+
"id": "1efb8e82-92bc-46dc-9553-cdc712211c93",
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [
|
79 |
+
{
|
80 |
+
"name": "stdout",
|
81 |
+
"output_type": "stream",
|
82 |
+
"text": [
|
83 |
+
"Processing folder data\n",
|
84 |
+
"Processing file data.parquet\n",
|
85 |
+
"Processing file .DS_Store\n",
|
86 |
+
"Saving data for data in data/data.parquet\n",
|
87 |
+
"Empty DataFrame\n",
|
88 |
+
"Columns: []\n",
|
89 |
+
"Index: []\n",
|
90 |
+
"Processing folder data/form\n",
|
91 |
+
"Processing file 1384753-request-form.pdf\n",
|
92 |
+
"Processing file 772791-12_sipc1301f_peer-individual-evaluation-form.pdf\n",
|
93 |
+
"Processing file 6428556-Cassatt-Form-D-2008.pdf\n",
|
94 |
+
"Processing file 3895717-INFORMATION-DISCLOSURE-REQUEST-FORM.pdf\n",
|
95 |
+
"Processing file 4321053-document-21556514.pdf\n",
|
96 |
+
"Processing file 4575531-REDACTED-PIW-CR285646.pdf\n",
|
97 |
+
"Processing file 6155510-Form-I.pdf\n",
|
98 |
+
"Processing file 3885938-JayaServicesLtd-Appointment-15112002.pdf\n",
|
99 |
+
"Processing file 4950830-Cook1995.pdf\n",
|
100 |
+
"Processing file 6368627-2013-Southern-Company-990.pdf\n",
|
101 |
+
"Processing file 4436484-Columbia-Armed-Guard-application-form.pdf\n",
|
102 |
+
"Processing file 21074299-company-application-form-for-dominicana-acquisition-sa.pdf\n",
|
103 |
+
"Processing file 746365-wa-madsen-barbara-2012.pdf\n",
|
104 |
+
"Processing file 5770184-CAMCF-RFYL-Registration-Form.pdf\n",
|
105 |
+
"Processing file 1683073-medication-info-form-eng-revised-2014.pdf\n",
|
106 |
+
"Processing file 4423347-Request-Form.pdf\n",
|
107 |
+
"Processing file 3022163-Event-Form.pdf\n",
|
108 |
+
"Processing file 1685242-cv-fresh-entry-form.pdf\n",
|
109 |
+
"Processing file 4910987-Records-Request-Form.pdf\n",
|
110 |
+
"Processing file 4349763-DOD-Form-254.pdf\n",
|
111 |
+
"Processing file 3885273-EIMEuropeanInvestmentsManagement-ChangeofName.pdf\n",
|
112 |
+
"Processing file 4951858-Ober3.pdf\n",
|
113 |
+
"Processing file 7034791-UOF-form.pdf\n",
|
114 |
+
"Processing file 6206290-Thunderbird-GBC1-Application-Form.pdf\n",
|
115 |
+
"Processing file 465336-disclosure-form-13491099365164-_-pdf.pdf\n",
|
116 |
+
"Processing file 6538004-Aderholt-Travel-Disclosure-Form.pdf\n",
|
117 |
+
"Processing file 1292759-host-accept-form-attachment.pdf\n",
|
118 |
+
"Processing file 1238787-request-form.pdf\n",
|
119 |
+
"Processing file 6173981-Cert-of-ID-form.pdf\n",
|
120 |
+
"Processing file 3220944-Sitton-July-2015-Campaign-Finance-Report.pdf\n",
|
121 |
+
"Processing file 6882316-Request-Form.pdf\n",
|
122 |
+
"Processing file 4111213-Coffman-Election-Filings.pdf\n",
|
123 |
+
"Processing file 4773259-RECORDS-REQUEST-FORM.pdf\n",
|
124 |
+
"Processing file 3116806-Black-River-Technical-College-Request-for-C-23.pdf\n",
|
125 |
+
"Processing file 761264-13_ee402a01_expository-essay-interpretation-of.pdf\n",
|
126 |
+
"Processing file 5955192-North-Dakota-corn-dogs.pdf\n",
|
127 |
+
"Processing file 6882816-Disaster-Relief-Cash-Assistance-Application-Form.pdf\n",
|
128 |
+
"Processing file 1303092-appeal-form.pdf\n",
|
129 |
+
"Processing file 3224254-CCT-Holidayhope-2016-Form.pdf\n",
|
130 |
+
"Processing file 528353-disclosure-form-13512855487444-_-pdf.pdf\n",
|
131 |
+
"Processing file 696442-nms-timeline-complaint.pdf\n",
|
132 |
+
"Processing file 842767-shallotte-assisted-living-penalty-packet-4.pdf\n",
|
133 |
+
"Processing file 4484638-GENERAL-APRA-form-2018.pdf\n",
|
134 |
+
"Processing file 20393262-margaux-keiser-form-460.pdf\n",
|
135 |
+
"Processing file 6785703-Release-form.pdf\n",
|
136 |
+
"Processing file 1683896-rtk-form-16241-pdf.pdf\n",
|
137 |
+
"Processing file 3234225-Philadelphia-Form-30401-pdf.pdf\n",
|
138 |
+
"Processing file 442530-collect-files-35870-political-file-2012-non.pdf\n",
|
139 |
+
"Processing file 4108784-2012-FSA-Enrollment-Form.pdf\n",
|
140 |
+
"Processing file 2426599-opega-review-request-form.pdf\n",
|
141 |
+
"Processing file 20429474-caserta-evidence-list.pdf\n",
|
142 |
+
"Processing file 7001493-ABC-Trailblazer-Application-Form-2020-21.pdf\n",
|
143 |
+
"Processing file 6310093-Registration.pdf\n",
|
144 |
+
"Processing file 21085990-mcguire-3q21.pdf\n",
|
145 |
+
"Processing file 6879322-Form-8871.pdf\n",
|
146 |
+
"Processing file 21182127-20220118-docketing-statement-b.pdf\n",
|
147 |
+
"Processing file 5518517-Corbett-email-PRR-017155-pdf.pdf\n",
|
148 |
+
"Processing file 4058754-Records-Release-pdf.pdf\n",
|
149 |
+
"Processing file 1386200-annual-disclosure-request-form.pdf\n",
|
150 |
+
"Processing file 7202517-Photo-Release-Form.pdf\n",
|
151 |
+
"Processing file 3860778-810A-Evidence-Submission-Form.pdf\n",
|
152 |
+
"Processing file 5899836-Medford-Tp-summary-form.pdf\n",
|
153 |
+
"Processing file 5776714-CRID-1046835.pdf\n",
|
154 |
+
"Processing file 6536846-Transcript-Request.pdf\n",
|
155 |
+
"Processing file 7041210-Development-Application-Form.pdf\n",
|
156 |
+
"Processing file 466205-wa-united-for-marriage-avails-request-form.pdf\n",
|
157 |
+
"Processing file 7000910-NCFPD-CORA-Request-Form.pdf\n",
|
158 |
+
"Processing file 21884855-dos-22-04-051.pdf\n",
|
159 |
+
"data/form/21884855-dos-22-04-051.pdf is empty, skipping\n",
|
160 |
+
"Saving data for form in data/form.parquet\n",
|
161 |
+
" filepath width height \\\n",
|
162 |
+
"0 data/form/1384753-request-form.pdf 1700 2200 \n",
|
163 |
+
"1 data/form/772791-12_sipc1301f_peer-individual-... 1700 2200 \n",
|
164 |
+
"2 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
|
165 |
+
"3 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
|
166 |
+
"4 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
|
167 |
+
".. ... ... ... \n",
|
168 |
+
"618 data/form/7041210-Development-Application-Form... 1700 2200 \n",
|
169 |
+
"619 data/form/7041210-Development-Application-Form... 1700 2200 \n",
|
170 |
+
"620 data/form/7041210-Development-Application-Form... 1700 2200 \n",
|
171 |
+
"621 data/form/466205-wa-united-for-marriage-avails... 1700 2200 \n",
|
172 |
+
"622 data/form/7000910-NCFPD-CORA-Request-Form.pdf 1700 2200 \n",
|
173 |
+
"\n",
|
174 |
+
" bytes label \n",
|
175 |
+
"0 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
176 |
+
"1 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
177 |
+
"2 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
|
178 |
+
"3 b'\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe... form \n",
|
179 |
+
"4 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
|
180 |
+
".. ... ... \n",
|
181 |
+
"618 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
182 |
+
"619 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
183 |
+
"620 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form-last \n",
|
184 |
+
"621 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
185 |
+
"622 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
186 |
+
"\n",
|
187 |
+
"[623 rows x 5 columns]\n",
|
188 |
+
"Processing folder data/scientific publication\n",
|
189 |
+
"Processing file 6425893-Oxford-Journals-Article.pdf\n",
|
190 |
+
"Processing file websearch_7_00001_scientific_publication.pdf\n",
|
191 |
+
"Processing file websearch_7_00017_scientific_publication.pdf\n",
|
192 |
+
"Processing file websearch_7_00022_scientific_publication.pdf\n",
|
193 |
+
"Processing file 3923078-Paper-1.pdf\n",
|
194 |
+
"Processing file websearch_7_00025_scientific_publication.pdf\n",
|
195 |
+
"Processing file websearch_7_00010_scientific_publication.pdf\n",
|
196 |
+
"Processing file 4247250-JACOBSON-Original-Article.pdf\n",
|
197 |
+
"Processing file websearch_7_00006_scientific_publication.pdf\n",
|
198 |
+
"Processing file 3559070-McCance-Katz-Et-Al-2017-the-American-Journal-on.pdf\n",
|
199 |
+
"Processing file websearch_7_00011_scientific_publication.pdf\n",
|
200 |
+
"Processing file websearch_7_00024_scientific_publication.pdf\n",
|
201 |
+
"Processing file websearch_7_00007_scientific_publication.pdf\n",
|
202 |
+
"Processing file websearch_7_00000_scientific_publication.pdf\n",
|
203 |
+
"Processing file websearch_7_00023_scientific_publication.pdf\n",
|
204 |
+
"Processing file 21085273-wood-schulman-article.pdf\n",
|
205 |
+
"Processing file websearch_7_00016_scientific_publication.pdf\n",
|
206 |
+
"Processing file 2189923-lancet-article-from-1989.pdf\n",
|
207 |
+
"Processing file 7223327-Forensic-Article.pdf\n",
|
208 |
+
"Processing file websearch_7_00009_scientific_publication.pdf\n",
|
209 |
+
"Processing file websearch_7_00005_scientific_publication.pdf\n",
|
210 |
+
"Processing file 6429770-Anderson2012-Article.pdf\n",
|
211 |
+
"Processing file websearch_7_00013_scientific_publication.pdf\n",
|
212 |
+
"Processing file websearch_7_00026_scientific_publication.pdf\n",
|
213 |
+
"Processing file websearch_7_00021_scientific_publication.pdf\n",
|
214 |
+
"Processing file websearch_7_00018_scientific_publication.pdf\n",
|
215 |
+
"Processing file websearch_7_00014_scientific_publication.pdf\n",
|
216 |
+
"Processing file websearch_7_00002_scientific_publication.pdf\n",
|
217 |
+
"Processing file websearch_7_00019_scientific_publication.pdf\n",
|
218 |
+
"Processing file websearch_7_00015_scientific_publication.pdf\n",
|
219 |
+
"Processing file websearch_7_00020_scientific_publication.pdf\n",
|
220 |
+
"Processing file websearch_7_00003_scientific_publication.pdf\n",
|
221 |
+
"Processing file 5980147-Venu-B-Article-1.pdf\n",
|
222 |
+
"Processing file 20519853-2007-recommendations-for-medical-management-of-adult-lead-exposure.pdf\n",
|
223 |
+
"Processing file 4343429-Meltdown-paper.pdf\n",
|
224 |
+
"Processing file 785243-paper.pdf\n",
|
225 |
+
"Processing file websearch_7_00008_scientific_publication.pdf\n",
|
226 |
+
"Processing file websearch_7_00004_scientific_publication.pdf\n",
|
227 |
+
"Processing file websearch_7_00012_scientific_publication.pdf\n",
|
228 |
+
"Saving data for scientific publication in data/scientific publication.parquet\n",
|
229 |
+
" filepath width height \\\n",
|
230 |
+
"0 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
231 |
+
"1 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
232 |
+
"2 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
233 |
+
"3 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
234 |
+
"4 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
235 |
+
".. ... ... ... \n",
|
236 |
+
"532 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
237 |
+
"533 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
238 |
+
"534 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
239 |
+
"535 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
240 |
+
"536 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
241 |
+
"\n",
|
242 |
+
" bytes \\\n",
|
243 |
+
"0 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
|
244 |
+
"1 b'\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe... \n",
|
245 |
+
"2 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
246 |
+
"3 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
|
247 |
+
"4 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
248 |
+
".. ... \n",
|
249 |
+
"532 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
250 |
+
"533 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
251 |
+
"534 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
252 |
+
"535 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
253 |
+
"536 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
254 |
+
"\n",
|
255 |
+
" label \n",
|
256 |
+
"0 scientific publication \n",
|
257 |
+
"1 scientific publication \n",
|
258 |
+
"2 scientific publication \n",
|
259 |
+
"3 scientific publication \n",
|
260 |
+
"4 scientific publication \n",
|
261 |
+
".. ... \n",
|
262 |
+
"532 scientific publication \n",
|
263 |
+
"533 scientific publication \n",
|
264 |
+
"534 scientific publication \n",
|
265 |
+
"535 scientific publication \n",
|
266 |
+
"536 scientific publication-last \n",
|
267 |
+
"\n",
|
268 |
+
"[537 rows x 5 columns]\n",
|
269 |
+
"Processing folder data/handwritten\n",
|
270 |
+
"Processing file 3223324-Scan-20111118-195759.pdf\n",
|
271 |
+
"Processing file 20743971-handwritten-statement.pdf\n",
|
272 |
+
"Processing file 21030538-doj-donoghue-notes.pdf\n",
|
273 |
+
"Processing file 519547-sere-psychologist-and-bush-admin-torture.pdf\n",
|
274 |
+
"Processing file 6240196-Zenith-Rathfelder-Notes.pdf\n",
|
275 |
+
"Processing file 1630443-nick-stoneman-notes.pdf\n",
|
276 |
+
"Processing file 4182029-HANDWRITTEN-COMPILATION-OF-COLLECTION.pdf\n",
|
277 |
+
"Processing file 528421-handwritten-notes-on-financial-offer-from-orders.pdf\n",
|
278 |
+
"Processing file 5498003-1996-08-11-Handwritten-Law-Enforcement-Notes.pdf\n",
|
279 |
+
"Processing file 202587-doc21.pdf\n",
|
280 |
+
"Processing file 5190901-WasteWS-07062018-145153-Handwritten-Letter-NO-OCR.pdf\n",
|
281 |
+
"Processing file 625990-thatcher-mt-notes.pdf\n",
|
282 |
+
"Processing file 782204-loeser-will.pdf\n",
|
283 |
+
"Processing file 2189928-handwritten-agreement.pdf\n",
|
284 |
+
"Processing file 1223362-gauthe-clinical-notes.pdf\n",
|
285 |
+
"Processing file 4190352-MEMORANDUM-HIDALGO-BALMES-HANDWRITTEN.pdf\n",
|
286 |
+
"Processing file 6939323-7-8-11-83-Handwritten-Notes.pdf\n",
|
287 |
+
"Processing file 2996996-TL-Handwritten-Fax-Labeled-Confidential.pdf\n",
|
288 |
+
"Processing file 3468379-DPD-Use-of-Force-Handwritten-Notes.pdf\n",
|
289 |
+
"Processing file 4188317-LIST-HANDWRITTEN-SUMMARIES-NOTES.pdf\n",
|
290 |
+
"Processing file 339816-jeff-long-notes.pdf\n",
|
291 |
+
"Processing file 3864522-Handwritten-letter-in-a-defense-sentencing-memo.pdf\n",
|
292 |
+
"Processing file 4183043-HANDWRITTEN-MEMO-SUBJ-ARRIVAL-COVERT-KEY-WEST.pdf\n",
|
293 |
+
"Processing file 5002049-Doc-10-09-2018-12-41-31.pdf\n",
|
294 |
+
"Processing file 4780536-Jury-Notes-Manafort-Trial.pdf\n",
|
295 |
+
"Processing file 2300328-gov-lepage-handwritten-note-on-lmf.pdf\n",
|
296 |
+
"Processing file 3002469-Investigation-Notes-Witness-Statements.pdf\n",
|
297 |
+
"Processing file 4193148-11111707.pdf\n",
|
298 |
+
"Processing file 4598117-2002-Handwritten-Notes-of-City-Meeting.pdf\n",
|
299 |
+
"Processing file 4444642-Letter-Chase-Nicholson-s-grandmother.pdf\n",
|
300 |
+
"Processing file 2519466-10-10-13-handwritten-list-of-malfunctioning-doors.pdf\n",
|
301 |
+
"Processing file 2940920-2002-09-12-Letter-Handwritten-Tony-Blair-to.pdf\n",
|
302 |
+
"Processing file 4187525-HANDWRITTEN-NOTE-CUBAN-ACTIVITIES-IN-ARGENTINA.pdf\n",
|
303 |
+
"Processing file 3462227-Notes-2.pdf\n",
|
304 |
+
"Processing file 2791337-FBI-Handwritten-Notes.pdf\n",
|
305 |
+
"Processing file 354332-notes.pdf\n",
|
306 |
+
"Processing file 4182092-HANDWRITTEN-PAPER-CONCERNING-PROJECT-ZRRIFLE.pdf\n",
|
307 |
+
"Processing file 3766497-Detective-Notes.pdf\n",
|
308 |
+
"Processing file 6749500-LEOPOLD-FILES-Bruce-Jessen-Handwritten-Notes.pdf\n",
|
309 |
+
"Processing file 3220458-Handwritten-Note.pdf\n",
|
310 |
+
"Processing file 4188296-HANDWRITTEN-LIST-OF-14-NAMES.pdf\n",
|
311 |
+
"Processing file 804992-fairhope-city-council-resolution-10-14-13.pdf\n",
|
312 |
+
"Processing file 2997471-Handwritten-Notes-to-and-From-Gov-LePage.pdf\n",
|
313 |
+
"Processing file 4598082-Letter-to-Judge-Schroeder.pdf\n",
|
314 |
+
"Processing file 3877562-Mikulcik-Interview-Notes.pdf\n",
|
315 |
+
"Processing file 479570-emily-dickinsons-handwritten-poem.pdf\n",
|
316 |
+
"Processing file 21108937-pv-handwritten-motion-for-trial-transcript-3215.pdf\n",
|
317 |
+
"Processing file 6020949-Aretha-Franklin-Handwritten-Will-1.pdf\n",
|
318 |
+
"Processing file 6464722-Fn-22-03252.pdf\n",
|
319 |
+
"Processing file 21117526-pv-handwritten-pre-sentencing-memorandum-21312.pdf\n",
|
320 |
+
"Processing file 705120-mingle-notes.pdf\n",
|
321 |
+
"Processing file 1213307-rialto-unified-holocaust-essays-set-13-part-05.pdf\n",
|
322 |
+
"Processing file 4189369-HANDWRITTEN-SHEET-WITH-REPORT-NUMBERS.pdf\n",
|
323 |
+
"Processing file 202586-doc20.pdf\n",
|
324 |
+
"Processing file 4184368-OPS-NOTES-ON-AMBIDDY.pdf\n",
|
325 |
+
"Processing file 393660-fibroscopic-notes-original.pdf\n",
|
326 |
+
"Processing file 4182778-HANDWRITTEN-NOTES.pdf\n",
|
327 |
+
"Processing file 4187708-DOCUMENT-3-HANDWRITTEN-NOTE.pdf\n",
|
328 |
+
"Processing file 702325-snitch-list.pdf\n",
|
329 |
+
"Processing file 4180597-HANDWRITTEN-MEMO-RE-IMPLICATION-OF-ALINE-MOSBY.pdf\n",
|
330 |
+
"Processing file 1236260-sidney-holters-note.pdf\n",
|
331 |
+
"Processing file 3223330-Scan-20111118-200635.pdf\n",
|
332 |
+
"Processing file 3223289-Scan-20111116-171028.pdf\n",
|
333 |
+
"Processing file 21043867-2021-01-25-mo-sen-eigel-handwritten-note-sb-66-allows-people-to-run-over-protestors.pdf\n",
|
334 |
+
"Processing file 3233254-Earl-Bradley-Letter-102516.pdf\n",
|
335 |
+
"Processing file 3227804-Cornell-Handwritten-Letters.pdf\n",
|
336 |
+
"Processing file 3862956-Handwritten-Note-of-Meeting-With-Howard-Hill.pdf\n",
|
337 |
+
"Processing file 803848-elvis-presleys-letter-to-richard-nixon.pdf\n",
|
338 |
+
"Processing file 3223331-Scan-20111118-200718.pdf\n",
|
339 |
+
"Processing file 202599-doc33.pdf\n",
|
340 |
+
"Processing file 5690139-Child-s-Australia-Day-Letter.pdf\n",
|
341 |
+
"Processing file 4182800-HANDWRITTEN-MEMO-ON-HUNT-AND-HIS-USE-OF-A-PEN-NAME.pdf\n",
|
342 |
+
"Processing file 3223399-Scan-20111125-141919.pdf\n",
|
343 |
+
"Processing file 3521430-IMG-4827.pdf\n",
|
344 |
+
"Processing file 4420170-Mullkoff-Note.pdf\n",
|
345 |
+
"Processing file 4185023-HANDWRITTEN-NOTE-RE-FILING-DOCUMENTS-IN-DIAZ-L-S.pdf\n",
|
346 |
+
"Processing file 2746630-Cardenas-notes.pdf\n",
|
347 |
+
"Processing file 1086683-hand-written-notes.pdf\n",
|
348 |
+
"Processing file 5190857-WasteWS-07062018-120826-HANDWRITTEN-NO-OCR.pdf\n",
|
349 |
+
"Processing file 4189400-HANDWRITTEN-CARDS-RESEARCH-DEPARTMENT-MEMORANDUM.pdf\n",
|
350 |
+
"Processing file 3223321-Scan-20111118-195438.pdf\n",
|
351 |
+
"Processing file 4328038-Correspondence-From-John-Crowley-to-Scott.pdf\n",
|
352 |
+
"Processing file 5018573-Chad-Notes.pdf\n",
|
353 |
+
"Processing file 21109051-pv-handwritten-letter-requesting-new-federal-defender-counsel-41212.pdf\n",
|
354 |
+
"Processing file 5316881-Journal-Entries.pdf\n",
|
355 |
+
"Processing file 4191532-OFFICE-NOTES-RE-AGENTS.pdf\n",
|
356 |
+
"Processing file 1213274-rialto-unified-holocaust-essays-set-01-part-01.pdf\n",
|
357 |
+
"Processing file 4490103-Gergely-Interview-Notes.pdf\n",
|
358 |
+
"Processing file 2708465-Brandon-Astor-Jones-Timeline-HW.pdf\n",
|
359 |
+
"Processing file 3462228-notes-3.pdf\n",
|
360 |
+
"Processing file 528418-handwritten-note-from-april-3-2001.pdf\n",
|
361 |
+
"Processing file 4185802-HANDWRITTEN-NOTES-MARY-CHECKING-FOR-PERTINENT.pdf\n",
|
362 |
+
"Processing file 3011003-Richard-Bain-s-handwritten-account-of-Sept-4-2012.pdf\n",
|
363 |
+
"Processing file 5002048-Doc-10-09-2018-12-42-54.pdf\n",
|
364 |
+
"Processing file 1372097-applebee-valuch-notes.pdf\n",
|
365 |
+
"Processing file 705119-davies-notes.pdf\n",
|
366 |
+
"Processing file 3237222-Notes-1216.pdf\n",
|
367 |
+
"Processing file 3718215-PP-D0414.pdf\n",
|
368 |
+
"Processing file 4187089-LECTURE-AT-FARM.pdf\n",
|
369 |
+
"Processing file 3223531-Scan-20111202-194427.pdf\n",
|
370 |
+
"Processing file 4450893-Lithuanian-Extradition-Request-for-Release.pdf\n",
|
371 |
+
"Processing file 4193146-11111705.pdf\n",
|
372 |
+
"Processing file 526429-east-coast-rapist-suspects-apology-letter.pdf\n",
|
373 |
+
"Processing file 4181294-HANDWRITTEN-LIST-DDO-FILE-REQUESTS.pdf\n",
|
374 |
+
"Processing file 5764377-Handwritten-Notes.pdf\n",
|
375 |
+
"Processing file 4183714-HANDWRITTEN-NOTE-RE-RICHARD-GIBSON-ISSUE.pdf\n",
|
376 |
+
"Processing file 6393576-Lawson-Letter.pdf\n",
|
377 |
+
"Processing file 1283645-dearest-celeste.pdf\n",
|
378 |
+
"Processing file 2825042-Mathes-Handwritten-Notes.pdf\n",
|
379 |
+
"Processing file 202597-doc31.pdf\n",
|
380 |
+
"Processing file 21011336-foxs-hand-drawn-map.pdf\n",
|
381 |
+
"Processing file 339815-jon-fagg-notes.pdf\n",
|
382 |
+
"Processing file 1336598-hartmannnotes.pdf\n",
|
383 |
+
"Processing file 2650353-1983-10-22-F4-Handritten.pdf\n",
|
384 |
+
"Processing file 282179-july-14-1995-mladic-diary-handwritten.pdf\n",
|
385 |
+
"Processing file 2271360-hartfield-handwritten-writ.pdf\n",
|
386 |
+
"Processing file 4178930-HANDWRITTEN-DOC-SUMMARIES-ILLEGIBLE.pdf\n",
|
387 |
+
"Processing file 5096174-Handwritten-Minutes.pdf\n",
|
388 |
+
"Processing file 6382284-Letter-Ferrell-Scott.pdf\n",
|
389 |
+
"Processing file 2474354-oland-statement-written.pdf\n",
|
390 |
+
"Processing file 1306277-4-9-22-11-hanah-cho.pdf\n",
|
391 |
+
"Processing file 3223449-Scan-20111125-145718.pdf\n",
|
392 |
+
"Processing file 2517459-bill-clintons-handwritten-speech-from-the.pdf\n",
|
393 |
+
"Processing file 3223329-Scan-20111118-200524.pdf\n",
|
394 |
+
"Processing file 4179732-INVENTORY-HANDWRITTEN.pdf\n",
|
395 |
+
"Processing file 3223447-Scan-20111125-145513.pdf\n",
|
396 |
+
"Processing file 786278-frederick-pabsts-handwritten-will.pdf\n",
|
397 |
+
"Processing file 1378369-bricker-doc.pdf\n",
|
398 |
+
"Processing file 1371011-witness-40-journal-entry.pdf\n",
|
399 |
+
"Processing file 4191654-HANDWRITTEN-NOTES-ON-CIA-AND-FBI-DOCUMENTS.pdf\n",
|
400 |
+
"Processing file 803812-iggy-pops-letter-to-a-fan-1995.pdf\n",
|
401 |
+
"Processing file 3223444-Scan-20111125-145344.pdf\n",
|
402 |
+
"Processing file 4193147-11111706.pdf\n",
|
403 |
+
"Processing file 2504031-joseph-brennick-handwritten-memos.pdf\n",
|
404 |
+
"Processing file 21108969-pvs-amended-handwritten-41-page-motion-for-reconsideration-en-banc-11317.pdf\n",
|
405 |
+
"Processing file 5190870-WasteWS-07062018-121609-HANDWRITTEN-NO-OCR-Pdf.pdf\n",
|
406 |
+
"Processing file 3223420-Scan-20111125-142900.pdf\n",
|
407 |
+
"Processing file 555087-vh-defrock-letter.pdf\n",
|
408 |
+
"Processing file 5002051-Doc-10-09-2018-12-39-47.pdf\n",
|
409 |
+
"Processing file 5634528-181217-Flynn-Fbi-Notes.pdf\n",
|
410 |
+
"Processing file 5025013-J-R-Thomas-response.pdf\n",
|
411 |
+
"Processing file 3223527-Scan-20111202-190234.pdf\n",
|
412 |
+
"Processing file 4184490-HANDWRITTEN-BERNARD-BARKER-CHRONOLOGY.pdf\n",
|
413 |
+
"Processing file 4911037-Manuel-Orrego-Savala-Letter.pdf\n",
|
414 |
+
"Processing file 6571779-Frederick-Veal-Interview-Notes.pdf\n",
|
415 |
+
"Processing file 1096680-alexandra-hollinghurst-notes.pdf\n",
|
416 |
+
"Processing file 5780462-GGGW-v-Schwitzer-Handwritten-Ex-Staffer-Stmt.pdf\n",
|
417 |
+
"Processing file 321762-gina-hutchinson-journal-entry.pdf\n",
|
418 |
+
"Processing file 566855-inslee-letter.pdf\n",
|
419 |
+
"Processing file 2811636-Callis-Handwritten-Letter-1999-13-09.pdf\n",
|
420 |
+
"Processing file 803827-a-mothers-letter-to-the-foundling-asylum.pdf\n",
|
421 |
+
"Processing file 4704949-Bechtel-Notes.pdf\n",
|
422 |
+
"Processing file 6771659-Preston-Handwritten-Response.pdf\n",
|
423 |
+
"Processing file 776417-helios-notes.pdf\n",
|
424 |
+
"Processing file 4191914-FORM-MEMORANDUM-FOR-THE-RECORD-HARVEY-LEE-OSWALD.pdf\n",
|
425 |
+
"Processing file 3462229-Notes-1.pdf\n",
|
426 |
+
"Processing file 21030480-doj-notes.pdf\n",
|
427 |
+
"Processing file 1658584-1admission-notes-redacted.pdf\n",
|
428 |
+
"Processing file 1668278-doc066.pdf\n",
|
429 |
+
"Processing file 5332373-1-AEA-2080-NOTES-HANDWRITTEN-CHRONOLOGY-EVENTS.pdf\n",
|
430 |
+
"Processing file 528420-handwritten-notes-of-meeting-with-cori-june-26.pdf\n",
|
431 |
+
"Processing file 2777703-AV-Interview-Notes.pdf\n",
|
432 |
+
"Processing file 2163779-skm-c224e15070214420-pdf-handwritten-notes-about.pdf\n",
|
433 |
+
"Processing file 266726-boogaard-journal.pdf\n"
|
434 |
+
]
|
435 |
+
}
|
436 |
+
],
|
437 |
+
"source": [
|
438 |
+
"process_training_data()"
|
439 |
+
]
|
440 |
+
}
|
441 |
+
],
|
442 |
+
"metadata": {
|
443 |
+
"kernelspec": {
|
444 |
+
"display_name": "Python (hydra)",
|
445 |
+
"language": "python",
|
446 |
+
"name": "hydra"
|
447 |
+
},
|
448 |
+
"language_info": {
|
449 |
+
"codemirror_mode": {
|
450 |
+
"name": "ipython",
|
451 |
+
"version": 3
|
452 |
+
},
|
453 |
+
"file_extension": ".py",
|
454 |
+
"mimetype": "text/x-python",
|
455 |
+
"name": "python",
|
456 |
+
"nbconvert_exporter": "python",
|
457 |
+
"pygments_lexer": "ipython3",
|
458 |
+
"version": "3.13.1"
|
459 |
+
},
|
460 |
+
"widgets": {
|
461 |
+
"application/vnd.jupyter.widget-state+json": {
|
462 |
+
"state": {},
|
463 |
+
"version_major": 2,
|
464 |
+
"version_minor": 0
|
465 |
+
}
|
466 |
+
}
|
467 |
+
},
|
468 |
+
"nbformat": 4,
|
469 |
+
"nbformat_minor": 5
|
470 |
+
}
|
pre_processing.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import os
|
3 |
+
import pandas as pd
|
4 |
+
import pdf2image as p2i
|
5 |
+
import pytesseract
|
6 |
+
from os import path
|
7 |
+
from PIL import Image
|
8 |
+
from typing import List, Tuple
|
9 |
+
from transformers import BertTokenizer
|
10 |
+
from constants import (RAW_DATA_DIR,
|
11 |
+
PROCESSED_DATA_DIR,
|
12 |
+
METADATA_FILEPATH,
|
13 |
+
BERT_BASE,
|
14 |
+
MAX_SEQUENCE_LENGHT,
|
15 |
+
FilePath,
|
16 |
+
PageMetadata)
|
17 |
+
|
18 |
+
# Allow for unlimited image size, some documents are pretty big...
|
19 |
+
Image.MAX_IMAGE_PIXELS = None
|
20 |
+
|
21 |
+
|
22 |
+
def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
    """Build the output image/text file paths for one document page.

    Pages are grouped by label on disk:
    PROCESSED_DATA_DIR/<label>/<basename>_<page_index>.{jpg,txt}.
    The per-label directory is created if it does not exist yet.
    """
    label_dir = path.join(PROCESSED_DATA_DIR, label)
    os.makedirs(label_dir, exist_ok=True)

    page_stem = path.join(label_dir, f'{basename}_{page_index}')
    return f'{page_stem}.jpg', f'{page_stem}.txt'
32 |
+
|
33 |
+
def tokenize_text(text: str) -> Tuple[List[int], List[int]]:
    """Tokenize `text` with the BERT base tokenizer.

    Returns the (input_ids, attention_mask) pair, truncated to
    MAX_SEQUENCE_LENGHT tokens.

    The tokenizer is loaded lazily on first use and cached on the function
    object: the previous implementation re-loaded it from disk on every call,
    and this function runs once per document page.
    """
    tokenizer = getattr(tokenize_text, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(BERT_BASE)
        tokenize_text._tokenizer = tokenizer

    tokenized = tokenizer(
        text,
        # padding=True pads to the longest sequence in the batch; for a
        # single string it is effectively a no-op but is kept for parity.
        padding=True,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGHT,
    )

    return tokenized['input_ids'], tokenized['attention_mask']
|
44 |
+
|
45 |
+
def process_pdf_file(pdf_filepath: FilePath) -> List[PageMetadata]:
    """Convert one PDF into per-page JPEG + OCR text files and return metadata.

    Each page is rendered to an image, OCR'd with tesseract, tokenized with
    the BERT tokenizer and saved under a label-specific folder (see
    make_page_filepaths). Returns one metadata dict per page; empty files
    are skipped and yield an empty list.
    """
    if path.getsize(pdf_filepath) == 0:
        # TODO: substitute for logging
        print(f'{pdf_filepath} is empty, skipping')
        return []

    pages: List[Image.Image] = p2i.convert_from_path(pdf_filepath)
    pages_metadata: List[PageMetadata] = []

    # The immediate parent folder name encodes the document type,
    # e.g. data/raw/<doctype>/file.pdf.
    _, doctype = path.split(path.dirname(pdf_filepath))
    base_filename = path.basename(path.splitext(pdf_filepath)[0])
    for page_i, page in enumerate(pages):
        label = 'other'
        if page_i == 0:
            label = doctype
        # The last page gets a dedicated '<doctype>-last' label; for a
        # single-page document this overrides the first-page label above.
        if page_i == len(pages) - 1:
            label = f'{doctype}-last'

        out_img_filepath, out_txt_filepath = make_page_filepaths(
            base_filename, label, page_i
        )

        page.save(out_img_filepath)

        ocr_text = pytesseract.image_to_string(page)
        input_ids, attention_mask = tokenize_text(ocr_text)
        with open(out_txt_filepath, 'w') as out_txt_file:
            out_txt_file.write(ocr_text)

        pages_metadata.append({
            'page_number': page_i + 1,
            'pdf_filepath': path.relpath(pdf_filepath, start='.'),
            'img_filepath': out_img_filepath,
            'txt_filepath': out_txt_filepath,
            # Persist the tokenized OCR text: the training script builds its
            # datasets from the 'input_ids' / 'attention_mask' columns of the
            # metadata CSV, so dropping them here would break training.
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'width': page.width,
            'height': page.height,
            'label': label,
        })

    return pages_metadata
86 |
+
|
87 |
+
def process_training_data() -> pd.DataFrame:
    """Walk RAW_DATA_DIR, process every PDF and write the metadata CSV.

    Only files with a .pdf extension inside per-doctype subfolders are
    processed. Returns the per-page metadata as a DataFrame, which is also
    written to METADATA_FILEPATH.
    """
    # process_pdf_file returns a list of page dicts and extend() flattens
    # them, so this is a flat list (not List[List[PageMetadata]]).
    pages_metadata: List[PageMetadata] = []

    for dirname, _, files in os.walk(RAW_DATA_DIR):
        # Skip the raw-data root itself: PDFs live in per-doctype subfolders
        # and the folder name is used as the label.
        if path.samefile(dirname, RAW_DATA_DIR):
            continue

        print(f'Processing folder {dirname}')

        for filename in files:
            _, ext = path.splitext(filename)

            # Avoid processing non-document files
            if ext.lower() == '.pdf':
                # Bug fix: the f-string had no placeholder and printed the
                # literal text instead of the file being processed.
                print(f'Processing file {filename}')
                pdf_filepath = path.join(dirname, filename)
                pages_metadata.extend(process_pdf_file(pdf_filepath))

    pages_metadata_df = pd.DataFrame(pages_metadata)
    print(f'Writing metadata to {METADATA_FILEPATH}')
    pages_metadata_df.to_csv(METADATA_FILEPATH, index=False)
    return pages_metadata_df


if __name__ == '__main__':
    # Guard the side-effecting run so importing this module does not
    # re-process the whole dataset.
    process_training_data()
|
requirements.txt
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
argcomplete==3.3.0
|
2 |
+
astunparse==1.6.3
|
3 |
+
gast==0.6.0
|
4 |
+
kagglehub==0.3.6
|
5 |
+
keras-hub==0.18.1
|
6 |
+
libclang==18.1.1
|
7 |
+
Markdown==3.7
|
8 |
+
markdown-it-py==3.0.0
|
9 |
+
numpy==2.0.2
|
10 |
+
opt_einsum==3.4.0
|
11 |
+
optree==0.14.0
|
12 |
+
packaging==24.2
|
13 |
+
pip-autoremove==0.10.0
|
14 |
+
pipx==1.5.0
|
15 |
+
platformdirs==4.2.2
|
16 |
+
protobuf==5.29.3
|
17 |
+
requests==2.32.3
|
18 |
+
setuptools==75.8.0
|
19 |
+
six==1.17.0
|
20 |
+
tensorboard==2.18.0
|
21 |
+
tensorflow-text==2.18.1
|
22 |
+
tf_keras==2.18.0
|
23 |
+
typing_extensions==4.12.2
|
24 |
+
userpath==1.9.2
|
25 |
+
Werkzeug==3.1.3
|
26 |
+
wheel==0.44.0
|
training.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import tensorflow as tf
|
3 |
+
import tf_keras as keras
|
4 |
+
from constants import (PROCESSED_DATA_DIR,
|
5 |
+
METADATA_FILEPATH,
|
6 |
+
BATCH_SIZE,
|
7 |
+
EPOCHS,
|
8 |
+
BERT_BASE,
|
9 |
+
MAX_SEQUENCE_LENGHT,
|
10 |
+
FilePath,
|
11 |
+
PageMetadata,
|
12 |
+
ImageSize,
|
13 |
+
ImageInputShape)
|
14 |
+
from pandera.typing import DataFrame
|
15 |
+
from typing import Tuple, List
|
16 |
+
from transformers import TFBertModel
|
17 |
+
from tf_keras import layers, models
|
18 |
+
from PIL import Image
|
19 |
+
|
20 |
+
# Allow for unlimited image size, some documents are pretty big...
|
21 |
+
Image.MAX_IMAGE_PIXELS = None
|
22 |
+
|
23 |
+
|
24 |
+
def stratified_split(
    df: pd.DataFrame,
    train_frac: float,
    val_frac: float,
    test_frac: float,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split `df` into train/val/test frames, stratified by the 'label' column.

    Rows are taken per label group in their current order — no shuffling is
    performed here, so shuffle the input beforehand if a random split is
    wanted. `test_frac` is kept for interface symmetry but is not used
    directly: the test split receives whatever remains after the train and
    validation cuts, so integer rounding never drops rows.
    """
    train_dfs, val_dfs, test_dfs = [], [], []

    # The group key itself is not needed, only the per-label slices.
    for _, group in df.groupby('label'):
        n = len(group)
        train_end = int(n * train_frac)
        val_end = train_end + int(n * val_frac)

        train_dfs.append(group.iloc[:train_end])
        val_dfs.append(group.iloc[train_end:val_end])
        test_dfs.append(group.iloc[val_end:])

    train_df = pd.concat(train_dfs).reset_index(drop=True)
    val_df = pd.concat(val_dfs).reset_index(drop=True)
    test_df = pd.concat(test_dfs).reset_index(drop=True)

    return train_df, val_df, test_df
|
46 |
+
|
47 |
+
|
48 |
+
def dataset_from_dataframe(df: pd.DataFrame) -> tf.data.Dataset:
    """Wrap the per-page metadata columns in a tf.data.Dataset.

    Each element is the 4-tuple
    (img_filepath, input_ids, attention_mask, label).
    """
    columns = ('img_filepath', 'input_ids', 'attention_mask', 'label')
    tensors = tuple(df[column].values for column in columns)
    return tf.data.Dataset.from_tensor_slices(tensors)
|
55 |
+
|
56 |
+
|
57 |
+
def load_image(image_path: FilePath, image_size: ImageSize) -> Image:
    """Load a JPEG from disk, resize it to `image_size` and scale it to [0, 1].

    `image_size` is (height, width) — it is built at module level as
    (median_height, median_width) — and tf.image.resize also expects
    [height, width]. The original code unpacked the tuple into swapped
    names (img_width, img_height) and was correct only by accident; the
    tensor passed to resize is unchanged here, only the names are fixed.
    """
    img_height, img_width = image_size

    # Load image
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [img_height, img_width])
    # resize returns float32, so this is true float division.
    image /= 255.0

    return image
|
67 |
+
|
68 |
+
|
69 |
+
def prepare_dataset(
    ds: tf.data.Dataset,
    image_size: ImageSize,
    batch_size=32,
    buffer_size=1000
) -> tf.data.Dataset:
    """Turn a metadata dataset into shuffled, batched model-ready batches.

    Maps each (img_filepath, input_ids, attention_mask, label) element to
    ((image, input_ids, attention_mask), label), loading and resizing the
    image, then shuffles, batches and prefetches.
    """
    def _to_model_inputs(
        img_path: FilePath,
        input_ids: List[int],
        attention_mask: List[int],
        label: str
    ):
        features = (load_image(img_path, image_size), input_ids, attention_mask)
        return (features, label)

    mapped = ds.map(
        _to_model_inputs,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    shuffled = mapped.shuffle(buffer_size=buffer_size)
    batched = shuffled.batch(batch_size)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)
|
91 |
+
|
92 |
+
|
93 |
+
# --- Module-level data preparation (runs on import) ---
# Load the per-page metadata produced by pre_processing.py.
metadata_df: DataFrame[PageMetadata] = pd.read_csv(METADATA_FILEPATH)
# NOTE(review): this subsamples to 50 rows — looks like a debugging/smoke-test
# leftover; confirm before training on the full dataset.
metadata_df = metadata_df.sample(n=50, random_state=42)

# Target image size: the median page dimensions across the (sampled) corpus.
# img_size is (height, width); load_image/prepare_dataset expect that order.
median_height = int(metadata_df['height'].median())
median_width = int(metadata_df['width'].median())
img_size: ImageSize = (median_height, median_width)
img_input_shape: ImageInputShape = img_size + (3,)

# One class per label sub-directory created by the pre-processing step.
label_names: List[str] = sorted(
    [d.name for d in PROCESSED_DATA_DIR.iterdir() if d.is_dir()]
)
num_classes = len(label_names)

print('Splitting the DataFrame into training, validation and test')
train_df, val_df, test_df = stratified_split(
    metadata_df,
    train_frac=0.7,
    val_frac=0.15,
    test_frac=0.15,
)

# NOTE(review): dataset_from_dataframe reads 'input_ids' and 'attention_mask'
# columns — verify the metadata CSV actually contains them.
print('Batching and shuffling the datasets')
train_ds = dataset_from_dataframe(train_df)
train_ds = prepare_dataset(train_ds, img_size, batch_size=BATCH_SIZE)

val_ds = dataset_from_dataframe(val_df)
val_ds = prepare_dataset(val_ds, img_size, batch_size=BATCH_SIZE)

test_ds = dataset_from_dataframe(test_df)
test_ds = prepare_dataset(test_ds, img_size, batch_size=BATCH_SIZE)
|
123 |
+
|
124 |
+
|
125 |
+
def build_image_model(input_shape: ImageInputShape) -> keras.Model:
    """Build the CNN branch that embeds a page image into a 512-d vector.

    Four Conv2D+MaxPooling stages (32/64/128/128 filters) followed by a
    flatten and a dense projection.
    """
    model_layers = [layers.Input(shape=input_shape)]
    for n_filters in (32, 64, 128, 128):
        model_layers.append(layers.Conv2D(n_filters, (3, 3), activation='relu'))
        model_layers.append(layers.MaxPooling2D((2, 2)))
    model_layers.append(layers.Flatten())
    model_layers.append(layers.Dense(512, activation='relu'))

    img_model = models.Sequential(model_layers, name='image_classification')

    img_model.summary()
    return img_model
|
142 |
+
|
143 |
+
|
144 |
+
def build_text_model() -> keras.Model:
    """Build the BERT branch that embeds the page's OCR tokens.

    Takes (input_ids, attention_mask) of length MAX_SEQUENCE_LENGHT and
    outputs BERT's pooled [CLS] representation.
    """
    bert = TFBertModel.from_pretrained(BERT_BASE)

    token_inputs = {
        name: layers.Input(
            shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name=name
        )
        for name in ('input_ids', 'attention_mask')
    }

    # Index 1 of the BERT output tuple is the pooled output, i.e. the
    # representation of the [CLS] token.
    pooled_output = bert(
        input_ids=token_inputs['input_ids'],
        attention_mask=token_inputs['attention_mask'],
    )[1]

    text_model = models.Model(
        inputs=[token_inputs['input_ids'], token_inputs['attention_mask']],
        outputs=pooled_output,
        name='bert'
    )
    text_model.summary()

    return text_model
|
166 |
+
|
167 |
+
|
168 |
+
def build_multimodal_model(
        num_classes: int,
        img_input_shape: ImageInputShape
) -> keras.Model:
    """Build the image+text classifier over concatenated branch features.

    Combines the CNN image embedding and the BERT text embedding, then
    classifies into `num_classes` with a small dense head.
    """
    img_model = build_image_model(img_input_shape)
    text_model = build_text_model()

    img_input = layers.Input(shape=img_input_shape, name='img_input')
    text_input_ids = layers.Input(
        shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='text_input_ids'
    )
    text_input_mask = layers.Input(
        shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='text_input_mask'
    )

    img_features = img_model(img_input)
    text_features = text_model([text_input_ids, text_input_mask])

    # Consistency fix: the classifier head previously mixed `tf.keras.layers`
    # with the `tf_keras` layers/models used everywhere else in this file.
    # With TF >= 2.16, `tf.keras` may resolve to Keras 3 rather than tf_keras,
    # and mixing the two implementations in one graph can fail; use the
    # file's tf_keras imports throughout.
    classification_layers = models.Sequential([
        layers.Dense(512, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ], name='classification_layers')
    concat_features = layers.concatenate([img_features, text_features],
                                         name='concatenate_features')
    outputs = classification_layers(concat_features)

    multimodal_model = models.Model(
        inputs=[img_input, text_input_ids, text_input_mask],
        outputs=outputs,
        name='multimodal_document_page_classifier'
    )
    return multimodal_model
|
200 |
+
|
201 |
+
|
202 |
+
# --- Module-level training run (executes on import) ---
multimodal_model = build_multimodal_model(num_classes, img_input_shape)
multimodal_model.summary()
multimodal_model.compile(
    optimizer='adam',
    # NOTE(review): sparse_categorical_crossentropy expects integer class
    # ids, but the 'label' values written by pre-processing are strings
    # (folder names) — verify labels are encoded to ints before fit().
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
multimodal_model.fit(
    train_ds,
    epochs=EPOCHS,
    # NOTE(review): train_ds is already batched by prepare_dataset; Keras
    # ignores batch_size when fitting on a tf.data.Dataset.
    batch_size=BATCH_SIZE,
    validation_data=val_ds,
)
|