Spaces:
Sleeping
Sleeping
João Pedro
committed on
Commit
·
861182c
1
Parent(s):
b7d38ff
adding files from github
Browse files- constants.py +29 -0
- environment.yml +20 -0
- pre_processing.ipynb +470 -0
- pre_processing.py +111 -0
- requirements.txt +26 -0
- training.py +214 -0
constants.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from typing import TypedDict, Union, TypeAlias, Tuple
|
3 |
+
|
4 |
+
# Constants
|
5 |
+
DATA_DIR = Path('./data')
|
6 |
+
RAW_DATA_DIR = DATA_DIR / 'raw'
|
7 |
+
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
|
8 |
+
METADATA_FILEPATH = DATA_DIR / 'metadata.csv'
|
9 |
+
|
10 |
+
BATCH_SIZE = 8
|
11 |
+
EPOCHS = 1
|
12 |
+
BERT_BASE = 'bert-base-uncased'
|
13 |
+
MAX_SEQUENCE_LENGHT = 512
|
14 |
+
MODEL_DIR = Path('./model')
|
15 |
+
|
16 |
+
# Types
|
17 |
+
FilePath: TypeAlias = Union[str, Path]
|
18 |
+
|
19 |
+
|
20 |
+
class PageMetadata(TypedDict):
    """Metadata record for a single rendered page of a source PDF.

    NOTE(review): field semantics inferred from names only — confirm
    against the pre-processing code that builds these records.
    """

    # Index of the page within its source document — base (0 or 1) not
    # established here; TODO confirm.
    page_number: int
    # Path to the page's file, relative to the data directory (presumably
    # PROCESSED_DATA_DIR — verify against the writer).
    file_relpath: FilePath
    # Page image dimensions in pixels.
    width: int
    height: int
    # Document-type label, used as the classification target.
    label: str
|
26 |
+
|
27 |
+
|
28 |
+
# (width, height) of a page image, in pixels — TODO confirm ordering matches PIL's (width, height).
ImageSize: TypeAlias = Tuple[int, int]
# Model input shape with a channel dimension — presumably (width, height, channels); verify against the training code.
ImageInputShape: TypeAlias = Tuple[int, int, int]
|
environment.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: hydra
|
2 |
+
channels:
|
3 |
+
- conda-forge
|
4 |
+
dependencies:
|
5 |
+
- ipykernel
|
6 |
+
- ipython
|
7 |
+
- ipython_genutils
|
8 |
+
- jupyter_client
|
9 |
+
- jupyter_core
|
10 |
+
- pandas
|
11 |
+
- pandera
|
12 |
+
- pdf2image
|
13 |
+
- pytables
|
14 |
+
- pytesseract
|
15 |
+
- python=3.12
|
16 |
+
- python-lsp-server
|
17 |
+
- scikit-learn
|
18 |
+
- tesseract
|
19 |
+
- traitlets
|
20 |
+
|
pre_processing.ipynb
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 3,
|
6 |
+
"id": "63d95b8f-7559-4a3d-b025-5bb788914dc9",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import os\n",
|
11 |
+
"import numpy as np\n",
|
12 |
+
"import pandas as pd\n",
|
13 |
+
"import pdf2image as p2i\n",
|
14 |
+
"from os import path\n",
|
15 |
+
"from pathlib import Path\n",
|
16 |
+
"from PIL import Image\n",
|
17 |
+
"\n",
|
18 |
+
"# Allow for unlimited image size, some documents are pretty big...\n",
|
19 |
+
"Image.MAX_IMAGE_PIXELS = None\n",
|
20 |
+
"\n",
|
21 |
+
"\n",
|
22 |
+
"def process_pdf_document(filepath):\n",
|
23 |
+
" if path.getsize(filepath) == 0:\n",
|
24 |
+
" # TODO: substitute for logging\n",
|
25 |
+
" print(f'{filepath} is empty, skipping')\n",
|
26 |
+
" return []\n",
|
27 |
+
"\n",
|
28 |
+
" pages = p2i.convert_from_path(filepath)\n",
|
29 |
+
" processed_pages: list[dict] = []\n",
|
30 |
+
"\n",
|
31 |
+
" label = 'other'\n",
|
32 |
+
" root_dir, doctype = Path(filepath).parts[:2]\n",
|
33 |
+
" for page_i, page in enumerate(pages):\n",
|
34 |
+
" if page_i == 0:\n",
|
35 |
+
" label = doctype\n",
|
36 |
+
" elif page_i == len(pages) - 1:\n",
|
37 |
+
" label = f'{label}-last'\n",
|
38 |
+
"\n",
|
39 |
+
" processed_pages.append({\n",
|
40 |
+
" 'filepath': filepath,\n",
|
41 |
+
" 'width': page.width,\n",
|
42 |
+
" 'height': page.height,\n",
|
43 |
+
" 'bytes': page.tobytes(),\n",
|
44 |
+
" 'label': label,\n",
|
45 |
+
" })\n",
|
46 |
+
"\n",
|
47 |
+
" return processed_pages\n",
|
48 |
+
"\n",
|
49 |
+
"\n",
|
50 |
+
"def process_training_data():\n",
|
51 |
+
" data_dir = Path('./data')\n",
|
52 |
+
"\n",
|
53 |
+
" for dirname, _, files in os.walk(data_dir):\n",
|
54 |
+
" print(f'Processing folder {dirname}')\n",
|
55 |
+
" doctype = path.basename(dirname)\n",
|
56 |
+
" df = pd.DataFrame()\n",
|
57 |
+
"\n",
|
58 |
+
" for filename in files:\n",
|
59 |
+
"print(f'Processing file {filename}')\n",
|
60 |
+
" filepath = path.join(dirname, filename)\n",
|
61 |
+
" _, ext = path.splitext(filepath)\n",
|
62 |
+
"\n",
|
63 |
+
" if ext.lower() == '.pdf':\n",
|
64 |
+
" processed_pages = process_pdf_document(filepath)\n",
|
65 |
+
" df = pd.concat([df, pd.DataFrame(processed_pages)], ignore_index=True)\n",
|
66 |
+
"\n",
|
67 |
+
" parquet_filepath = path.join(data_dir, f'{doctype}.parquet')\n",
|
68 |
+
" print(f'Saving data for {doctype} in {parquet_filepath}')\n",
|
69 |
+
" print(df)\n",
|
70 |
+
" df.to_parquet(parquet_filepath)"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"cell_type": "code",
|
75 |
+
"execution_count": null,
|
76 |
+
"id": "1efb8e82-92bc-46dc-9553-cdc712211c93",
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [
|
79 |
+
{
|
80 |
+
"name": "stdout",
|
81 |
+
"output_type": "stream",
|
82 |
+
"text": [
|
83 |
+
"Processing folder data\n",
|
84 |
+
"Processing file data.parquet\n",
|
85 |
+
"Processing file .DS_Store\n",
|
86 |
+
"Saving data for data in data/data.parquet\n",
|
87 |
+
"Empty DataFrame\n",
|
88 |
+
"Columns: []\n",
|
89 |
+
"Index: []\n",
|
90 |
+
"Processing folder data/form\n",
|
91 |
+
"Processing file 1384753-request-form.pdf\n",
|
92 |
+
"Processing file 772791-12_sipc1301f_peer-individual-evaluation-form.pdf\n",
|
93 |
+
"Processing file 6428556-Cassatt-Form-D-2008.pdf\n",
|
94 |
+
"Processing file 3895717-INFORMATION-DISCLOSURE-REQUEST-FORM.pdf\n",
|
95 |
+
"Processing file 4321053-document-21556514.pdf\n",
|
96 |
+
"Processing file 4575531-REDACTED-PIW-CR285646.pdf\n",
|
97 |
+
"Processing file 6155510-Form-I.pdf\n",
|
98 |
+
"Processing file 3885938-JayaServicesLtd-Appointment-15112002.pdf\n",
|
99 |
+
"Processing file 4950830-Cook1995.pdf\n",
|
100 |
+
"Processing file 6368627-2013-Southern-Company-990.pdf\n",
|
101 |
+
"Processing file 4436484-Columbia-Armed-Guard-application-form.pdf\n",
|
102 |
+
"Processing file 21074299-company-application-form-for-dominicana-acquisition-sa.pdf\n",
|
103 |
+
"Processing file 746365-wa-madsen-barbara-2012.pdf\n",
|
104 |
+
"Processing file 5770184-CAMCF-RFYL-Registration-Form.pdf\n",
|
105 |
+
"Processing file 1683073-medication-info-form-eng-revised-2014.pdf\n",
|
106 |
+
"Processing file 4423347-Request-Form.pdf\n",
|
107 |
+
"Processing file 3022163-Event-Form.pdf\n",
|
108 |
+
"Processing file 1685242-cv-fresh-entry-form.pdf\n",
|
109 |
+
"Processing file 4910987-Records-Request-Form.pdf\n",
|
110 |
+
"Processing file 4349763-DOD-Form-254.pdf\n",
|
111 |
+
"Processing file 3885273-EIMEuropeanInvestmentsManagement-ChangeofName.pdf\n",
|
112 |
+
"Processing file 4951858-Ober3.pdf\n",
|
113 |
+
"Processing file 7034791-UOF-form.pdf\n",
|
114 |
+
"Processing file 6206290-Thunderbird-GBC1-Application-Form.pdf\n",
|
115 |
+
"Processing file 465336-disclosure-form-13491099365164-_-pdf.pdf\n",
|
116 |
+
"Processing file 6538004-Aderholt-Travel-Disclosure-Form.pdf\n",
|
117 |
+
"Processing file 1292759-host-accept-form-attachment.pdf\n",
|
118 |
+
"Processing file 1238787-request-form.pdf\n",
|
119 |
+
"Processing file 6173981-Cert-of-ID-form.pdf\n",
|
120 |
+
"Processing file 3220944-Sitton-July-2015-Campaign-Finance-Report.pdf\n",
|
121 |
+
"Processing file 6882316-Request-Form.pdf\n",
|
122 |
+
"Processing file 4111213-Coffman-Election-Filings.pdf\n",
|
123 |
+
"Processing file 4773259-RECORDS-REQUEST-FORM.pdf\n",
|
124 |
+
"Processing file 3116806-Black-River-Technical-College-Request-for-C-23.pdf\n",
|
125 |
+
"Processing file 761264-13_ee402a01_expository-essay-interpretation-of.pdf\n",
|
126 |
+
"Processing file 5955192-North-Dakota-corn-dogs.pdf\n",
|
127 |
+
"Processing file 6882816-Disaster-Relief-Cash-Assistance-Application-Form.pdf\n",
|
128 |
+
"Processing file 1303092-appeal-form.pdf\n",
|
129 |
+
"Processing file 3224254-CCT-Holidayhope-2016-Form.pdf\n",
|
130 |
+
"Processing file 528353-disclosure-form-13512855487444-_-pdf.pdf\n",
|
131 |
+
"Processing file 696442-nms-timeline-complaint.pdf\n",
|
132 |
+
"Processing file 842767-shallotte-assisted-living-penalty-packet-4.pdf\n",
|
133 |
+
"Processing file 4484638-GENERAL-APRA-form-2018.pdf\n",
|
134 |
+
"Processing file 20393262-margaux-keiser-form-460.pdf\n",
|
135 |
+
"Processing file 6785703-Release-form.pdf\n",
|
136 |
+
"Processing file 1683896-rtk-form-16241-pdf.pdf\n",
|
137 |
+
"Processing file 3234225-Philadelphia-Form-30401-pdf.pdf\n",
|
138 |
+
"Processing file 442530-collect-files-35870-political-file-2012-non.pdf\n",
|
139 |
+
"Processing file 4108784-2012-FSA-Enrollment-Form.pdf\n",
|
140 |
+
"Processing file 2426599-opega-review-request-form.pdf\n",
|
141 |
+
"Processing file 20429474-caserta-evidence-list.pdf\n",
|
142 |
+
"Processing file 7001493-ABC-Trailblazer-Application-Form-2020-21.pdf\n",
|
143 |
+
"Processing file 6310093-Registration.pdf\n",
|
144 |
+
"Processing file 21085990-mcguire-3q21.pdf\n",
|
145 |
+
"Processing file 6879322-Form-8871.pdf\n",
|
146 |
+
"Processing file 21182127-20220118-docketing-statement-b.pdf\n",
|
147 |
+
"Processing file 5518517-Corbett-email-PRR-017155-pdf.pdf\n",
|
148 |
+
"Processing file 4058754-Records-Release-pdf.pdf\n",
|
149 |
+
"Processing file 1386200-annual-disclosure-request-form.pdf\n",
|
150 |
+
"Processing file 7202517-Photo-Release-Form.pdf\n",
|
151 |
+
"Processing file 3860778-810A-Evidence-Submission-Form.pdf\n",
|
152 |
+
"Processing file 5899836-Medford-Tp-summary-form.pdf\n",
|
153 |
+
"Processing file 5776714-CRID-1046835.pdf\n",
|
154 |
+
"Processing file 6536846-Transcript-Request.pdf\n",
|
155 |
+
"Processing file 7041210-Development-Application-Form.pdf\n",
|
156 |
+
"Processing file 466205-wa-united-for-marriage-avails-request-form.pdf\n",
|
157 |
+
"Processing file 7000910-NCFPD-CORA-Request-Form.pdf\n",
|
158 |
+
"Processing file 21884855-dos-22-04-051.pdf\n",
|
159 |
+
"data/form/21884855-dos-22-04-051.pdf is empty, skipping\n",
|
160 |
+
"Saving data for form in data/form.parquet\n",
|
161 |
+
" filepath width height \\\n",
|
162 |
+
"0 data/form/1384753-request-form.pdf 1700 2200 \n",
|
163 |
+
"1 data/form/772791-12_sipc1301f_peer-individual-... 1700 2200 \n",
|
164 |
+
"2 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
|
165 |
+
"3 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
|
166 |
+
"4 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
|
167 |
+
".. ... ... ... \n",
|
168 |
+
"618 data/form/7041210-Development-Application-Form... 1700 2200 \n",
|
169 |
+
"619 data/form/7041210-Development-Application-Form... 1700 2200 \n",
|
170 |
+
"620 data/form/7041210-Development-Application-Form... 1700 2200 \n",
|
171 |
+
"621 data/form/466205-wa-united-for-marriage-avails... 1700 2200 \n",
|
172 |
+
"622 data/form/7000910-NCFPD-CORA-Request-Form.pdf 1700 2200 \n",
|
173 |
+
"\n",
|
174 |
+
" bytes label \n",
|
175 |
+
"0 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
176 |
+
"1 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
177 |
+
"2 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
|
178 |
+
"3 b'\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe... form \n",
|
179 |
+
"4 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
|
180 |
+
".. ... ... \n",
|
181 |
+
"618 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
182 |
+
"619 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
183 |
+
"620 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form-last \n",
|
184 |
+
"621 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
185 |
+
"622 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
|
186 |
+
"\n",
|
187 |
+
"[623 rows x 5 columns]\n",
|
188 |
+
"Processing folder data/scientific publication\n",
|
189 |
+
"Processing file 6425893-Oxford-Journals-Article.pdf\n",
|
190 |
+
"Processing file websearch_7_00001_scientific_publication.pdf\n",
|
191 |
+
"Processing file websearch_7_00017_scientific_publication.pdf\n",
|
192 |
+
"Processing file websearch_7_00022_scientific_publication.pdf\n",
|
193 |
+
"Processing file 3923078-Paper-1.pdf\n",
|
194 |
+
"Processing file websearch_7_00025_scientific_publication.pdf\n",
|
195 |
+
"Processing file websearch_7_00010_scientific_publication.pdf\n",
|
196 |
+
"Processing file 4247250-JACOBSON-Original-Article.pdf\n",
|
197 |
+
"Processing file websearch_7_00006_scientific_publication.pdf\n",
|
198 |
+
"Processing file 3559070-McCance-Katz-Et-Al-2017-the-American-Journal-on.pdf\n",
|
199 |
+
"Processing file websearch_7_00011_scientific_publication.pdf\n",
|
200 |
+
"Processing file websearch_7_00024_scientific_publication.pdf\n",
|
201 |
+
"Processing file websearch_7_00007_scientific_publication.pdf\n",
|
202 |
+
"Processing file websearch_7_00000_scientific_publication.pdf\n",
|
203 |
+
"Processing file websearch_7_00023_scientific_publication.pdf\n",
|
204 |
+
"Processing file 21085273-wood-schulman-article.pdf\n",
|
205 |
+
"Processing file websearch_7_00016_scientific_publication.pdf\n",
|
206 |
+
"Processing file 2189923-lancet-article-from-1989.pdf\n",
|
207 |
+
"Processing file 7223327-Forensic-Article.pdf\n",
|
208 |
+
"Processing file websearch_7_00009_scientific_publication.pdf\n",
|
209 |
+
"Processing file websearch_7_00005_scientific_publication.pdf\n",
|
210 |
+
"Processing file 6429770-Anderson2012-Article.pdf\n",
|
211 |
+
"Processing file websearch_7_00013_scientific_publication.pdf\n",
|
212 |
+
"Processing file websearch_7_00026_scientific_publication.pdf\n",
|
213 |
+
"Processing file websearch_7_00021_scientific_publication.pdf\n",
|
214 |
+
"Processing file websearch_7_00018_scientific_publication.pdf\n",
|
215 |
+
"Processing file websearch_7_00014_scientific_publication.pdf\n",
|
216 |
+
"Processing file websearch_7_00002_scientific_publication.pdf\n",
|
217 |
+
"Processing file websearch_7_00019_scientific_publication.pdf\n",
|
218 |
+
"Processing file websearch_7_00015_scientific_publication.pdf\n",
|
219 |
+
"Processing file websearch_7_00020_scientific_publication.pdf\n",
|
220 |
+
"Processing file websearch_7_00003_scientific_publication.pdf\n",
|
221 |
+
"Processing file 5980147-Venu-B-Article-1.pdf\n",
|
222 |
+
"Processing file 20519853-2007-recommendations-for-medical-management-of-adult-lead-exposure.pdf\n",
|
223 |
+
"Processing file 4343429-Meltdown-paper.pdf\n",
|
224 |
+
"Processing file 785243-paper.pdf\n",
|
225 |
+
"Processing file websearch_7_00008_scientific_publication.pdf\n",
|
226 |
+
"Processing file websearch_7_00004_scientific_publication.pdf\n",
|
227 |
+
"Processing file websearch_7_00012_scientific_publication.pdf\n",
|
228 |
+
"Saving data for scientific publication in data/scientific publication.parquet\n",
|
229 |
+
" filepath width height \\\n",
|
230 |
+
"0 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
231 |
+
"1 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
232 |
+
"2 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
233 |
+
"3 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
234 |
+
"4 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
|
235 |
+
".. ... ... ... \n",
|
236 |
+
"532 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
237 |
+
"533 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
238 |
+
"534 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
239 |
+
"535 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
240 |
+
"536 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
|
241 |
+
"\n",
|
242 |
+
" bytes \\\n",
|
243 |
+
"0 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
|
244 |
+
"1 b'\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe... \n",
|
245 |
+
"2 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
246 |
+
"3 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
|
247 |
+
"4 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
248 |
+
".. ... \n",
|
249 |
+
"532 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
250 |
+
"533 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
251 |
+
"534 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
252 |
+
"535 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
253 |
+
"536 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
|
254 |
+
"\n",
|
255 |
+
" label \n",
|
256 |
+
"0 scientific publication \n",
|
257 |
+
"1 scientific publication \n",
|
258 |
+
"2 scientific publication \n",
|
259 |
+
"3 scientific publication \n",
|
260 |
+
"4 scientific publication \n",
|
261 |
+
".. ... \n",
|
262 |
+
"532 scientific publication \n",
|
263 |
+
"533 scientific publication \n",
|
264 |
+
"534 scientific publication \n",
|
265 |
+
"535 scientific publication \n",
|
266 |
+
"536 scientific publication-last \n",
|
267 |
+
"\n",
|
268 |
+
"[537 rows x 5 columns]\n",
|
269 |
+
"Processing folder data/handwritten\n",
|
270 |
+
"Processing file 3223324-Scan-20111118-195759.pdf\n",
|
271 |
+
"Processing file 20743971-handwritten-statement.pdf\n",
|
272 |
+
"Processing file 21030538-doj-donoghue-notes.pdf\n",
|
273 |
+
"Processing file 519547-sere-psychologist-and-bush-admin-torture.pdf\n",
|
274 |
+
"Processing file 6240196-Zenith-Rathfelder-Notes.pdf\n",
|
275 |
+
"Processing file 1630443-nick-stoneman-notes.pdf\n",
|
276 |
+
"Processing file 4182029-HANDWRITTEN-COMPILATION-OF-COLLECTION.pdf\n",
|
277 |
+
"Processing file 528421-handwritten-notes-on-financial-offer-from-orders.pdf\n",
|
278 |
+
"Processing file 5498003-1996-08-11-Handwritten-Law-Enforcement-Notes.pdf\n",
|
279 |
+
"Processing file 202587-doc21.pdf\n",
|
280 |
+
"Processing file 5190901-WasteWS-07062018-145153-Handwritten-Letter-NO-OCR.pdf\n",
|
281 |
+
"Processing file 625990-thatcher-mt-notes.pdf\n",
|
282 |
+
"Processing file 782204-loeser-will.pdf\n",
|
283 |
+
"Processing file 2189928-handwritten-agreement.pdf\n",
|
284 |
+
"Processing file 1223362-gauthe-clinical-notes.pdf\n",
|
285 |
+
"Processing file 4190352-MEMORANDUM-HIDALGO-BALMES-HANDWRITTEN.pdf\n",
|
286 |
+
"Processing file 6939323-7-8-11-83-Handwritten-Notes.pdf\n",
|
287 |
+
"Processing file 2996996-TL-Handwritten-Fax-Labeled-Confidential.pdf\n",
|
288 |
+
"Processing file 3468379-DPD-Use-of-Force-Handwritten-Notes.pdf\n",
|
289 |
+
"Processing file 4188317-LIST-HANDWRITTEN-SUMMARIES-NOTES.pdf\n",
|
290 |
+
"Processing file 339816-jeff-long-notes.pdf\n",
|
291 |
+
"Processing file 3864522-Handwritten-letter-in-a-defense-sentencing-memo.pdf\n",
|
292 |
+
"Processing file 4183043-HANDWRITTEN-MEMO-SUBJ-ARRIVAL-COVERT-KEY-WEST.pdf\n",
|
293 |
+
"Processing file 5002049-Doc-10-09-2018-12-41-31.pdf\n",
|
294 |
+
"Processing file 4780536-Jury-Notes-Manafort-Trial.pdf\n",
|
295 |
+
"Processing file 2300328-gov-lepage-handwritten-note-on-lmf.pdf\n",
|
296 |
+
"Processing file 3002469-Investigation-Notes-Witness-Statements.pdf\n",
|
297 |
+
"Processing file 4193148-11111707.pdf\n",
|
298 |
+
"Processing file 4598117-2002-Handwritten-Notes-of-City-Meeting.pdf\n",
|
299 |
+
"Processing file 4444642-Letter-Chase-Nicholson-s-grandmother.pdf\n",
|
300 |
+
"Processing file 2519466-10-10-13-handwritten-list-of-malfunctioning-doors.pdf\n",
|
301 |
+
"Processing file 2940920-2002-09-12-Letter-Handwritten-Tony-Blair-to.pdf\n",
|
302 |
+
"Processing file 4187525-HANDWRITTEN-NOTE-CUBAN-ACTIVITIES-IN-ARGENTINA.pdf\n",
|
303 |
+
"Processing file 3462227-Notes-2.pdf\n",
|
304 |
+
"Processing file 2791337-FBI-Handwritten-Notes.pdf\n",
|
305 |
+
"Processing file 354332-notes.pdf\n",
|
306 |
+
"Processing file 4182092-HANDWRITTEN-PAPER-CONCERNING-PROJECT-ZRRIFLE.pdf\n",
|
307 |
+
"Processing file 3766497-Detective-Notes.pdf\n",
|
308 |
+
"Processing file 6749500-LEOPOLD-FILES-Bruce-Jessen-Handwritten-Notes.pdf\n",
|
309 |
+
"Processing file 3220458-Handwritten-Note.pdf\n",
|
310 |
+
"Processing file 4188296-HANDWRITTEN-LIST-OF-14-NAMES.pdf\n",
|
311 |
+
"Processing file 804992-fairhope-city-council-resolution-10-14-13.pdf\n",
|
312 |
+
"Processing file 2997471-Handwritten-Notes-to-and-From-Gov-LePage.pdf\n",
|
313 |
+
"Processing file 4598082-Letter-to-Judge-Schroeder.pdf\n",
|
314 |
+
"Processing file 3877562-Mikulcik-Interview-Notes.pdf\n",
|
315 |
+
"Processing file 479570-emily-dickinsons-handwritten-poem.pdf\n",
|
316 |
+
"Processing file 21108937-pv-handwritten-motion-for-trial-transcript-3215.pdf\n",
|
317 |
+
"Processing file 6020949-Aretha-Franklin-Handwritten-Will-1.pdf\n",
|
318 |
+
"Processing file 6464722-Fn-22-03252.pdf\n",
|
319 |
+
"Processing file 21117526-pv-handwritten-pre-sentencing-memorandum-21312.pdf\n",
|
320 |
+
"Processing file 705120-mingle-notes.pdf\n",
|
321 |
+
"Processing file 1213307-rialto-unified-holocaust-essays-set-13-part-05.pdf\n",
|
322 |
+
"Processing file 4189369-HANDWRITTEN-SHEET-WITH-REPORT-NUMBERS.pdf\n",
|
323 |
+
"Processing file 202586-doc20.pdf\n",
|
324 |
+
"Processing file 4184368-OPS-NOTES-ON-AMBIDDY.pdf\n",
|
325 |
+
"Processing file 393660-fibroscopic-notes-original.pdf\n",
|
326 |
+
"Processing file 4182778-HANDWRITTEN-NOTES.pdf\n",
|
327 |
+
"Processing file 4187708-DOCUMENT-3-HANDWRITTEN-NOTE.pdf\n",
|
328 |
+
"Processing file 702325-snitch-list.pdf\n",
|
329 |
+
"Processing file 4180597-HANDWRITTEN-MEMO-RE-IMPLICATION-OF-ALINE-MOSBY.pdf\n",
|
330 |
+
"Processing file 1236260-sidney-holters-note.pdf\n",
|
331 |
+
"Processing file 3223330-Scan-20111118-200635.pdf\n",
|
332 |
+
"Processing file 3223289-Scan-20111116-171028.pdf\n",
|
333 |
+
"Processing file 21043867-2021-01-25-mo-sen-eigel-handwritten-note-sb-66-allows-people-to-run-over-protestors.pdf\n",
|
334 |
+
"Processing file 3233254-Earl-Bradley-Letter-102516.pdf\n",
|
335 |
+
"Processing file 3227804-Cornell-Handwritten-Letters.pdf\n",
|
336 |
+
"Processing file 3862956-Handwritten-Note-of-Meeting-With-Howard-Hill.pdf\n",
|
337 |
+
"Processing file 803848-elvis-presleys-letter-to-richard-nixon.pdf\n",
|
338 |
+
"Processing file 3223331-Scan-20111118-200718.pdf\n",
|
339 |
+
"Processing file 202599-doc33.pdf\n",
|
340 |
+
"Processing file 5690139-Child-s-Australia-Day-Letter.pdf\n",
|
341 |
+
"Processing file 4182800-HANDWRITTEN-MEMO-ON-HUNT-AND-HIS-USE-OF-A-PEN-NAME.pdf\n",
|
342 |
+
"Processing file 3223399-Scan-20111125-141919.pdf\n",
|
343 |
+
"Processing file 3521430-IMG-4827.pdf\n",
|
344 |
+
"Processing file 4420170-Mullkoff-Note.pdf\n",
|
345 |
+
"Processing file 4185023-HANDWRITTEN-NOTE-RE-FILING-DOCUMENTS-IN-DIAZ-L-S.pdf\n",
|
346 |
+
"Processing file 2746630-Cardenas-notes.pdf\n",
|
347 |
+
"Processing file 1086683-hand-written-notes.pdf\n",
|
348 |
+
"Processing file 5190857-WasteWS-07062018-120826-HANDWRITTEN-NO-OCR.pdf\n",
|
349 |
+
"Processing file 4189400-HANDWRITTEN-CARDS-RESEARCH-DEPARTMENT-MEMORANDUM.pdf\n",
|
350 |
+
"Processing file 3223321-Scan-20111118-195438.pdf\n",
|
351 |
+
"Processing file 4328038-Correspondence-From-John-Crowley-to-Scott.pdf\n",
|
352 |
+
"Processing file 5018573-Chad-Notes.pdf\n",
|
353 |
+
"Processing file 21109051-pv-handwritten-letter-requesting-new-federal-defender-counsel-41212.pdf\n",
|
354 |
+
"Processing file 5316881-Journal-Entries.pdf\n",
|
355 |
+
"Processing file 4191532-OFFICE-NOTES-RE-AGENTS.pdf\n",
|
356 |
+
"Processing file 1213274-rialto-unified-holocaust-essays-set-01-part-01.pdf\n",
|
357 |
+
"Processing file 4490103-Gergely-Interview-Notes.pdf\n",
|
358 |
+
"Processing file 2708465-Brandon-Astor-Jones-Timeline-HW.pdf\n",
|
359 |
+
"Processing file 3462228-notes-3.pdf\n",
|
360 |
+
"Processing file 528418-handwritten-note-from-april-3-2001.pdf\n",
|
361 |
+
"Processing file 4185802-HANDWRITTEN-NOTES-MARY-CHECKING-FOR-PERTINENT.pdf\n",
|
362 |
+
"Processing file 3011003-Richard-Bain-s-handwritten-account-of-Sept-4-2012.pdf\n",
|
363 |
+
"Processing file 5002048-Doc-10-09-2018-12-42-54.pdf\n",
|
364 |
+
"Processing file 1372097-applebee-valuch-notes.pdf\n",
|
365 |
+
"Processing file 705119-davies-notes.pdf\n",
|
366 |
+
"Processing file 3237222-Notes-1216.pdf\n",
|
367 |
+
"Processing file 3718215-PP-D0414.pdf\n",
|
368 |
+
"Processing file 4187089-LECTURE-AT-FARM.pdf\n",
|
369 |
+
"Processing file 3223531-Scan-20111202-194427.pdf\n",
|
370 |
+
"Processing file 4450893-Lithuanian-Extradition-Request-for-Release.pdf\n",
|
371 |
+
"Processing file 4193146-11111705.pdf\n",
|
372 |
+
"Processing file 526429-east-coast-rapist-suspects-apology-letter.pdf\n",
|
373 |
+
"Processing file 4181294-HANDWRITTEN-LIST-DDO-FILE-REQUESTS.pdf\n",
|
374 |
+
"Processing file 5764377-Handwritten-Notes.pdf\n",
|
375 |
+
"Processing file 4183714-HANDWRITTEN-NOTE-RE-RICHARD-GIBSON-ISSUE.pdf\n",
|
376 |
+
"Processing file 6393576-Lawson-Letter.pdf\n",
|
377 |
+
"Processing file 1283645-dearest-celeste.pdf\n",
|
378 |
+
"Processing file 2825042-Mathes-Handwritten-Notes.pdf\n",
|
379 |
+
"Processing file 202597-doc31.pdf\n",
|
380 |
+
"Processing file 21011336-foxs-hand-drawn-map.pdf\n",
|
381 |
+
"Processing file 339815-jon-fagg-notes.pdf\n",
|
382 |
+
"Processing file 1336598-hartmannnotes.pdf\n",
|
383 |
+
"Processing file 2650353-1983-10-22-F4-Handritten.pdf\n",
|
384 |
+
"Processing file 282179-july-14-1995-mladic-diary-handwritten.pdf\n",
|
385 |
+
"Processing file 2271360-hartfield-handwritten-writ.pdf\n",
|
386 |
+
"Processing file 4178930-HANDWRITTEN-DOC-SUMMARIES-ILLEGIBLE.pdf\n",
|
387 |
+
"Processing file 5096174-Handwritten-Minutes.pdf\n",
|
388 |
+
"Processing file 6382284-Letter-Ferrell-Scott.pdf\n",
|
389 |
+
"Processing file 2474354-oland-statement-written.pdf\n",
|
390 |
+
"Processing file 1306277-4-9-22-11-hanah-cho.pdf\n",
|
391 |
+
"Processing file 3223449-Scan-20111125-145718.pdf\n",
|
392 |
+
"Processing file 2517459-bill-clintons-handwritten-speech-from-the.pdf\n",
|
393 |
+
"Processing file 3223329-Scan-20111118-200524.pdf\n",
|
394 |
+
"Processing file 4179732-INVENTORY-HANDWRITTEN.pdf\n",
|
395 |
+
"Processing file 3223447-Scan-20111125-145513.pdf\n",
|
396 |
+
"Processing file 786278-frederick-pabsts-handwritten-will.pdf\n",
|
397 |
+
"Processing file 1378369-bricker-doc.pdf\n",
|
398 |
+
"Processing file 1371011-witness-40-journal-entry.pdf\n",
|
399 |
+
"Processing file 4191654-HANDWRITTEN-NOTES-ON-CIA-AND-FBI-DOCUMENTS.pdf\n",
|
400 |
+
"Processing file 803812-iggy-pops-letter-to-a-fan-1995.pdf\n",
|
401 |
+
"Processing file 3223444-Scan-20111125-145344.pdf\n",
|
402 |
+
"Processing file 4193147-11111706.pdf\n",
|
403 |
+
"Processing file 2504031-joseph-brennick-handwritten-memos.pdf\n",
|
404 |
+
"Processing file 21108969-pvs-amended-handwritten-41-page-motion-for-reconsideration-en-banc-11317.pdf\n",
|
405 |
+
"Processing file 5190870-WasteWS-07062018-121609-HANDWRITTEN-NO-OCR-Pdf.pdf\n",
|
406 |
+
"Processing file 3223420-Scan-20111125-142900.pdf\n",
|
407 |
+
"Processing file 555087-vh-defrock-letter.pdf\n",
|
408 |
+
"Processing file 5002051-Doc-10-09-2018-12-39-47.pdf\n",
|
409 |
+
"Processing file 5634528-181217-Flynn-Fbi-Notes.pdf\n",
|
410 |
+
"Processing file 5025013-J-R-Thomas-response.pdf\n",
|
411 |
+
"Processing file 3223527-Scan-20111202-190234.pdf\n",
|
412 |
+
"Processing file 4184490-HANDWRITTEN-BERNARD-BARKER-CHRONOLOGY.pdf\n",
|
413 |
+
"Processing file 4911037-Manuel-Orrego-Savala-Letter.pdf\n",
|
414 |
+
"Processing file 6571779-Frederick-Veal-Interview-Notes.pdf\n",
|
415 |
+
"Processing file 1096680-alexandra-hollinghurst-notes.pdf\n",
|
416 |
+
"Processing file 5780462-GGGW-v-Schwitzer-Handwritten-Ex-Staffer-Stmt.pdf\n",
|
417 |
+
"Processing file 321762-gina-hutchinson-journal-entry.pdf\n",
|
418 |
+
"Processing file 566855-inslee-letter.pdf\n",
|
419 |
+
"Processing file 2811636-Callis-Handwritten-Letter-1999-13-09.pdf\n",
|
420 |
+
"Processing file 803827-a-mothers-letter-to-the-foundling-asylum.pdf\n",
|
421 |
+
"Processing file 4704949-Bechtel-Notes.pdf\n",
|
422 |
+
"Processing file 6771659-Preston-Handwritten-Response.pdf\n",
|
423 |
+
"Processing file 776417-helios-notes.pdf\n",
|
424 |
+
"Processing file 4191914-FORM-MEMORANDUM-FOR-THE-RECORD-HARVEY-LEE-OSWALD.pdf\n",
|
425 |
+
"Processing file 3462229-Notes-1.pdf\n",
|
426 |
+
"Processing file 21030480-doj-notes.pdf\n",
|
427 |
+
"Processing file 1658584-1admission-notes-redacted.pdf\n",
|
428 |
+
"Processing file 1668278-doc066.pdf\n",
|
429 |
+
"Processing file 5332373-1-AEA-2080-NOTES-HANDWRITTEN-CHRONOLOGY-EVENTS.pdf\n",
|
430 |
+
"Processing file 528420-handwritten-notes-of-meeting-with-cori-june-26.pdf\n",
|
431 |
+
"Processing file 2777703-AV-Interview-Notes.pdf\n",
|
432 |
+
"Processing file 2163779-skm-c224e15070214420-pdf-handwritten-notes-about.pdf\n",
|
433 |
+
"Processing file 266726-boogaard-journal.pdf\n"
|
434 |
+
]
|
435 |
+
}
|
436 |
+
],
|
437 |
+
"source": [
|
438 |
+
"process_training_data()"
|
439 |
+
]
|
440 |
+
}
|
441 |
+
],
|
442 |
+
"metadata": {
|
443 |
+
"kernelspec": {
|
444 |
+
"display_name": "Python (hydra)",
|
445 |
+
"language": "python",
|
446 |
+
"name": "hydra"
|
447 |
+
},
|
448 |
+
"language_info": {
|
449 |
+
"codemirror_mode": {
|
450 |
+
"name": "ipython",
|
451 |
+
"version": 3
|
452 |
+
},
|
453 |
+
"file_extension": ".py",
|
454 |
+
"mimetype": "text/x-python",
|
455 |
+
"name": "python",
|
456 |
+
"nbconvert_exporter": "python",
|
457 |
+
"pygments_lexer": "ipython3",
|
458 |
+
"version": "3.13.1"
|
459 |
+
},
|
460 |
+
"widgets": {
|
461 |
+
"application/vnd.jupyter.widget-state+json": {
|
462 |
+
"state": {},
|
463 |
+
"version_major": 2,
|
464 |
+
"version_minor": 0
|
465 |
+
}
|
466 |
+
}
|
467 |
+
},
|
468 |
+
"nbformat": 4,
|
469 |
+
"nbformat_minor": 5
|
470 |
+
}
|
pre_processing.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import os
|
3 |
+
import pandas as pd
|
4 |
+
import pdf2image as p2i
|
5 |
+
import pytesseract
|
6 |
+
from os import path
|
7 |
+
from PIL import Image
|
8 |
+
from typing import List, Tuple
|
9 |
+
from transformers import BertTokenizer
|
10 |
+
from constants import (RAW_DATA_DIR,
|
11 |
+
PROCESSED_DATA_DIR,
|
12 |
+
METADATA_FILEPATH,
|
13 |
+
BERT_BASE,
|
14 |
+
MAX_SEQUENCE_LENGHT,
|
15 |
+
FilePath,
|
16 |
+
PageMetadata)
|
17 |
+
|
18 |
+
# Allow for unlimited image size, some documents are pretty big...
|
19 |
+
Image.MAX_IMAGE_PIXELS = None
|
20 |
+
|
21 |
+
|
22 |
+
def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
    """Build the output image/text file paths for one document page.

    Pages are grouped by label on disk:
    PROCESSED_DATA_DIR/<label>/<basename>_<page_index>.{jpg,txt}.
    The per-label directory is created if it does not exist yet.
    """
    label_dir = path.join(PROCESSED_DATA_DIR, label)
    os.makedirs(label_dir, exist_ok=True)

    page_stem = path.join(label_dir, f'{basename}_{page_index}')
    return f'{page_stem}.jpg', f'{page_stem}.txt'
32 |
+
|
33 |
+
def tokenize_text(text: str) -> Tuple[List[int], List[int]]:
    """Tokenize `text` with the BERT base tokenizer.

    Returns the (input_ids, attention_mask) pair, truncated to
    MAX_SEQUENCE_LENGHT tokens.

    The tokenizer is loaded lazily on first use and cached on the function
    object: the previous implementation re-loaded it from disk on every call,
    and this function runs once per document page.
    """
    tokenizer = getattr(tokenize_text, '_tokenizer', None)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(BERT_BASE)
        tokenize_text._tokenizer = tokenizer

    tokenized = tokenizer(
        text,
        # padding=True pads to the longest sequence in the batch; for a
        # single string it is effectively a no-op but is kept for parity.
        padding=True,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGHT,
    )

    return tokenized['input_ids'], tokenized['attention_mask']
|
44 |
+
|
45 |
+
def process_pdf_file(pdf_filepath: FilePath) -> List[PageMetadata]:
    """Convert one PDF into per-page JPEG + OCR text files and return metadata.

    Each page is rendered to an image, OCR'd with tesseract, tokenized with
    the BERT tokenizer and saved under a label-specific folder (see
    make_page_filepaths). Returns one metadata dict per page; empty files
    are skipped and yield an empty list.
    """
    if path.getsize(pdf_filepath) == 0:
        # TODO: substitute for logging
        print(f'{pdf_filepath} is empty, skipping')
        return []

    pages: List[Image.Image] = p2i.convert_from_path(pdf_filepath)
    pages_metadata: List[PageMetadata] = []

    # The immediate parent folder name encodes the document type,
    # e.g. data/raw/<doctype>/file.pdf.
    _, doctype = path.split(path.dirname(pdf_filepath))
    base_filename = path.basename(path.splitext(pdf_filepath)[0])
    for page_i, page in enumerate(pages):
        label = 'other'
        if page_i == 0:
            label = doctype
        # The last page gets a dedicated '<doctype>-last' label; for a
        # single-page document this overrides the first-page label above.
        if page_i == len(pages) - 1:
            label = f'{doctype}-last'

        out_img_filepath, out_txt_filepath = make_page_filepaths(
            base_filename, label, page_i
        )

        page.save(out_img_filepath)

        ocr_text = pytesseract.image_to_string(page)
        input_ids, attention_mask = tokenize_text(ocr_text)
        with open(out_txt_filepath, 'w') as out_txt_file:
            out_txt_file.write(ocr_text)

        pages_metadata.append({
            'page_number': page_i + 1,
            'pdf_filepath': path.relpath(pdf_filepath, start='.'),
            'img_filepath': out_img_filepath,
            'txt_filepath': out_txt_filepath,
            # Persist the tokenized OCR text: the training script builds its
            # datasets from the 'input_ids' / 'attention_mask' columns of the
            # metadata CSV, so dropping them here would break training.
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'width': page.width,
            'height': page.height,
            'label': label,
        })

    return pages_metadata
86 |
+
|
87 |
+
def process_training_data() -> pd.DataFrame:
    """Walk RAW_DATA_DIR, process every PDF and write the metadata CSV.

    Only files with a .pdf extension inside per-doctype subfolders are
    processed. Returns the per-page metadata as a DataFrame, which is also
    written to METADATA_FILEPATH.
    """
    # process_pdf_file returns a list of page dicts and extend() flattens
    # them, so this is a flat list (not List[List[PageMetadata]]).
    pages_metadata: List[PageMetadata] = []

    for dirname, _, files in os.walk(RAW_DATA_DIR):
        # Skip the raw-data root itself: PDFs live in per-doctype subfolders
        # and the folder name is used as the label.
        if path.samefile(dirname, RAW_DATA_DIR):
            continue

        print(f'Processing folder {dirname}')

        for filename in files:
            _, ext = path.splitext(filename)

            # Avoid processing non-document files
            if ext.lower() == '.pdf':
                # Bug fix: the f-string had no placeholder and printed the
                # literal text instead of the file being processed.
                print(f'Processing file {filename}')
                pdf_filepath = path.join(dirname, filename)
                pages_metadata.extend(process_pdf_file(pdf_filepath))

    pages_metadata_df = pd.DataFrame(pages_metadata)
    print(f'Writing metadata to {METADATA_FILEPATH}')
    pages_metadata_df.to_csv(METADATA_FILEPATH, index=False)
    return pages_metadata_df


if __name__ == '__main__':
    # Guard the side-effecting run so importing this module does not
    # re-process the whole dataset.
    process_training_data()
|
requirements.txt
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
argcomplete==3.3.0
|
2 |
+
astunparse==1.6.3
|
3 |
+
gast==0.6.0
|
4 |
+
kagglehub==0.3.6
|
5 |
+
keras-hub==0.18.1
|
6 |
+
libclang==18.1.1
|
7 |
+
Markdown==3.7
|
8 |
+
markdown-it-py==3.0.0
|
9 |
+
numpy==2.0.2
|
10 |
+
opt_einsum==3.4.0
|
11 |
+
optree==0.14.0
|
12 |
+
packaging==24.2
|
13 |
+
pip-autoremove==0.10.0
|
14 |
+
pipx==1.5.0
|
15 |
+
platformdirs==4.2.2
|
16 |
+
protobuf==5.29.3
|
17 |
+
requests==2.32.3
|
18 |
+
setuptools==75.8.0
|
19 |
+
six==1.17.0
|
20 |
+
tensorboard==2.18.0
|
21 |
+
tensorflow-text==2.18.1
|
22 |
+
tf_keras==2.18.0
|
23 |
+
typing_extensions==4.12.2
|
24 |
+
userpath==1.9.2
|
25 |
+
Werkzeug==3.1.3
|
26 |
+
wheel==0.44.0
|
training.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import tensorflow as tf
|
3 |
+
import tf_keras as keras
|
4 |
+
from constants import (PROCESSED_DATA_DIR,
|
5 |
+
METADATA_FILEPATH,
|
6 |
+
BATCH_SIZE,
|
7 |
+
EPOCHS,
|
8 |
+
BERT_BASE,
|
9 |
+
MAX_SEQUENCE_LENGHT,
|
10 |
+
FilePath,
|
11 |
+
PageMetadata,
|
12 |
+
ImageSize,
|
13 |
+
ImageInputShape)
|
14 |
+
from pandera.typing import DataFrame
|
15 |
+
from typing import Tuple, List
|
16 |
+
from transformers import TFBertModel
|
17 |
+
from tf_keras import layers, models
|
18 |
+
from PIL import Image
|
19 |
+
|
20 |
+
# Allow for unlimited image size, some documents are pretty big...
|
21 |
+
Image.MAX_IMAGE_PIXELS = None
|
22 |
+
|
23 |
+
|
24 |
+
def stratified_split(
    df: pd.DataFrame,
    train_frac: float,
    val_frac: float,
    test_frac: float,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split `df` into train/val/test frames, stratified by the 'label' column.

    Rows are taken per label group in their current order — no shuffling is
    performed here, so shuffle the input beforehand if a random split is
    wanted. `test_frac` is kept for interface symmetry but is not used
    directly: the test split receives whatever remains after the train and
    validation cuts, so integer rounding never drops rows.
    """
    train_dfs, val_dfs, test_dfs = [], [], []

    # The group key itself is not needed, only the per-label slices.
    for _, group in df.groupby('label'):
        n = len(group)
        train_end = int(n * train_frac)
        val_end = train_end + int(n * val_frac)

        train_dfs.append(group.iloc[:train_end])
        val_dfs.append(group.iloc[train_end:val_end])
        test_dfs.append(group.iloc[val_end:])

    train_df = pd.concat(train_dfs).reset_index(drop=True)
    val_df = pd.concat(val_dfs).reset_index(drop=True)
    test_df = pd.concat(test_dfs).reset_index(drop=True)

    return train_df, val_df, test_df
|
46 |
+
|
47 |
+
|
48 |
+
def dataset_from_dataframe(df: pd.DataFrame) -> tf.data.Dataset:
    """Wrap the per-page metadata columns in a tf.data.Dataset.

    Each element is the 4-tuple
    (img_filepath, input_ids, attention_mask, label).
    """
    columns = ('img_filepath', 'input_ids', 'attention_mask', 'label')
    tensors = tuple(df[column].values for column in columns)
    return tf.data.Dataset.from_tensor_slices(tensors)
|
55 |
+
|
56 |
+
|
57 |
+
def load_image(image_path: FilePath, image_size: ImageSize) -> Image:
    """Load a JPEG from disk, resize it to `image_size` and scale it to [0, 1].

    `image_size` is (height, width) — it is built at module level as
    (median_height, median_width) — and tf.image.resize also expects
    [height, width]. The original code unpacked the tuple into swapped
    names (img_width, img_height) and was correct only by accident; the
    tensor passed to resize is unchanged here, only the names are fixed.
    """
    img_height, img_width = image_size

    # Load image
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [img_height, img_width])
    # resize returns float32, so this is true float division.
    image /= 255.0

    return image
|
67 |
+
|
68 |
+
|
69 |
+
def prepare_dataset(
    ds: tf.data.Dataset,
    image_size: ImageSize,
    batch_size=32,
    buffer_size=1000
) -> tf.data.Dataset:
    """Turn a metadata dataset into shuffled, batched model-ready batches.

    Maps each (img_filepath, input_ids, attention_mask, label) element to
    ((image, input_ids, attention_mask), label), loading and resizing the
    image, then shuffles, batches and prefetches.
    """
    def _to_model_inputs(
        img_path: FilePath,
        input_ids: List[int],
        attention_mask: List[int],
        label: str
    ):
        features = (load_image(img_path, image_size), input_ids, attention_mask)
        return (features, label)

    mapped = ds.map(
        _to_model_inputs,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    shuffled = mapped.shuffle(buffer_size=buffer_size)
    batched = shuffled.batch(batch_size)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)
|
91 |
+
|
92 |
+
|
93 |
+
# --- Module-level data preparation (runs on import) ---
# Load the per-page metadata produced by pre_processing.py.
metadata_df: DataFrame[PageMetadata] = pd.read_csv(METADATA_FILEPATH)
# NOTE(review): this subsamples to 50 rows — looks like a debugging/smoke-test
# leftover; confirm before training on the full dataset.
metadata_df = metadata_df.sample(n=50, random_state=42)

# Target image size: the median page dimensions across the (sampled) corpus.
# img_size is (height, width); load_image/prepare_dataset expect that order.
median_height = int(metadata_df['height'].median())
median_width = int(metadata_df['width'].median())
img_size: ImageSize = (median_height, median_width)
img_input_shape: ImageInputShape = img_size + (3,)

# One class per label sub-directory created by the pre-processing step.
label_names: List[str] = sorted(
    [d.name for d in PROCESSED_DATA_DIR.iterdir() if d.is_dir()]
)
num_classes = len(label_names)

print('Splitting the DataFrame into training, validation and test')
train_df, val_df, test_df = stratified_split(
    metadata_df,
    train_frac=0.7,
    val_frac=0.15,
    test_frac=0.15,
)

# NOTE(review): dataset_from_dataframe reads 'input_ids' and 'attention_mask'
# columns — verify the metadata CSV actually contains them.
print('Batching and shuffling the datasets')
train_ds = dataset_from_dataframe(train_df)
train_ds = prepare_dataset(train_ds, img_size, batch_size=BATCH_SIZE)

val_ds = dataset_from_dataframe(val_df)
val_ds = prepare_dataset(val_ds, img_size, batch_size=BATCH_SIZE)

test_ds = dataset_from_dataframe(test_df)
test_ds = prepare_dataset(test_ds, img_size, batch_size=BATCH_SIZE)
|
123 |
+
|
124 |
+
|
125 |
+
def build_image_model(input_shape: ImageInputShape) -> keras.Model:
    """Build the CNN branch that embeds a page image into a 512-d vector.

    Four Conv2D+MaxPooling stages (32/64/128/128 filters) followed by a
    flatten and a dense projection.
    """
    model_layers = [layers.Input(shape=input_shape)]
    for n_filters in (32, 64, 128, 128):
        model_layers.append(layers.Conv2D(n_filters, (3, 3), activation='relu'))
        model_layers.append(layers.MaxPooling2D((2, 2)))
    model_layers.append(layers.Flatten())
    model_layers.append(layers.Dense(512, activation='relu'))

    img_model = models.Sequential(model_layers, name='image_classification')

    img_model.summary()
    return img_model
|
142 |
+
|
143 |
+
|
144 |
+
def build_text_model() -> keras.Model:
    """Build the BERT branch that embeds the page's OCR tokens.

    Takes (input_ids, attention_mask) of length MAX_SEQUENCE_LENGHT and
    outputs BERT's pooled [CLS] representation.
    """
    bert = TFBertModel.from_pretrained(BERT_BASE)

    token_inputs = {
        name: layers.Input(
            shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name=name
        )
        for name in ('input_ids', 'attention_mask')
    }

    # Index 1 of the BERT output tuple is the pooled output, i.e. the
    # representation of the [CLS] token.
    pooled_output = bert(
        input_ids=token_inputs['input_ids'],
        attention_mask=token_inputs['attention_mask'],
    )[1]

    text_model = models.Model(
        inputs=[token_inputs['input_ids'], token_inputs['attention_mask']],
        outputs=pooled_output,
        name='bert'
    )
    text_model.summary()

    return text_model
|
166 |
+
|
167 |
+
|
168 |
+
def build_multimodal_model(
        num_classes: int,
        img_input_shape: ImageInputShape
) -> keras.Model:
    """Build the image+text classifier over concatenated branch features.

    Combines the CNN image embedding and the BERT text embedding, then
    classifies into `num_classes` with a small dense head.
    """
    img_model = build_image_model(img_input_shape)
    text_model = build_text_model()

    img_input = layers.Input(shape=img_input_shape, name='img_input')
    text_input_ids = layers.Input(
        shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='text_input_ids'
    )
    text_input_mask = layers.Input(
        shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='text_input_mask'
    )

    img_features = img_model(img_input)
    text_features = text_model([text_input_ids, text_input_mask])

    # Consistency fix: the classifier head previously mixed `tf.keras.layers`
    # with the `tf_keras` layers/models used everywhere else in this file.
    # With TF >= 2.16, `tf.keras` may resolve to Keras 3 rather than tf_keras,
    # and mixing the two implementations in one graph can fail; use the
    # file's tf_keras imports throughout.
    classification_layers = models.Sequential([
        layers.Dense(512, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),
    ], name='classification_layers')
    concat_features = layers.concatenate([img_features, text_features],
                                         name='concatenate_features')
    outputs = classification_layers(concat_features)

    multimodal_model = models.Model(
        inputs=[img_input, text_input_ids, text_input_mask],
        outputs=outputs,
        name='multimodal_document_page_classifier'
    )
    return multimodal_model
|
200 |
+
|
201 |
+
|
202 |
+
# --- Module-level training run (executes on import) ---
multimodal_model = build_multimodal_model(num_classes, img_input_shape)
multimodal_model.summary()
multimodal_model.compile(
    optimizer='adam',
    # NOTE(review): sparse_categorical_crossentropy expects integer class
    # ids, but the 'label' values written by pre-processing are strings
    # (folder names) — verify labels are encoded to ints before fit().
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
multimodal_model.fit(
    train_ds,
    epochs=EPOCHS,
    # NOTE(review): train_ds is already batched by prepare_dataset; Keras
    # ignores batch_size when fitting on a tf.data.Dataset.
    batch_size=BATCH_SIZE,
    validation_data=val_ds,
)
|