João Pedro committed
Commit 861182c · 1 Parent(s): b7d38ff

adding files from github

Files changed (6)
  1. constants.py +29 -0
  2. environment.yml +20 -0
  3. pre_processing.ipynb +470 -0
  4. pre_processing.py +111 -0
  5. requirements.txt +26 -0
  6. training.py +214 -0
constants.py ADDED
@@ -0,0 +1,29 @@
+ from pathlib import Path
+ from typing import TypedDict, Union, TypeAlias, Tuple
+
+ # Constants
+ DATA_DIR = Path('./data')
+ RAW_DATA_DIR = DATA_DIR / 'raw'
+ PROCESSED_DATA_DIR = DATA_DIR / 'processed'
+ METADATA_FILEPATH = DATA_DIR / 'metadata.csv'
+
+ BATCH_SIZE = 8
+ EPOCHS = 1
+ BERT_BASE = 'bert-base-uncased'
+ MAX_SEQUENCE_LENGHT = 512
+ MODEL_DIR = Path('./model')
+
+ # Types
+ FilePath: TypeAlias = Union[str, Path]
+
+
+ class PageMetadata(TypedDict):
+     page_number: int
+     file_relpath: FilePath
+     width: int
+     height: int
+     label: str
+
+
+ ImageSize: TypeAlias = Tuple[int, int]
+ ImageInputShape: TypeAlias = Tuple[int, int, int]
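A minimal usage sketch (not part of the commit) of the shared types above; the path and label values are illustrative only:

from constants import FilePath, PageMetadata

# Hypothetical example record matching the PageMetadata schema above;
# the relative path here is made up for illustration.
page_path: FilePath = 'raw/form/example.pdf'
example_page: PageMetadata = {
    'page_number': 1,
    'file_relpath': page_path,
    'width': 1700,
    'height': 2200,
    'label': 'form',
}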
environment.yml ADDED
@@ -0,0 +1,20 @@
+ name: hydra
+ channels:
+   - conda-forge
+ dependencies:
+   - ipykernel
+   - ipython
+   - ipython_genutils
+   - jupyter_client
+   - jupyter_core
+   - pandas
+   - pandera
+   - pdf2image
+   - pytables
+   - pytesseract
+   - python=3.12
+   - python-lsp-server
+   - scikit-learn
+   - tesseract
+   - traitlets
+
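Assuming a standard conda setup, this environment can be created with conda env create -f environment.yml and exposed to Jupyter with python -m ipykernel install --user --name hydra --display-name "Python (hydra)", which matches the Python (hydra) kernel that the notebook below expects.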
pre_processing.ipynb ADDED
@@ -0,0 +1,470 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "63d95b8f-7559-4a3d-b025-5bb788914dc9",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "import numpy as np\n",
+     "import pandas as pd\n",
+     "import pdf2image as p2i\n",
+     "from os import path\n",
+     "from pathlib import Path\n",
+     "from PIL import Image\n",
+     "\n",
+     "# Allow for unlimited image size, some documents are pretty big...\n",
+     "Image.MAX_IMAGE_PIXELS = None\n",
+     "\n",
+     "\n",
+     "def process_pdf_document(filepath):\n",
+     "    if path.getsize(filepath) == 0:\n",
+     "        # TODO: substitute for logging\n",
+     "        print(f'{filepath} is empty, skipping')\n",
+     "        return []\n",
+     "\n",
+     "    pages = p2i.convert_from_path(filepath)\n",
+     "    processed_pages: list[dict] = []\n",
+     "\n",
+     "    label = 'other'\n",
+     "    root_dir, doctype = Path(filepath).parts[:2]\n",
+     "    for page_i, page in enumerate(pages):\n",
+     "        if page_i == 0:\n",
+     "            label = doctype\n",
+     "        elif page_i == len(pages) - 1:\n",
+     "            label = f'{label}-last'\n",
+     "\n",
+     "        processed_pages.append({\n",
+     "            'filepath': filepath,\n",
+     "            'width': page.width,\n",
+     "            'height': page.height,\n",
+     "            'bytes': page.tobytes(),\n",
+     "            'label': label,\n",
+     "        })\n",
+     "\n",
+     "    return processed_pages\n",
+     "\n",
+     "\n",
+     "def process_training_data():\n",
+     "    data_dir = Path('./data')\n",
+     "\n",
+     "    for dirname, _, files in os.walk(data_dir):\n",
+     "        print(f'Processing folder {dirname}')\n",
+     "        doctype = path.basename(dirname)\n",
+     "        df = pd.DataFrame()\n",
+     "\n",
+     "        for filename in files:\n",
+     "            print(f'Processing file {filename}')\n",
+     "            filepath = path.join(dirname, filename)\n",
+     "            _, ext = path.splitext(filepath)\n",
+     "\n",
+     "            if ext.lower() == '.pdf':\n",
+     "                processed_pages = process_pdf_document(filepath)\n",
+     "                df = pd.concat([df, pd.DataFrame(processed_pages)], ignore_index=True)\n",
+     "\n",
+     "        parquet_filepath = path.join(data_dir, f'{doctype}.parquet')\n",
+     "        print(f'Saving data for {doctype} in {parquet_filepath}')\n",
+     "        print(df)\n",
+     "        df.to_parquet(parquet_filepath)"
+    ]
+   },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "id": "1efb8e82-92bc-46dc-9553-cdc712211c93",
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "Processing folder data\n",
84
+ "Processing file data.parquet\n",
85
+ "Processing file .DS_Store\n",
86
+ "Saving data for data in data/data.parquet\n",
87
+ "Empty DataFrame\n",
88
+ "Columns: []\n",
89
+ "Index: []\n",
90
+ "Processing folder data/form\n",
91
+ "Processing file 1384753-request-form.pdf\n",
92
+ "Processing file 772791-12_sipc1301f_peer-individual-evaluation-form.pdf\n",
93
+ "Processing file 6428556-Cassatt-Form-D-2008.pdf\n",
94
+ "Processing file 3895717-INFORMATION-DISCLOSURE-REQUEST-FORM.pdf\n",
95
+ "Processing file 4321053-document-21556514.pdf\n",
96
+ "Processing file 4575531-REDACTED-PIW-CR285646.pdf\n",
97
+ "Processing file 6155510-Form-I.pdf\n",
98
+ "Processing file 3885938-JayaServicesLtd-Appointment-15112002.pdf\n",
99
+ "Processing file 4950830-Cook1995.pdf\n",
100
+ "Processing file 6368627-2013-Southern-Company-990.pdf\n",
101
+ "Processing file 4436484-Columbia-Armed-Guard-application-form.pdf\n",
102
+ "Processing file 21074299-company-application-form-for-dominicana-acquisition-sa.pdf\n",
103
+ "Processing file 746365-wa-madsen-barbara-2012.pdf\n",
104
+ "Processing file 5770184-CAMCF-RFYL-Registration-Form.pdf\n",
105
+ "Processing file 1683073-medication-info-form-eng-revised-2014.pdf\n",
106
+ "Processing file 4423347-Request-Form.pdf\n",
107
+ "Processing file 3022163-Event-Form.pdf\n",
108
+ "Processing file 1685242-cv-fresh-entry-form.pdf\n",
109
+ "Processing file 4910987-Records-Request-Form.pdf\n",
110
+ "Processing file 4349763-DOD-Form-254.pdf\n",
111
+ "Processing file 3885273-EIMEuropeanInvestmentsManagement-ChangeofName.pdf\n",
112
+ "Processing file 4951858-Ober3.pdf\n",
113
+ "Processing file 7034791-UOF-form.pdf\n",
114
+ "Processing file 6206290-Thunderbird-GBC1-Application-Form.pdf\n",
115
+ "Processing file 465336-disclosure-form-13491099365164-_-pdf.pdf\n",
116
+ "Processing file 6538004-Aderholt-Travel-Disclosure-Form.pdf\n",
117
+ "Processing file 1292759-host-accept-form-attachment.pdf\n",
118
+ "Processing file 1238787-request-form.pdf\n",
119
+ "Processing file 6173981-Cert-of-ID-form.pdf\n",
120
+ "Processing file 3220944-Sitton-July-2015-Campaign-Finance-Report.pdf\n",
121
+ "Processing file 6882316-Request-Form.pdf\n",
122
+ "Processing file 4111213-Coffman-Election-Filings.pdf\n",
123
+ "Processing file 4773259-RECORDS-REQUEST-FORM.pdf\n",
124
+ "Processing file 3116806-Black-River-Technical-College-Request-for-C-23.pdf\n",
125
+ "Processing file 761264-13_ee402a01_expository-essay-interpretation-of.pdf\n",
126
+ "Processing file 5955192-North-Dakota-corn-dogs.pdf\n",
127
+ "Processing file 6882816-Disaster-Relief-Cash-Assistance-Application-Form.pdf\n",
128
+ "Processing file 1303092-appeal-form.pdf\n",
129
+ "Processing file 3224254-CCT-Holidayhope-2016-Form.pdf\n",
130
+ "Processing file 528353-disclosure-form-13512855487444-_-pdf.pdf\n",
131
+ "Processing file 696442-nms-timeline-complaint.pdf\n",
132
+ "Processing file 842767-shallotte-assisted-living-penalty-packet-4.pdf\n",
133
+ "Processing file 4484638-GENERAL-APRA-form-2018.pdf\n",
134
+ "Processing file 20393262-margaux-keiser-form-460.pdf\n",
135
+ "Processing file 6785703-Release-form.pdf\n",
136
+ "Processing file 1683896-rtk-form-16241-pdf.pdf\n",
137
+ "Processing file 3234225-Philadelphia-Form-30401-pdf.pdf\n",
138
+ "Processing file 442530-collect-files-35870-political-file-2012-non.pdf\n",
139
+ "Processing file 4108784-2012-FSA-Enrollment-Form.pdf\n",
140
+ "Processing file 2426599-opega-review-request-form.pdf\n",
141
+ "Processing file 20429474-caserta-evidence-list.pdf\n",
142
+ "Processing file 7001493-ABC-Trailblazer-Application-Form-2020-21.pdf\n",
143
+ "Processing file 6310093-Registration.pdf\n",
144
+ "Processing file 21085990-mcguire-3q21.pdf\n",
145
+ "Processing file 6879322-Form-8871.pdf\n",
146
+ "Processing file 21182127-20220118-docketing-statement-b.pdf\n",
147
+ "Processing file 5518517-Corbett-email-PRR-017155-pdf.pdf\n",
148
+ "Processing file 4058754-Records-Release-pdf.pdf\n",
149
+ "Processing file 1386200-annual-disclosure-request-form.pdf\n",
150
+ "Processing file 7202517-Photo-Release-Form.pdf\n",
151
+ "Processing file 3860778-810A-Evidence-Submission-Form.pdf\n",
152
+ "Processing file 5899836-Medford-Tp-summary-form.pdf\n",
153
+ "Processing file 5776714-CRID-1046835.pdf\n",
154
+ "Processing file 6536846-Transcript-Request.pdf\n",
155
+ "Processing file 7041210-Development-Application-Form.pdf\n",
156
+ "Processing file 466205-wa-united-for-marriage-avails-request-form.pdf\n",
157
+ "Processing file 7000910-NCFPD-CORA-Request-Form.pdf\n",
158
+ "Processing file 21884855-dos-22-04-051.pdf\n",
159
+ "data/form/21884855-dos-22-04-051.pdf is empty, skipping\n",
160
+ "Saving data for form in data/form.parquet\n",
161
+ " filepath width height \\\n",
162
+ "0 data/form/1384753-request-form.pdf 1700 2200 \n",
163
+ "1 data/form/772791-12_sipc1301f_peer-individual-... 1700 2200 \n",
164
+ "2 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
165
+ "3 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
166
+ "4 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
167
+ ".. ... ... ... \n",
168
+ "618 data/form/7041210-Development-Application-Form... 1700 2200 \n",
169
+ "619 data/form/7041210-Development-Application-Form... 1700 2200 \n",
170
+ "620 data/form/7041210-Development-Application-Form... 1700 2200 \n",
171
+ "621 data/form/466205-wa-united-for-marriage-avails... 1700 2200 \n",
172
+ "622 data/form/7000910-NCFPD-CORA-Request-Form.pdf 1700 2200 \n",
173
+ "\n",
174
+ " bytes label \n",
175
+ "0 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
176
+ "1 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
177
+ "2 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
178
+ "3 b'\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe... form \n",
179
+ "4 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
180
+ ".. ... ... \n",
181
+ "618 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
182
+ "619 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
183
+ "620 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form-last \n",
184
+ "621 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
185
+ "622 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
186
+ "\n",
187
+ "[623 rows x 5 columns]\n",
188
+ "Processing folder data/scientific publication\n",
189
+ "Processing file 6425893-Oxford-Journals-Article.pdf\n",
190
+ "Processing file websearch_7_00001_scientific_publication.pdf\n",
191
+ "Processing file websearch_7_00017_scientific_publication.pdf\n",
192
+ "Processing file websearch_7_00022_scientific_publication.pdf\n",
193
+ "Processing file 3923078-Paper-1.pdf\n",
194
+ "Processing file websearch_7_00025_scientific_publication.pdf\n",
195
+ "Processing file websearch_7_00010_scientific_publication.pdf\n",
196
+ "Processing file 4247250-JACOBSON-Original-Article.pdf\n",
197
+ "Processing file websearch_7_00006_scientific_publication.pdf\n",
198
+ "Processing file 3559070-McCance-Katz-Et-Al-2017-the-American-Journal-on.pdf\n",
199
+ "Processing file websearch_7_00011_scientific_publication.pdf\n",
200
+ "Processing file websearch_7_00024_scientific_publication.pdf\n",
201
+ "Processing file websearch_7_00007_scientific_publication.pdf\n",
202
+ "Processing file websearch_7_00000_scientific_publication.pdf\n",
203
+ "Processing file websearch_7_00023_scientific_publication.pdf\n",
204
+ "Processing file 21085273-wood-schulman-article.pdf\n",
205
+ "Processing file websearch_7_00016_scientific_publication.pdf\n",
206
+ "Processing file 2189923-lancet-article-from-1989.pdf\n",
207
+ "Processing file 7223327-Forensic-Article.pdf\n",
208
+ "Processing file websearch_7_00009_scientific_publication.pdf\n",
209
+ "Processing file websearch_7_00005_scientific_publication.pdf\n",
210
+ "Processing file 6429770-Anderson2012-Article.pdf\n",
211
+ "Processing file websearch_7_00013_scientific_publication.pdf\n",
212
+ "Processing file websearch_7_00026_scientific_publication.pdf\n",
213
+ "Processing file websearch_7_00021_scientific_publication.pdf\n",
214
+ "Processing file websearch_7_00018_scientific_publication.pdf\n",
215
+ "Processing file websearch_7_00014_scientific_publication.pdf\n",
216
+ "Processing file websearch_7_00002_scientific_publication.pdf\n",
217
+ "Processing file websearch_7_00019_scientific_publication.pdf\n",
218
+ "Processing file websearch_7_00015_scientific_publication.pdf\n",
219
+ "Processing file websearch_7_00020_scientific_publication.pdf\n",
220
+ "Processing file websearch_7_00003_scientific_publication.pdf\n",
221
+ "Processing file 5980147-Venu-B-Article-1.pdf\n",
222
+ "Processing file 20519853-2007-recommendations-for-medical-management-of-adult-lead-exposure.pdf\n",
223
+ "Processing file 4343429-Meltdown-paper.pdf\n",
224
+ "Processing file 785243-paper.pdf\n",
225
+ "Processing file websearch_7_00008_scientific_publication.pdf\n",
226
+ "Processing file websearch_7_00004_scientific_publication.pdf\n",
227
+ "Processing file websearch_7_00012_scientific_publication.pdf\n",
228
+ "Saving data for scientific publication in data/scientific publication.parquet\n",
229
+ " filepath width height \\\n",
230
+ "0 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
231
+ "1 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
232
+ "2 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
233
+ "3 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
234
+ "4 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
235
+ ".. ... ... ... \n",
236
+ "532 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
237
+ "533 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
238
+ "534 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
239
+ "535 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
240
+ "536 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
241
+ "\n",
242
+ " bytes \\\n",
243
+ "0 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
244
+ "1 b'\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe... \n",
245
+ "2 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
246
+ "3 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
247
+ "4 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
248
+ ".. ... \n",
249
+ "532 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
250
+ "533 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
251
+ "534 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
252
+ "535 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
253
+ "536 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
254
+ "\n",
255
+ " label \n",
256
+ "0 scientific publication \n",
257
+ "1 scientific publication \n",
258
+ "2 scientific publication \n",
259
+ "3 scientific publication \n",
260
+ "4 scientific publication \n",
261
+ ".. ... \n",
262
+ "532 scientific publication \n",
263
+ "533 scientific publication \n",
264
+ "534 scientific publication \n",
265
+ "535 scientific publication \n",
266
+ "536 scientific publication-last \n",
267
+ "\n",
268
+ "[537 rows x 5 columns]\n",
269
+ "Processing folder data/handwritten\n",
270
+ "Processing file 3223324-Scan-20111118-195759.pdf\n",
271
+ "Processing file 20743971-handwritten-statement.pdf\n",
272
+ "Processing file 21030538-doj-donoghue-notes.pdf\n",
273
+ "Processing file 519547-sere-psychologist-and-bush-admin-torture.pdf\n",
274
+ "Processing file 6240196-Zenith-Rathfelder-Notes.pdf\n",
275
+ "Processing file 1630443-nick-stoneman-notes.pdf\n",
276
+ "Processing file 4182029-HANDWRITTEN-COMPILATION-OF-COLLECTION.pdf\n",
277
+ "Processing file 528421-handwritten-notes-on-financial-offer-from-orders.pdf\n",
278
+ "Processing file 5498003-1996-08-11-Handwritten-Law-Enforcement-Notes.pdf\n",
279
+ "Processing file 202587-doc21.pdf\n",
280
+ "Processing file 5190901-WasteWS-07062018-145153-Handwritten-Letter-NO-OCR.pdf\n",
281
+ "Processing file 625990-thatcher-mt-notes.pdf\n",
282
+ "Processing file 782204-loeser-will.pdf\n",
283
+ "Processing file 2189928-handwritten-agreement.pdf\n",
284
+ "Processing file 1223362-gauthe-clinical-notes.pdf\n",
285
+ "Processing file 4190352-MEMORANDUM-HIDALGO-BALMES-HANDWRITTEN.pdf\n",
286
+ "Processing file 6939323-7-8-11-83-Handwritten-Notes.pdf\n",
287
+ "Processing file 2996996-TL-Handwritten-Fax-Labeled-Confidential.pdf\n",
288
+ "Processing file 3468379-DPD-Use-of-Force-Handwritten-Notes.pdf\n",
289
+ "Processing file 4188317-LIST-HANDWRITTEN-SUMMARIES-NOTES.pdf\n",
290
+ "Processing file 339816-jeff-long-notes.pdf\n",
291
+ "Processing file 3864522-Handwritten-letter-in-a-defense-sentencing-memo.pdf\n",
292
+ "Processing file 4183043-HANDWRITTEN-MEMO-SUBJ-ARRIVAL-COVERT-KEY-WEST.pdf\n",
293
+ "Processing file 5002049-Doc-10-09-2018-12-41-31.pdf\n",
294
+ "Processing file 4780536-Jury-Notes-Manafort-Trial.pdf\n",
295
+ "Processing file 2300328-gov-lepage-handwritten-note-on-lmf.pdf\n",
296
+ "Processing file 3002469-Investigation-Notes-Witness-Statements.pdf\n",
297
+ "Processing file 4193148-11111707.pdf\n",
298
+ "Processing file 4598117-2002-Handwritten-Notes-of-City-Meeting.pdf\n",
299
+ "Processing file 4444642-Letter-Chase-Nicholson-s-grandmother.pdf\n",
300
+ "Processing file 2519466-10-10-13-handwritten-list-of-malfunctioning-doors.pdf\n",
301
+ "Processing file 2940920-2002-09-12-Letter-Handwritten-Tony-Blair-to.pdf\n",
302
+ "Processing file 4187525-HANDWRITTEN-NOTE-CUBAN-ACTIVITIES-IN-ARGENTINA.pdf\n",
303
+ "Processing file 3462227-Notes-2.pdf\n",
304
+ "Processing file 2791337-FBI-Handwritten-Notes.pdf\n",
305
+ "Processing file 354332-notes.pdf\n",
306
+ "Processing file 4182092-HANDWRITTEN-PAPER-CONCERNING-PROJECT-ZRRIFLE.pdf\n",
307
+ "Processing file 3766497-Detective-Notes.pdf\n",
308
+ "Processing file 6749500-LEOPOLD-FILES-Bruce-Jessen-Handwritten-Notes.pdf\n",
309
+ "Processing file 3220458-Handwritten-Note.pdf\n",
310
+ "Processing file 4188296-HANDWRITTEN-LIST-OF-14-NAMES.pdf\n",
311
+ "Processing file 804992-fairhope-city-council-resolution-10-14-13.pdf\n",
312
+ "Processing file 2997471-Handwritten-Notes-to-and-From-Gov-LePage.pdf\n",
313
+ "Processing file 4598082-Letter-to-Judge-Schroeder.pdf\n",
314
+ "Processing file 3877562-Mikulcik-Interview-Notes.pdf\n",
315
+ "Processing file 479570-emily-dickinsons-handwritten-poem.pdf\n",
316
+ "Processing file 21108937-pv-handwritten-motion-for-trial-transcript-3215.pdf\n",
317
+ "Processing file 6020949-Aretha-Franklin-Handwritten-Will-1.pdf\n",
318
+ "Processing file 6464722-Fn-22-03252.pdf\n",
319
+ "Processing file 21117526-pv-handwritten-pre-sentencing-memorandum-21312.pdf\n",
320
+ "Processing file 705120-mingle-notes.pdf\n",
321
+ "Processing file 1213307-rialto-unified-holocaust-essays-set-13-part-05.pdf\n",
322
+ "Processing file 4189369-HANDWRITTEN-SHEET-WITH-REPORT-NUMBERS.pdf\n",
323
+ "Processing file 202586-doc20.pdf\n",
324
+ "Processing file 4184368-OPS-NOTES-ON-AMBIDDY.pdf\n",
325
+ "Processing file 393660-fibroscopic-notes-original.pdf\n",
326
+ "Processing file 4182778-HANDWRITTEN-NOTES.pdf\n",
327
+ "Processing file 4187708-DOCUMENT-3-HANDWRITTEN-NOTE.pdf\n",
328
+ "Processing file 702325-snitch-list.pdf\n",
329
+ "Processing file 4180597-HANDWRITTEN-MEMO-RE-IMPLICATION-OF-ALINE-MOSBY.pdf\n",
330
+ "Processing file 1236260-sidney-holters-note.pdf\n",
331
+ "Processing file 3223330-Scan-20111118-200635.pdf\n",
332
+ "Processing file 3223289-Scan-20111116-171028.pdf\n",
333
+ "Processing file 21043867-2021-01-25-mo-sen-eigel-handwritten-note-sb-66-allows-people-to-run-over-protestors.pdf\n",
334
+ "Processing file 3233254-Earl-Bradley-Letter-102516.pdf\n",
335
+ "Processing file 3227804-Cornell-Handwritten-Letters.pdf\n",
336
+ "Processing file 3862956-Handwritten-Note-of-Meeting-With-Howard-Hill.pdf\n",
337
+ "Processing file 803848-elvis-presleys-letter-to-richard-nixon.pdf\n",
338
+ "Processing file 3223331-Scan-20111118-200718.pdf\n",
339
+ "Processing file 202599-doc33.pdf\n",
340
+ "Processing file 5690139-Child-s-Australia-Day-Letter.pdf\n",
341
+ "Processing file 4182800-HANDWRITTEN-MEMO-ON-HUNT-AND-HIS-USE-OF-A-PEN-NAME.pdf\n",
342
+ "Processing file 3223399-Scan-20111125-141919.pdf\n",
343
+ "Processing file 3521430-IMG-4827.pdf\n",
344
+ "Processing file 4420170-Mullkoff-Note.pdf\n",
345
+ "Processing file 4185023-HANDWRITTEN-NOTE-RE-FILING-DOCUMENTS-IN-DIAZ-L-S.pdf\n",
346
+ "Processing file 2746630-Cardenas-notes.pdf\n",
347
+ "Processing file 1086683-hand-written-notes.pdf\n",
348
+ "Processing file 5190857-WasteWS-07062018-120826-HANDWRITTEN-NO-OCR.pdf\n",
349
+ "Processing file 4189400-HANDWRITTEN-CARDS-RESEARCH-DEPARTMENT-MEMORANDUM.pdf\n",
350
+ "Processing file 3223321-Scan-20111118-195438.pdf\n",
351
+ "Processing file 4328038-Correspondence-From-John-Crowley-to-Scott.pdf\n",
352
+ "Processing file 5018573-Chad-Notes.pdf\n",
353
+ "Processing file 21109051-pv-handwritten-letter-requesting-new-federal-defender-counsel-41212.pdf\n",
354
+ "Processing file 5316881-Journal-Entries.pdf\n",
355
+ "Processing file 4191532-OFFICE-NOTES-RE-AGENTS.pdf\n",
356
+ "Processing file 1213274-rialto-unified-holocaust-essays-set-01-part-01.pdf\n",
357
+ "Processing file 4490103-Gergely-Interview-Notes.pdf\n",
358
+ "Processing file 2708465-Brandon-Astor-Jones-Timeline-HW.pdf\n",
359
+ "Processing file 3462228-notes-3.pdf\n",
360
+ "Processing file 528418-handwritten-note-from-april-3-2001.pdf\n",
361
+ "Processing file 4185802-HANDWRITTEN-NOTES-MARY-CHECKING-FOR-PERTINENT.pdf\n",
362
+ "Processing file 3011003-Richard-Bain-s-handwritten-account-of-Sept-4-2012.pdf\n",
363
+ "Processing file 5002048-Doc-10-09-2018-12-42-54.pdf\n",
364
+ "Processing file 1372097-applebee-valuch-notes.pdf\n",
365
+ "Processing file 705119-davies-notes.pdf\n",
366
+ "Processing file 3237222-Notes-1216.pdf\n",
367
+ "Processing file 3718215-PP-D0414.pdf\n",
368
+ "Processing file 4187089-LECTURE-AT-FARM.pdf\n",
369
+ "Processing file 3223531-Scan-20111202-194427.pdf\n",
370
+ "Processing file 4450893-Lithuanian-Extradition-Request-for-Release.pdf\n",
371
+ "Processing file 4193146-11111705.pdf\n",
372
+ "Processing file 526429-east-coast-rapist-suspects-apology-letter.pdf\n",
373
+ "Processing file 4181294-HANDWRITTEN-LIST-DDO-FILE-REQUESTS.pdf\n",
374
+ "Processing file 5764377-Handwritten-Notes.pdf\n",
375
+ "Processing file 4183714-HANDWRITTEN-NOTE-RE-RICHARD-GIBSON-ISSUE.pdf\n",
376
+ "Processing file 6393576-Lawson-Letter.pdf\n",
377
+ "Processing file 1283645-dearest-celeste.pdf\n",
378
+ "Processing file 2825042-Mathes-Handwritten-Notes.pdf\n",
379
+ "Processing file 202597-doc31.pdf\n",
380
+ "Processing file 21011336-foxs-hand-drawn-map.pdf\n",
381
+ "Processing file 339815-jon-fagg-notes.pdf\n",
382
+ "Processing file 1336598-hartmannnotes.pdf\n",
383
+ "Processing file 2650353-1983-10-22-F4-Handritten.pdf\n",
384
+ "Processing file 282179-july-14-1995-mladic-diary-handwritten.pdf\n",
385
+ "Processing file 2271360-hartfield-handwritten-writ.pdf\n",
386
+ "Processing file 4178930-HANDWRITTEN-DOC-SUMMARIES-ILLEGIBLE.pdf\n",
387
+ "Processing file 5096174-Handwritten-Minutes.pdf\n",
388
+ "Processing file 6382284-Letter-Ferrell-Scott.pdf\n",
389
+ "Processing file 2474354-oland-statement-written.pdf\n",
390
+ "Processing file 1306277-4-9-22-11-hanah-cho.pdf\n",
391
+ "Processing file 3223449-Scan-20111125-145718.pdf\n",
392
+ "Processing file 2517459-bill-clintons-handwritten-speech-from-the.pdf\n",
393
+ "Processing file 3223329-Scan-20111118-200524.pdf\n",
394
+ "Processing file 4179732-INVENTORY-HANDWRITTEN.pdf\n",
395
+ "Processing file 3223447-Scan-20111125-145513.pdf\n",
396
+ "Processing file 786278-frederick-pabsts-handwritten-will.pdf\n",
397
+ "Processing file 1378369-bricker-doc.pdf\n",
398
+ "Processing file 1371011-witness-40-journal-entry.pdf\n",
399
+ "Processing file 4191654-HANDWRITTEN-NOTES-ON-CIA-AND-FBI-DOCUMENTS.pdf\n",
400
+ "Processing file 803812-iggy-pops-letter-to-a-fan-1995.pdf\n",
401
+ "Processing file 3223444-Scan-20111125-145344.pdf\n",
402
+ "Processing file 4193147-11111706.pdf\n",
403
+ "Processing file 2504031-joseph-brennick-handwritten-memos.pdf\n",
404
+ "Processing file 21108969-pvs-amended-handwritten-41-page-motion-for-reconsideration-en-banc-11317.pdf\n",
405
+ "Processing file 5190870-WasteWS-07062018-121609-HANDWRITTEN-NO-OCR-Pdf.pdf\n",
406
+ "Processing file 3223420-Scan-20111125-142900.pdf\n",
407
+ "Processing file 555087-vh-defrock-letter.pdf\n",
408
+ "Processing file 5002051-Doc-10-09-2018-12-39-47.pdf\n",
409
+ "Processing file 5634528-181217-Flynn-Fbi-Notes.pdf\n",
410
+ "Processing file 5025013-J-R-Thomas-response.pdf\n",
411
+ "Processing file 3223527-Scan-20111202-190234.pdf\n",
412
+ "Processing file 4184490-HANDWRITTEN-BERNARD-BARKER-CHRONOLOGY.pdf\n",
413
+ "Processing file 4911037-Manuel-Orrego-Savala-Letter.pdf\n",
414
+ "Processing file 6571779-Frederick-Veal-Interview-Notes.pdf\n",
415
+ "Processing file 1096680-alexandra-hollinghurst-notes.pdf\n",
416
+ "Processing file 5780462-GGGW-v-Schwitzer-Handwritten-Ex-Staffer-Stmt.pdf\n",
417
+ "Processing file 321762-gina-hutchinson-journal-entry.pdf\n",
418
+ "Processing file 566855-inslee-letter.pdf\n",
419
+ "Processing file 2811636-Callis-Handwritten-Letter-1999-13-09.pdf\n",
420
+ "Processing file 803827-a-mothers-letter-to-the-foundling-asylum.pdf\n",
421
+ "Processing file 4704949-Bechtel-Notes.pdf\n",
422
+ "Processing file 6771659-Preston-Handwritten-Response.pdf\n",
423
+ "Processing file 776417-helios-notes.pdf\n",
424
+ "Processing file 4191914-FORM-MEMORANDUM-FOR-THE-RECORD-HARVEY-LEE-OSWALD.pdf\n",
425
+ "Processing file 3462229-Notes-1.pdf\n",
426
+ "Processing file 21030480-doj-notes.pdf\n",
427
+ "Processing file 1658584-1admission-notes-redacted.pdf\n",
428
+ "Processing file 1668278-doc066.pdf\n",
429
+ "Processing file 5332373-1-AEA-2080-NOTES-HANDWRITTEN-CHRONOLOGY-EVENTS.pdf\n",
430
+ "Processing file 528420-handwritten-notes-of-meeting-with-cori-june-26.pdf\n",
431
+ "Processing file 2777703-AV-Interview-Notes.pdf\n",
432
+ "Processing file 2163779-skm-c224e15070214420-pdf-handwritten-notes-about.pdf\n",
433
+ "Processing file 266726-boogaard-journal.pdf\n"
434
+ ]
435
+ }
436
+ ],
437
+ "source": [
438
+ "process_training_data()"
439
+ ]
440
+ }
441
+ ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python (hydra)",
+    "language": "python",
+    "name": "hydra"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.13.1"
+   },
+   "widgets": {
+    "application/vnd.jupyter.widget-state+json": {
+     "state": {},
+     "version_major": 2,
+     "version_minor": 0
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
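The notebook writes one parquet file per document-type folder directly under data/. A short sketch (assuming that layout; not part of the commit) for loading the results back for inspection:

import pandas as pd
from pathlib import Path

# Gather every per-doctype parquet written by process_training_data()
frames = [pd.read_parquet(p) for p in Path('./data').glob('*.parquet')]
pages_df = pd.concat(frames, ignore_index=True)

# Each row is one rendered page: filepath, width, height, raw bytes and label
print(pages_df['label'].value_counts())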
pre_processing.py ADDED
@@ -0,0 +1,111 @@
+ # -*- coding: utf-8 -*-
+ import os
+ import pandas as pd
+ import pdf2image as p2i
+ import pytesseract
+ from os import path
+ from PIL import Image
+ from typing import List, Tuple
+ from transformers import BertTokenizer
+ from constants import (RAW_DATA_DIR,
+                        PROCESSED_DATA_DIR,
+                        METADATA_FILEPATH,
+                        BERT_BASE,
+                        MAX_SEQUENCE_LENGHT,
+                        FilePath,
+                        PageMetadata)
+
+ # Allow for unlimited image size, some documents are pretty big...
+ Image.MAX_IMAGE_PIXELS = None
+
+
+ def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
+     out_dirname = path.join(PROCESSED_DATA_DIR, label)
+     os.makedirs(out_dirname, exist_ok=True)
+     out_filename = path.join(out_dirname, f'{basename}_{page_index}')
+
+     out_img_filepath = f'{out_filename}.jpg'
+     out_txt_filepath = f'{out_filename}.txt'
+
+     return out_img_filepath, out_txt_filepath
+
+
+ def tokenize_text(text: str) -> Tuple[List[int], List[int]]:
+     tokenizer = BertTokenizer.from_pretrained(BERT_BASE)
+     tokenized = tokenizer(
+         text,
+         padding=True,
+         truncation=True,
+         max_length=MAX_SEQUENCE_LENGHT,
+     )
+
+     return tokenized['input_ids'], tokenized['attention_mask']
+
+
+ def process_pdf_file(pdf_filepath: FilePath):
+     if path.getsize(pdf_filepath) == 0:
+         # TODO: substitute for logging
+         print(f'{pdf_filepath} is empty, skipping')
+         return []
+
+     pages: List[Image] = p2i.convert_from_path(pdf_filepath)
+     pages_metadata: List[PageMetadata] = []
+
+     root_dir, doctype = path.split(path.dirname(pdf_filepath))
+     base_filename = path.basename(path.splitext(pdf_filepath)[0])
+     for page_i, page in enumerate(pages):
+         label = 'other'
+         if page_i == 0:
+             label = doctype
+         # If the document only has one page, override the label with the last-page variant
+         if page_i == len(pages) - 1:
+             label = f'{doctype}-last'
+
+         out_img_filepath, out_txt_filepath = make_page_filepaths(base_filename, label, page_i)
+
+         page.save(out_img_filepath)
+
+         ocr_text = pytesseract.image_to_string(page)
+         input_ids, attention_mask = tokenize_text(ocr_text)
+         with open(out_txt_filepath, 'w') as out_txt_file:
+             out_txt_file.write(ocr_text)
+
+         pages_metadata.append({
+             'page_number': page_i + 1,
+             'pdf_filepath': path.relpath(pdf_filepath, start='.'),
+             'img_filepath': out_img_filepath,
+             'txt_filepath': out_txt_filepath,
+             # 'text_tokens': tokens,
+             'width': page.width,
+             'height': page.height,
+             'label': label,
+         })
+
+     return pages_metadata
+
+
+ def process_training_data() -> pd.DataFrame:
+     pages_metadata: List[List[PageMetadata]] = []
+
+     for dirname, _, files in os.walk(RAW_DATA_DIR):
+         if path.samefile(dirname, RAW_DATA_DIR):
+             continue
+
+         print(f'Processing folder {dirname}')
+
+         for filename in files:
+             _, ext = path.splitext(filename)
+
+             # Avoid processing non-document files
+             if ext.lower() == '.pdf':
+                 print(f'Processing file {filename}')
+                 pdf_filepath = path.join(dirname, filename)
+                 pages_metadata.extend(process_pdf_file(pdf_filepath))
+
+     pages_metadata_df = pd.DataFrame(pages_metadata)
+     print(f'Writing metadata to {METADATA_FILEPATH}')
+     pages_metadata_df.to_csv(METADATA_FILEPATH, index=False)
+     return pages_metadata_df
+
+
+ process_training_data()
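After process_training_data() runs, every page ends up as a .jpg/.txt pair under data/processed/<label>/ and is indexed in data/metadata.csv. A quick sanity check over that index (a sketch, assuming the paths from constants.py):

import pandas as pd
from constants import METADATA_FILEPATH

metadata_df = pd.read_csv(METADATA_FILEPATH)

# Columns written above: page_number, pdf_filepath, img_filepath,
# txt_filepath, width, height, label
print(metadata_df[['page_number', 'img_filepath', 'label']].head())
print(metadata_df['label'].value_counts())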
requirements.txt ADDED
@@ -0,0 +1,26 @@
+ argcomplete==3.3.0
+ astunparse==1.6.3
+ gast==0.6.0
+ kagglehub==0.3.6
+ keras-hub==0.18.1
+ libclang==18.1.1
+ Markdown==3.7
+ markdown-it-py==3.0.0
+ numpy==2.0.2
+ opt_einsum==3.4.0
+ optree==0.14.0
+ packaging==24.2
+ pip-autoremove==0.10.0
+ pipx==1.5.0
+ platformdirs==4.2.2
+ protobuf==5.29.3
+ requests==2.32.3
+ setuptools==75.8.0
+ six==1.17.0
+ tensorboard==2.18.0
+ tensorflow-text==2.18.1
+ tf_keras==2.18.0
+ typing_extensions==4.12.2
+ userpath==1.9.2
+ Werkzeug==3.1.3
+ wheel==0.44.0
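These pins cover the TensorFlow/Keras side of the stack used by training.py (tf_keras, tensorflow-text, tensorboard); note that transformers, which both pre_processing.py and training.py import, is not pinned here and would need to be installed separately.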
training.py ADDED
@@ -0,0 +1,214 @@
+ import pandas as pd
+ import tensorflow as tf
+ import tf_keras as keras
+ from constants import (PROCESSED_DATA_DIR,
+                        METADATA_FILEPATH,
+                        BATCH_SIZE,
+                        EPOCHS,
+                        BERT_BASE,
+                        MAX_SEQUENCE_LENGHT,
+                        FilePath,
+                        PageMetadata,
+                        ImageSize,
+                        ImageInputShape)
+ from pandera.typing import DataFrame
+ from typing import Tuple, List
+ from transformers import TFBertModel
+ from tf_keras import layers, models
+ from PIL import Image
+
+ # Allow for unlimited image size, some documents are pretty big...
+ Image.MAX_IMAGE_PIXELS = None
+
+
+ def stratified_split(
+     df: pd.DataFrame,
+     train_frac: float,
+     val_frac: float,
+     test_frac: float,
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+     train_dfs, val_dfs, test_dfs = [], [], []
+
+     for label, group in df.groupby('label'):
+         n = len(group)
+         train_end = int(n * train_frac)
+         val_end = train_end + int(n * val_frac)
+
+         train_dfs.append(group.iloc[:train_end])
+         val_dfs.append(group.iloc[train_end:val_end])
+         test_dfs.append(group.iloc[val_end:])
+
+     train_df = pd.concat(train_dfs).reset_index(drop=True)
+     val_df = pd.concat(val_dfs).reset_index(drop=True)
+     test_df = pd.concat(test_dfs).reset_index(drop=True)
+
+     return train_df, val_df, test_df
+
+
+ def dataset_from_dataframe(df: pd.DataFrame) -> tf.data.Dataset:
+     return tf.data.Dataset.from_tensor_slices((
+         df['img_filepath'].values,
+         df['input_ids'].values,
+         df['attention_mask'].values,
+         df['label'].values,
+     ))
+
+
+ def load_image(image_path: FilePath, image_size: ImageSize) -> Image:
+     img_width, img_height = image_size
+
+     # Load image
+     image = tf.io.read_file(image_path)
+     image = tf.image.decode_jpeg(image, channels=3)
+     image = tf.image.resize(image, [img_width, img_height])
+     image /= 255.0
+
+     return image
+
+
+ def prepare_dataset(
+     ds: tf.data.Dataset,
+     image_size: ImageSize,
+     batch_size=32,
+     buffer_size=1000
+ ) -> tf.data.Dataset:
+     def load_image_and_format_tensor_shape(
+         img_path: FilePath,
+         input_ids: List[int],
+         attention_mask: List[int],
+         label: str
+     ):
+         image = load_image(img_path, image_size)
+         return ((image, input_ids, attention_mask), label)
+
+     return ds.map(
+         load_image_and_format_tensor_shape,
+         num_parallel_calls=tf.data.experimental.AUTOTUNE,
+     ) \
+         .shuffle(buffer_size=buffer_size) \
+         .batch(batch_size) \
+         .prefetch(tf.data.experimental.AUTOTUNE)
+
+
+ metadata_df: DataFrame[PageMetadata] = pd.read_csv(METADATA_FILEPATH)
+ metadata_df = metadata_df.sample(n=50, random_state=42)
+
+ median_height = int(metadata_df['height'].median())
+ median_width = int(metadata_df['width'].median())
+ img_size: ImageSize = (median_height, median_width)
+ img_input_shape: ImageInputShape = img_size + (3,)
+
+ label_names: List[str] = sorted(
+     [d.name for d in PROCESSED_DATA_DIR.iterdir() if d.is_dir()]
+ )
+ num_classes = len(label_names)
+
+ print('Splitting the DataFrame into training, validation and test')
+ train_df, val_df, test_df = stratified_split(
+     metadata_df,
+     train_frac=0.7,
+     val_frac=0.15,
+     test_frac=0.15,
+ )
+
+ print('Batching and shuffling the datasets')
+ train_ds = dataset_from_dataframe(train_df)
+ train_ds = prepare_dataset(train_ds, img_size, batch_size=BATCH_SIZE)
+
+ val_ds = dataset_from_dataframe(val_df)
+ val_ds = prepare_dataset(val_ds, img_size, batch_size=BATCH_SIZE)
+
+ test_ds = dataset_from_dataframe(test_df)
+ test_ds = prepare_dataset(test_ds, img_size, batch_size=BATCH_SIZE)
+
+
+ def build_image_model(input_shape: ImageInputShape) -> keras.Model:
+     img_model = models.Sequential([
+         layers.Input(shape=input_shape),
+         layers.Conv2D(32, (3, 3), activation='relu'),
+         layers.MaxPooling2D((2, 2)),
+         layers.Conv2D(64, (3, 3), activation='relu'),
+         layers.MaxPooling2D((2, 2)),
+         layers.Conv2D(128, (3, 3), activation='relu'),
+         layers.MaxPooling2D((2, 2)),
+         layers.Conv2D(128, (3, 3), activation='relu'),
+         layers.MaxPooling2D((2, 2)),
+         layers.Flatten(),
+         layers.Dense(512, activation='relu'),
+     ], name='image_classification')
+
+     img_model.summary()
+     return img_model
+
+
+ def build_text_model() -> keras.Model:
+     bert_model = TFBertModel.from_pretrained(BERT_BASE)
+
+     input_ids = layers.Input(
+         shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='input_ids'
+     )
+     attention_mask = layers.Input(
+         shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='attention_mask'
+     )
+
+     # The second element of the BERT output is the pooled output i.e. the
+     # representation of the [CLS] token
+     outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)[1]
+
+     text_model = models.Model(
+         inputs=[input_ids, attention_mask],
+         outputs=outputs,
+         name='bert'
+     )
+     text_model.summary()
+
+     return text_model
+
+
+ def build_multimodal_model(
+     num_classes: int,
+     img_input_shape: ImageInputShape
+ ) -> keras.Model:
+     img_model = build_image_model(img_input_shape)
+     text_model = build_text_model()
+
+     img_input = layers.Input(shape=img_input_shape, name='img_input')
+     text_input_ids = layers.Input(
+         shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='text_input_ids'
+     )
+     text_input_mask = layers.Input(
+         shape=(MAX_SEQUENCE_LENGHT,), dtype=tf.int32, name='text_input_mask'
+     )
+
+     img_features = img_model(img_input)
+     text_features = text_model([text_input_ids, text_input_mask])
+
+     classification_layers = keras.Sequential([
+         tf.keras.layers.Dense(512, activation='relu'),
+         tf.keras.layers.Dense(num_classes, activation='softmax'),
+     ], name='classification_layers')
+     concat_features = layers.concatenate([img_features, text_features],
+                                          name='concatenate_features')
+     outputs = classification_layers(concat_features)
+
+     multimodal_model = models.Model(
+         inputs=[img_input, text_input_ids, text_input_mask],
+         outputs=outputs,
+         name='multimodal_document_page_classifier'
+     )
+     return multimodal_model
+
+
+ multimodal_model = build_multimodal_model(num_classes, img_input_shape)
+ multimodal_model.summary()
+ multimodal_model.compile(
+     optimizer='adam',
+     loss='sparse_categorical_crossentropy',
+     metrics=['accuracy']
+ )
+ multimodal_model.fit(
+     train_ds,
+     epochs=EPOCHS,
+     batch_size=BATCH_SIZE,
+     validation_data=val_ds,
+ )
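training.py builds train/validation/test splits but only fits on train_ds and val_ds; test_ds and MODEL_DIR are left unused. A possible follow-up step (a sketch, not part of this commit) to evaluate and persist the trained model:

from constants import MODEL_DIR

# Hypothetical follow-up: score the held-out split and save the trained model.
test_loss, test_accuracy = multimodal_model.evaluate(test_ds)
print(f'Test accuracy: {test_accuracy:.3f}')

MODEL_DIR.mkdir(parents=True, exist_ok=True)
multimodal_model.save(str(MODEL_DIR / 'multimodal_document_page_classifier.keras'))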