{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "63d95b8f-7559-4a3d-b025-5bb788914dc9", "metadata": {}, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import pandas as pd\n", "import pdf2image as p2i\n", "from os import path\n", "from pathlib import Path\n", "from PIL import Image\n", "\n", "# Allow for unlimited image size, some documents are pretty big...\n", "Image.MAX_IMAGE_PIXELS = None\n", "\n", "\n", "def process_pdf_document(filepath):\n", " if path.getsize(filepath) == 0:\n", " # TODO: substitute for logging\n", " print(f'{filepath} is empty, skipping')\n", " return []\n", "\n", " pages = p2i.convert_from_path(filepath)\n", " processed_pages: list[dict] = []\n", "\n", " label = 'other'\n", " root_dir, doctype = Path(filepath).parts[:2]\n", " for page_i, page in enumerate(pages):\n", " if page_i == 0:\n", " label = doctype\n", " elif page_i == len(pages) - 1:\n", " label = f'{label}-last'\n", "\n", " processed_pages.append({\n", " 'filepath': filepath,\n", " 'width': page.width,\n", " 'height': page.height,\n", " 'bytes': page.tobytes(),\n", " 'label': label,\n", " })\n", "\n", " return processed_pages\n", "\n", "\n", "def process_training_data():\n", " data_dir = Path('./data')\n", "\n", " for dirname, _, files in os.walk(data_dir):\n", " print(f'Processing folder {dirname}')\n", " doctype = path.basename(dirname)\n", " df = pd.DataFrame()\n", "\n", " for filename in files:\n", " print(f'Processing file {filename}')\n", " filepath = path.join(dirname, filename)\n", " _, ext = path.splitext(filepath)\n", "\n", " if ext.lower() == '.pdf':\n", " processed_pages = process_pdf_document(filepath)\n", " df = pd.concat([df, pd.DataFrame(processed_pages)], ignore_index=True)\n", "\n", " parquet_filepath = path.join(data_dir, f'{doctype}.parquet')\n", " print(f'Saving data for {doctype} in {parquet_filepath}')\n", " print(df)\n", " df.to_parquet(parquet_filepath)" ] }, { "cell_type": "code", "execution_count": null, "id": "1efb8e82-92bc-46dc-9553-cdc712211c93", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing folder data\n", "Processing file data.parquet\n", "Processing file .DS_Store\n", "Saving data for data in data/data.parquet\n", "Empty DataFrame\n", "Columns: []\n", "Index: []\n", "Processing folder data/form\n", "Processing file 1384753-request-form.pdf\n", "Processing file 772791-12_sipc1301f_peer-individual-evaluation-form.pdf\n", "Processing file 6428556-Cassatt-Form-D-2008.pdf\n", "Processing file 3895717-INFORMATION-DISCLOSURE-REQUEST-FORM.pdf\n", "Processing file 4321053-document-21556514.pdf\n", "Processing file 4575531-REDACTED-PIW-CR285646.pdf\n", "Processing file 6155510-Form-I.pdf\n", "Processing file 3885938-JayaServicesLtd-Appointment-15112002.pdf\n", "Processing file 4950830-Cook1995.pdf\n", "Processing file 6368627-2013-Southern-Company-990.pdf\n", "Processing file 4436484-Columbia-Armed-Guard-application-form.pdf\n", "Processing file 21074299-company-application-form-for-dominicana-acquisition-sa.pdf\n", "Processing file 746365-wa-madsen-barbara-2012.pdf\n", "Processing file 5770184-CAMCF-RFYL-Registration-Form.pdf\n", "Processing file 1683073-medication-info-form-eng-revised-2014.pdf\n", "Processing file 4423347-Request-Form.pdf\n", "Processing file 3022163-Event-Form.pdf\n", "Processing file 1685242-cv-fresh-entry-form.pdf\n", "Processing file 4910987-Records-Request-Form.pdf\n", "Processing file 4349763-DOD-Form-254.pdf\n", "Processing file 3885273-EIMEuropeanInvestmentsManagement-ChangeofName.pdf\n", "Processing file 4951858-Ober3.pdf\n", "Processing file 7034791-UOF-form.pdf\n", "Processing file 6206290-Thunderbird-GBC1-Application-Form.pdf\n", "Processing file 465336-disclosure-form-13491099365164-_-pdf.pdf\n", "Processing file 6538004-Aderholt-Travel-Disclosure-Form.pdf\n", "Processing file 1292759-host-accept-form-attachment.pdf\n", "Processing file 1238787-request-form.pdf\n", "Processing file 6173981-Cert-of-ID-form.pdf\n", "Processing file 3220944-Sitton-July-2015-Campaign-Finance-Report.pdf\n", "Processing file 6882316-Request-Form.pdf\n", "Processing file 4111213-Coffman-Election-Filings.pdf\n", "Processing file 4773259-RECORDS-REQUEST-FORM.pdf\n", "Processing file 3116806-Black-River-Technical-College-Request-for-C-23.pdf\n", "Processing file 761264-13_ee402a01_expository-essay-interpretation-of.pdf\n", "Processing file 5955192-North-Dakota-corn-dogs.pdf\n", "Processing file 6882816-Disaster-Relief-Cash-Assistance-Application-Form.pdf\n", "Processing file 1303092-appeal-form.pdf\n", "Processing file 3224254-CCT-Holidayhope-2016-Form.pdf\n", "Processing file 528353-disclosure-form-13512855487444-_-pdf.pdf\n", "Processing file 696442-nms-timeline-complaint.pdf\n", "Processing file 842767-shallotte-assisted-living-penalty-packet-4.pdf\n", "Processing file 4484638-GENERAL-APRA-form-2018.pdf\n", "Processing file 20393262-margaux-keiser-form-460.pdf\n", "Processing file 6785703-Release-form.pdf\n", "Processing file 1683896-rtk-form-16241-pdf.pdf\n", "Processing file 3234225-Philadelphia-Form-30401-pdf.pdf\n", "Processing file 442530-collect-files-35870-political-file-2012-non.pdf\n", "Processing file 4108784-2012-FSA-Enrollment-Form.pdf\n", "Processing file 2426599-opega-review-request-form.pdf\n", "Processing file 20429474-caserta-evidence-list.pdf\n", "Processing file 7001493-ABC-Trailblazer-Application-Form-2020-21.pdf\n", "Processing file 6310093-Registration.pdf\n", "Processing file 21085990-mcguire-3q21.pdf\n", "Processing file 6879322-Form-8871.pdf\n", "Processing file 21182127-20220118-docketing-statement-b.pdf\n", "Processing file 5518517-Corbett-email-PRR-017155-pdf.pdf\n", "Processing file 4058754-Records-Release-pdf.pdf\n", "Processing file 1386200-annual-disclosure-request-form.pdf\n", "Processing file 7202517-Photo-Release-Form.pdf\n", "Processing file 3860778-810A-Evidence-Submission-Form.pdf\n", "Processing file 5899836-Medford-Tp-summary-form.pdf\n", "Processing file 5776714-CRID-1046835.pdf\n", "Processing file 6536846-Transcript-Request.pdf\n", "Processing file 7041210-Development-Application-Form.pdf\n", "Processing file 466205-wa-united-for-marriage-avails-request-form.pdf\n", "Processing file 7000910-NCFPD-CORA-Request-Form.pdf\n", "Processing file 21884855-dos-22-04-051.pdf\n", "data/form/21884855-dos-22-04-051.pdf is empty, skipping\n", "Saving data for form in data/form.parquet\n", " filepath width height \\\n", "0 data/form/1384753-request-form.pdf 1700 2200 \n", "1 data/form/772791-12_sipc1301f_peer-individual-... 1700 2200 \n", "2 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n", "3 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n", "4 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n", ".. ... ... ... \n", "618 data/form/7041210-Development-Application-Form... 1700 2200 \n", "619 data/form/7041210-Development-Application-Form... 1700 2200 \n", "620 data/form/7041210-Development-Application-Form... 1700 2200 \n", "621 data/form/466205-wa-united-for-marriage-avails... 1700 2200 \n", "622 data/form/7000910-NCFPD-CORA-Request-Form.pdf 1700 2200 \n", "\n", " bytes label \n", "0 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n", "1 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n", "2 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n", "3 b'\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe... form \n", "4 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n", ".. ... ... \n", "618 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n", "619 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n", "620 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form-last \n", "621 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n", "622 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n", "\n", "[623 rows x 5 columns]\n", "Processing folder data/scientific publication\n", "Processing file 6425893-Oxford-Journals-Article.pdf\n", "Processing file websearch_7_00001_scientific_publication.pdf\n", "Processing file websearch_7_00017_scientific_publication.pdf\n", "Processing file websearch_7_00022_scientific_publication.pdf\n", "Processing file 3923078-Paper-1.pdf\n", "Processing file websearch_7_00025_scientific_publication.pdf\n", "Processing file websearch_7_00010_scientific_publication.pdf\n", "Processing file 4247250-JACOBSON-Original-Article.pdf\n", "Processing file websearch_7_00006_scientific_publication.pdf\n", "Processing file 3559070-McCance-Katz-Et-Al-2017-the-American-Journal-on.pdf\n", "Processing file websearch_7_00011_scientific_publication.pdf\n", "Processing file websearch_7_00024_scientific_publication.pdf\n", "Processing file websearch_7_00007_scientific_publication.pdf\n", "Processing file websearch_7_00000_scientific_publication.pdf\n", "Processing file websearch_7_00023_scientific_publication.pdf\n", "Processing file 21085273-wood-schulman-article.pdf\n", "Processing file websearch_7_00016_scientific_publication.pdf\n", "Processing file 2189923-lancet-article-from-1989.pdf\n", "Processing file 7223327-Forensic-Article.pdf\n", "Processing file websearch_7_00009_scientific_publication.pdf\n", "Processing file websearch_7_00005_scientific_publication.pdf\n", "Processing file 6429770-Anderson2012-Article.pdf\n", "Processing file websearch_7_00013_scientific_publication.pdf\n", "Processing file websearch_7_00026_scientific_publication.pdf\n", "Processing file websearch_7_00021_scientific_publication.pdf\n", "Processing file websearch_7_00018_scientific_publication.pdf\n", "Processing file websearch_7_00014_scientific_publication.pdf\n", "Processing file websearch_7_00002_scientific_publication.pdf\n", "Processing file websearch_7_00019_scientific_publication.pdf\n", "Processing file websearch_7_00015_scientific_publication.pdf\n", "Processing file websearch_7_00020_scientific_publication.pdf\n", "Processing file websearch_7_00003_scientific_publication.pdf\n", "Processing file 5980147-Venu-B-Article-1.pdf\n", "Processing file 20519853-2007-recommendations-for-medical-management-of-adult-lead-exposure.pdf\n", "Processing file 4343429-Meltdown-paper.pdf\n", "Processing file 785243-paper.pdf\n", "Processing file websearch_7_00008_scientific_publication.pdf\n", "Processing file websearch_7_00004_scientific_publication.pdf\n", "Processing file websearch_7_00012_scientific_publication.pdf\n", "Saving data for scientific publication in data/scientific publication.parquet\n", " filepath width height \\\n", "0 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n", "1 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n", "2 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n", "3 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n", "4 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n", ".. ... ... ... \n", "532 data/scientific publication/websearch_7_00012_... 1700 2200 \n", "533 data/scientific publication/websearch_7_00012_... 1700 2200 \n", "534 data/scientific publication/websearch_7_00012_... 1700 2200 \n", "535 data/scientific publication/websearch_7_00012_... 1700 2200 \n", "536 data/scientific publication/websearch_7_00012_... 1700 2200 \n", "\n", " bytes \\\n", "0 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n", "1 b'\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe... \n", "2 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", "3 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n", "4 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", ".. ... \n", "532 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", "533 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", "534 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", "535 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", "536 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n", "\n", " label \n", "0 scientific publication \n", "1 scientific publication \n", "2 scientific publication \n", "3 scientific publication \n", "4 scientific publication \n", ".. ... \n", "532 scientific publication \n", "533 scientific publication \n", "534 scientific publication \n", "535 scientific publication \n", "536 scientific publication-last \n", "\n", "[537 rows x 5 columns]\n", "Processing folder data/handwritten\n", "Processing file 3223324-Scan-20111118-195759.pdf\n", "Processing file 20743971-handwritten-statement.pdf\n", "Processing file 21030538-doj-donoghue-notes.pdf\n", "Processing file 519547-sere-psychologist-and-bush-admin-torture.pdf\n", "Processing file 6240196-Zenith-Rathfelder-Notes.pdf\n", "Processing file 1630443-nick-stoneman-notes.pdf\n", "Processing file 4182029-HANDWRITTEN-COMPILATION-OF-COLLECTION.pdf\n", "Processing file 528421-handwritten-notes-on-financial-offer-from-orders.pdf\n", "Processing file 5498003-1996-08-11-Handwritten-Law-Enforcement-Notes.pdf\n", "Processing file 202587-doc21.pdf\n", "Processing file 5190901-WasteWS-07062018-145153-Handwritten-Letter-NO-OCR.pdf\n", "Processing file 625990-thatcher-mt-notes.pdf\n", "Processing file 782204-loeser-will.pdf\n", "Processing file 2189928-handwritten-agreement.pdf\n", "Processing file 1223362-gauthe-clinical-notes.pdf\n", "Processing file 4190352-MEMORANDUM-HIDALGO-BALMES-HANDWRITTEN.pdf\n", "Processing file 6939323-7-8-11-83-Handwritten-Notes.pdf\n", "Processing file 2996996-TL-Handwritten-Fax-Labeled-Confidential.pdf\n", "Processing file 3468379-DPD-Use-of-Force-Handwritten-Notes.pdf\n", "Processing file 4188317-LIST-HANDWRITTEN-SUMMARIES-NOTES.pdf\n", "Processing file 339816-jeff-long-notes.pdf\n", "Processing file 3864522-Handwritten-letter-in-a-defense-sentencing-memo.pdf\n", "Processing file 4183043-HANDWRITTEN-MEMO-SUBJ-ARRIVAL-COVERT-KEY-WEST.pdf\n", "Processing file 5002049-Doc-10-09-2018-12-41-31.pdf\n", "Processing file 4780536-Jury-Notes-Manafort-Trial.pdf\n", "Processing file 2300328-gov-lepage-handwritten-note-on-lmf.pdf\n", "Processing file 3002469-Investigation-Notes-Witness-Statements.pdf\n", "Processing file 4193148-11111707.pdf\n", "Processing file 4598117-2002-Handwritten-Notes-of-City-Meeting.pdf\n", "Processing file 4444642-Letter-Chase-Nicholson-s-grandmother.pdf\n", "Processing file 2519466-10-10-13-handwritten-list-of-malfunctioning-doors.pdf\n", "Processing file 2940920-2002-09-12-Letter-Handwritten-Tony-Blair-to.pdf\n", "Processing file 4187525-HANDWRITTEN-NOTE-CUBAN-ACTIVITIES-IN-ARGENTINA.pdf\n", "Processing file 3462227-Notes-2.pdf\n", "Processing file 2791337-FBI-Handwritten-Notes.pdf\n", "Processing file 354332-notes.pdf\n", "Processing file 4182092-HANDWRITTEN-PAPER-CONCERNING-PROJECT-ZRRIFLE.pdf\n", "Processing file 3766497-Detective-Notes.pdf\n", "Processing file 6749500-LEOPOLD-FILES-Bruce-Jessen-Handwritten-Notes.pdf\n", "Processing file 3220458-Handwritten-Note.pdf\n", "Processing file 4188296-HANDWRITTEN-LIST-OF-14-NAMES.pdf\n", "Processing file 804992-fairhope-city-council-resolution-10-14-13.pdf\n", "Processing file 2997471-Handwritten-Notes-to-and-From-Gov-LePage.pdf\n", "Processing file 4598082-Letter-to-Judge-Schroeder.pdf\n", "Processing file 3877562-Mikulcik-Interview-Notes.pdf\n", "Processing file 479570-emily-dickinsons-handwritten-poem.pdf\n", "Processing file 21108937-pv-handwritten-motion-for-trial-transcript-3215.pdf\n", "Processing file 6020949-Aretha-Franklin-Handwritten-Will-1.pdf\n", "Processing file 6464722-Fn-22-03252.pdf\n", "Processing file 21117526-pv-handwritten-pre-sentencing-memorandum-21312.pdf\n", "Processing file 705120-mingle-notes.pdf\n", "Processing file 1213307-rialto-unified-holocaust-essays-set-13-part-05.pdf\n", "Processing file 4189369-HANDWRITTEN-SHEET-WITH-REPORT-NUMBERS.pdf\n", "Processing file 202586-doc20.pdf\n", "Processing file 4184368-OPS-NOTES-ON-AMBIDDY.pdf\n", "Processing file 393660-fibroscopic-notes-original.pdf\n", "Processing file 4182778-HANDWRITTEN-NOTES.pdf\n", "Processing file 4187708-DOCUMENT-3-HANDWRITTEN-NOTE.pdf\n", "Processing file 702325-snitch-list.pdf\n", "Processing file 4180597-HANDWRITTEN-MEMO-RE-IMPLICATION-OF-ALINE-MOSBY.pdf\n", "Processing file 1236260-sidney-holters-note.pdf\n", "Processing file 3223330-Scan-20111118-200635.pdf\n", "Processing file 3223289-Scan-20111116-171028.pdf\n", "Processing file 21043867-2021-01-25-mo-sen-eigel-handwritten-note-sb-66-allows-people-to-run-over-protestors.pdf\n", "Processing file 3233254-Earl-Bradley-Letter-102516.pdf\n", "Processing file 3227804-Cornell-Handwritten-Letters.pdf\n", "Processing file 3862956-Handwritten-Note-of-Meeting-With-Howard-Hill.pdf\n", "Processing file 803848-elvis-presleys-letter-to-richard-nixon.pdf\n", "Processing file 3223331-Scan-20111118-200718.pdf\n", "Processing file 202599-doc33.pdf\n", "Processing file 5690139-Child-s-Australia-Day-Letter.pdf\n", "Processing file 4182800-HANDWRITTEN-MEMO-ON-HUNT-AND-HIS-USE-OF-A-PEN-NAME.pdf\n", "Processing file 3223399-Scan-20111125-141919.pdf\n", "Processing file 3521430-IMG-4827.pdf\n", "Processing file 4420170-Mullkoff-Note.pdf\n", "Processing file 4185023-HANDWRITTEN-NOTE-RE-FILING-DOCUMENTS-IN-DIAZ-L-S.pdf\n", "Processing file 2746630-Cardenas-notes.pdf\n", "Processing file 1086683-hand-written-notes.pdf\n", "Processing file 5190857-WasteWS-07062018-120826-HANDWRITTEN-NO-OCR.pdf\n", "Processing file 4189400-HANDWRITTEN-CARDS-RESEARCH-DEPARTMENT-MEMORANDUM.pdf\n", "Processing file 3223321-Scan-20111118-195438.pdf\n", "Processing file 4328038-Correspondence-From-John-Crowley-to-Scott.pdf\n", "Processing file 5018573-Chad-Notes.pdf\n", "Processing file 21109051-pv-handwritten-letter-requesting-new-federal-defender-counsel-41212.pdf\n", "Processing file 5316881-Journal-Entries.pdf\n", "Processing file 4191532-OFFICE-NOTES-RE-AGENTS.pdf\n", "Processing file 1213274-rialto-unified-holocaust-essays-set-01-part-01.pdf\n", "Processing file 4490103-Gergely-Interview-Notes.pdf\n", "Processing file 2708465-Brandon-Astor-Jones-Timeline-HW.pdf\n", "Processing file 3462228-notes-3.pdf\n", "Processing file 528418-handwritten-note-from-april-3-2001.pdf\n", "Processing file 4185802-HANDWRITTEN-NOTES-MARY-CHECKING-FOR-PERTINENT.pdf\n", "Processing file 3011003-Richard-Bain-s-handwritten-account-of-Sept-4-2012.pdf\n", "Processing file 5002048-Doc-10-09-2018-12-42-54.pdf\n", "Processing file 1372097-applebee-valuch-notes.pdf\n", "Processing file 705119-davies-notes.pdf\n", "Processing file 3237222-Notes-1216.pdf\n", "Processing file 3718215-PP-D0414.pdf\n", "Processing file 4187089-LECTURE-AT-FARM.pdf\n", "Processing file 3223531-Scan-20111202-194427.pdf\n", "Processing file 4450893-Lithuanian-Extradition-Request-for-Release.pdf\n", "Processing file 4193146-11111705.pdf\n", "Processing file 526429-east-coast-rapist-suspects-apology-letter.pdf\n", "Processing file 4181294-HANDWRITTEN-LIST-DDO-FILE-REQUESTS.pdf\n", "Processing file 5764377-Handwritten-Notes.pdf\n", "Processing file 4183714-HANDWRITTEN-NOTE-RE-RICHARD-GIBSON-ISSUE.pdf\n", "Processing file 6393576-Lawson-Letter.pdf\n", "Processing file 1283645-dearest-celeste.pdf\n", "Processing file 2825042-Mathes-Handwritten-Notes.pdf\n", "Processing file 202597-doc31.pdf\n", "Processing file 21011336-foxs-hand-drawn-map.pdf\n", "Processing file 339815-jon-fagg-notes.pdf\n", "Processing file 1336598-hartmannnotes.pdf\n", "Processing file 2650353-1983-10-22-F4-Handritten.pdf\n", "Processing file 282179-july-14-1995-mladic-diary-handwritten.pdf\n", "Processing file 2271360-hartfield-handwritten-writ.pdf\n", "Processing file 4178930-HANDWRITTEN-DOC-SUMMARIES-ILLEGIBLE.pdf\n", "Processing file 5096174-Handwritten-Minutes.pdf\n", "Processing file 6382284-Letter-Ferrell-Scott.pdf\n", "Processing file 2474354-oland-statement-written.pdf\n", "Processing file 1306277-4-9-22-11-hanah-cho.pdf\n", "Processing file 3223449-Scan-20111125-145718.pdf\n", "Processing file 2517459-bill-clintons-handwritten-speech-from-the.pdf\n", "Processing file 3223329-Scan-20111118-200524.pdf\n", "Processing file 4179732-INVENTORY-HANDWRITTEN.pdf\n", "Processing file 3223447-Scan-20111125-145513.pdf\n", "Processing file 786278-frederick-pabsts-handwritten-will.pdf\n", "Processing file 1378369-bricker-doc.pdf\n", "Processing file 1371011-witness-40-journal-entry.pdf\n", "Processing file 4191654-HANDWRITTEN-NOTES-ON-CIA-AND-FBI-DOCUMENTS.pdf\n", "Processing file 803812-iggy-pops-letter-to-a-fan-1995.pdf\n", "Processing file 3223444-Scan-20111125-145344.pdf\n", "Processing file 4193147-11111706.pdf\n", "Processing file 2504031-joseph-brennick-handwritten-memos.pdf\n", "Processing file 21108969-pvs-amended-handwritten-41-page-motion-for-reconsideration-en-banc-11317.pdf\n", "Processing file 5190870-WasteWS-07062018-121609-HANDWRITTEN-NO-OCR-Pdf.pdf\n", "Processing file 3223420-Scan-20111125-142900.pdf\n", "Processing file 555087-vh-defrock-letter.pdf\n", "Processing file 5002051-Doc-10-09-2018-12-39-47.pdf\n", "Processing file 5634528-181217-Flynn-Fbi-Notes.pdf\n", "Processing file 5025013-J-R-Thomas-response.pdf\n", "Processing file 3223527-Scan-20111202-190234.pdf\n", "Processing file 4184490-HANDWRITTEN-BERNARD-BARKER-CHRONOLOGY.pdf\n", "Processing file 4911037-Manuel-Orrego-Savala-Letter.pdf\n", "Processing file 6571779-Frederick-Veal-Interview-Notes.pdf\n", "Processing file 1096680-alexandra-hollinghurst-notes.pdf\n", "Processing file 5780462-GGGW-v-Schwitzer-Handwritten-Ex-Staffer-Stmt.pdf\n", "Processing file 321762-gina-hutchinson-journal-entry.pdf\n", "Processing file 566855-inslee-letter.pdf\n", "Processing file 2811636-Callis-Handwritten-Letter-1999-13-09.pdf\n", "Processing file 803827-a-mothers-letter-to-the-foundling-asylum.pdf\n", "Processing file 4704949-Bechtel-Notes.pdf\n", "Processing file 6771659-Preston-Handwritten-Response.pdf\n", "Processing file 776417-helios-notes.pdf\n", "Processing file 4191914-FORM-MEMORANDUM-FOR-THE-RECORD-HARVEY-LEE-OSWALD.pdf\n", "Processing file 3462229-Notes-1.pdf\n", "Processing file 21030480-doj-notes.pdf\n", "Processing file 1658584-1admission-notes-redacted.pdf\n", "Processing file 1668278-doc066.pdf\n", "Processing file 5332373-1-AEA-2080-NOTES-HANDWRITTEN-CHRONOLOGY-EVENTS.pdf\n", "Processing file 528420-handwritten-notes-of-meeting-with-cori-june-26.pdf\n", "Processing file 2777703-AV-Interview-Notes.pdf\n", "Processing file 2163779-skm-c224e15070214420-pdf-handwritten-notes-about.pdf\n", "Processing file 266726-boogaard-journal.pdf\n" ] } ], "source": [ "process_training_data()" ] } ], "metadata": { "kernelspec": { "display_name": "Python (hydra)", "language": "python", "name": "hydra" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.1" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }