Spaces:
Sleeping
Sleeping
File size: 26,779 Bytes
861182c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "63d95b8f-7559-4a3d-b025-5bb788914dc9",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pdf2image as p2i\n",
"from os import path\n",
"from pathlib import Path\n",
"from PIL import Image\n",
"\n",
"# Allow for unlimited image size, some documents are pretty big...\n",
"Image.MAX_IMAGE_PIXELS = None\n",
"\n",
"\n",
"def process_pdf_document(filepath):\n",
"    \"\"\"Render every page of a PDF and return one labelled record per page.\n",
"\n",
"    The document type is taken from the name of the folder that directly\n",
"    contains the file. Every page is labelled with that document type,\n",
"    except the final page of a multi-page document, which is labelled\n",
"    '<doctype>-last'. A single-page document keeps the plain label.\n",
"\n",
"    Args:\n",
"        filepath: Path to the PDF file to convert.\n",
"\n",
"    Returns:\n",
"        A list with one dict per page holding the keys 'filepath', 'width',\n",
"        'height', 'bytes' (the result of the page image's tobytes()) and\n",
"        'label'. The list is empty when the file is zero bytes long.\n",
"    \"\"\"\n",
"    if path.getsize(filepath) == 0:\n",
"        # TODO: substitute for logging\n",
"        print(f'{filepath} is empty, skipping')\n",
"        return []\n",
"\n",
"    pages = p2i.convert_from_path(filepath)\n",
"\n",
"    # Use the containing folder's name rather than slicing the first two path\n",
"    # components: that slice only worked for relative paths that were exactly\n",
"    # '<root>/<doctype>/<file>'. Fall back to 'other' for a bare filename.\n",
"    doctype = Path(filepath).parent.name or 'other'\n",
"    last_page_i = len(pages) - 1\n",
"\n",
"    processed_pages: list[dict] = []\n",
"    for page_i, page in enumerate(pages):\n",
"        # The first page is never marked as last, so a one-page document\n",
"        # is labelled with the plain doctype.\n",
"        is_last_page = page_i > 0 and page_i == last_page_i\n",
"        label = f'{doctype}-last' if is_last_page else doctype\n",
"\n",
"        processed_pages.append({\n",
"            'filepath': filepath,\n",
"            'width': page.width,\n",
"            'height': page.height,\n",
"            'bytes': page.tobytes(),\n",
"            'label': label,\n",
"        })\n",
"\n",
"    return processed_pages\n",
"\n",
"\n",
"def process_training_data(data_dir='./data'):\n",
"    \"\"\"Convert the PDFs under data_dir into one parquet file per folder.\n",
"\n",
"    Walks data_dir recursively. For every folder that yields at least one\n",
"    page, the page records of all its PDFs are gathered into a DataFrame\n",
"    and written to '<data_dir>/<folder name>.parquet'. Folders that yield\n",
"    no pages are skipped so no empty parquet file is produced.\n",
"\n",
"    Args:\n",
"        data_dir: Root folder to scan; the parquet files are written\n",
"            directly inside it. Defaults to './data'.\n",
"    \"\"\"\n",
"    data_dir = Path(data_dir)\n",
"\n",
"    for dirname, _, files in os.walk(data_dir):\n",
"        print(f'Processing folder {dirname}')\n",
"        doctype = path.basename(dirname)\n",
"        page_records = []\n",
"\n",
"        # Sorted so the row order does not depend on directory listing order.\n",
"        for filename in sorted(files):\n",
"            print(f'Processing file {filename}')\n",
"            filepath = path.join(dirname, filename)\n",
"            _, ext = path.splitext(filepath)\n",
"\n",
"            if ext.lower() == '.pdf':\n",
"                page_records.extend(process_pdf_document(filepath))\n",
"\n",
"        if not page_records:\n",
"            # Nothing was converted in this folder; writing an empty,\n",
"            # column-less frame would only leave a useless parquet file.\n",
"            print(f'No PDF pages found in {dirname}, skipping')\n",
"            continue\n",
"\n",
"        # Build the frame once; concatenating inside the loop re-copies all\n",
"        # previously collected rows for every file.\n",
"        df = pd.DataFrame(page_records)\n",
"\n",
"        parquet_filepath = path.join(data_dir, f'{doctype}.parquet')\n",
"        print(f'Saving data for {doctype} in {parquet_filepath}')\n",
"        print(df)\n",
"        df.to_parquet(parquet_filepath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1efb8e82-92bc-46dc-9553-cdc712211c93",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing folder data\n",
"Processing file data.parquet\n",
"Processing file .DS_Store\n",
"Saving data for data in data/data.parquet\n",
"Empty DataFrame\n",
"Columns: []\n",
"Index: []\n",
"Processing folder data/form\n",
"Processing file 1384753-request-form.pdf\n",
"Processing file 772791-12_sipc1301f_peer-individual-evaluation-form.pdf\n",
"Processing file 6428556-Cassatt-Form-D-2008.pdf\n",
"Processing file 3895717-INFORMATION-DISCLOSURE-REQUEST-FORM.pdf\n",
"Processing file 4321053-document-21556514.pdf\n",
"Processing file 4575531-REDACTED-PIW-CR285646.pdf\n",
"Processing file 6155510-Form-I.pdf\n",
"Processing file 3885938-JayaServicesLtd-Appointment-15112002.pdf\n",
"Processing file 4950830-Cook1995.pdf\n",
"Processing file 6368627-2013-Southern-Company-990.pdf\n",
"Processing file 4436484-Columbia-Armed-Guard-application-form.pdf\n",
"Processing file 21074299-company-application-form-for-dominicana-acquisition-sa.pdf\n",
"Processing file 746365-wa-madsen-barbara-2012.pdf\n",
"Processing file 5770184-CAMCF-RFYL-Registration-Form.pdf\n",
"Processing file 1683073-medication-info-form-eng-revised-2014.pdf\n",
"Processing file 4423347-Request-Form.pdf\n",
"Processing file 3022163-Event-Form.pdf\n",
"Processing file 1685242-cv-fresh-entry-form.pdf\n",
"Processing file 4910987-Records-Request-Form.pdf\n",
"Processing file 4349763-DOD-Form-254.pdf\n",
"Processing file 3885273-EIMEuropeanInvestmentsManagement-ChangeofName.pdf\n",
"Processing file 4951858-Ober3.pdf\n",
"Processing file 7034791-UOF-form.pdf\n",
"Processing file 6206290-Thunderbird-GBC1-Application-Form.pdf\n",
"Processing file 465336-disclosure-form-13491099365164-_-pdf.pdf\n",
"Processing file 6538004-Aderholt-Travel-Disclosure-Form.pdf\n",
"Processing file 1292759-host-accept-form-attachment.pdf\n",
"Processing file 1238787-request-form.pdf\n",
"Processing file 6173981-Cert-of-ID-form.pdf\n",
"Processing file 3220944-Sitton-July-2015-Campaign-Finance-Report.pdf\n",
"Processing file 6882316-Request-Form.pdf\n",
"Processing file 4111213-Coffman-Election-Filings.pdf\n",
"Processing file 4773259-RECORDS-REQUEST-FORM.pdf\n",
"Processing file 3116806-Black-River-Technical-College-Request-for-C-23.pdf\n",
"Processing file 761264-13_ee402a01_expository-essay-interpretation-of.pdf\n",
"Processing file 5955192-North-Dakota-corn-dogs.pdf\n",
"Processing file 6882816-Disaster-Relief-Cash-Assistance-Application-Form.pdf\n",
"Processing file 1303092-appeal-form.pdf\n",
"Processing file 3224254-CCT-Holidayhope-2016-Form.pdf\n",
"Processing file 528353-disclosure-form-13512855487444-_-pdf.pdf\n",
"Processing file 696442-nms-timeline-complaint.pdf\n",
"Processing file 842767-shallotte-assisted-living-penalty-packet-4.pdf\n",
"Processing file 4484638-GENERAL-APRA-form-2018.pdf\n",
"Processing file 20393262-margaux-keiser-form-460.pdf\n",
"Processing file 6785703-Release-form.pdf\n",
"Processing file 1683896-rtk-form-16241-pdf.pdf\n",
"Processing file 3234225-Philadelphia-Form-30401-pdf.pdf\n",
"Processing file 442530-collect-files-35870-political-file-2012-non.pdf\n",
"Processing file 4108784-2012-FSA-Enrollment-Form.pdf\n",
"Processing file 2426599-opega-review-request-form.pdf\n",
"Processing file 20429474-caserta-evidence-list.pdf\n",
"Processing file 7001493-ABC-Trailblazer-Application-Form-2020-21.pdf\n",
"Processing file 6310093-Registration.pdf\n",
"Processing file 21085990-mcguire-3q21.pdf\n",
"Processing file 6879322-Form-8871.pdf\n",
"Processing file 21182127-20220118-docketing-statement-b.pdf\n",
"Processing file 5518517-Corbett-email-PRR-017155-pdf.pdf\n",
"Processing file 4058754-Records-Release-pdf.pdf\n",
"Processing file 1386200-annual-disclosure-request-form.pdf\n",
"Processing file 7202517-Photo-Release-Form.pdf\n",
"Processing file 3860778-810A-Evidence-Submission-Form.pdf\n",
"Processing file 5899836-Medford-Tp-summary-form.pdf\n",
"Processing file 5776714-CRID-1046835.pdf\n",
"Processing file 6536846-Transcript-Request.pdf\n",
"Processing file 7041210-Development-Application-Form.pdf\n",
"Processing file 466205-wa-united-for-marriage-avails-request-form.pdf\n",
"Processing file 7000910-NCFPD-CORA-Request-Form.pdf\n",
"Processing file 21884855-dos-22-04-051.pdf\n",
"data/form/21884855-dos-22-04-051.pdf is empty, skipping\n",
"Saving data for form in data/form.parquet\n",
" filepath width height \\\n",
"0 data/form/1384753-request-form.pdf 1700 2200 \n",
"1 data/form/772791-12_sipc1301f_peer-individual-... 1700 2200 \n",
"2 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
"3 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
"4 data/form/6428556-Cassatt-Form-D-2008.pdf 1700 2200 \n",
".. ... ... ... \n",
"618 data/form/7041210-Development-Application-Form... 1700 2200 \n",
"619 data/form/7041210-Development-Application-Form... 1700 2200 \n",
"620 data/form/7041210-Development-Application-Form... 1700 2200 \n",
"621 data/form/466205-wa-united-for-marriage-avails... 1700 2200 \n",
"622 data/form/7000910-NCFPD-CORA-Request-Form.pdf 1700 2200 \n",
"\n",
" bytes label \n",
"0 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
"1 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
"2 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
"3 b'\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe... form \n",
"4 b'\\x00\\x00\\x00\\x00\\x00\\x00\\xfe\\xfe\\xfe\\xff\\xff... form \n",
".. ... ... \n",
"618 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
"619 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
"620 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form-last \n",
"621 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
"622 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... form \n",
"\n",
"[623 rows x 5 columns]\n",
"Processing folder data/scientific publication\n",
"Processing file 6425893-Oxford-Journals-Article.pdf\n",
"Processing file websearch_7_00001_scientific_publication.pdf\n",
"Processing file websearch_7_00017_scientific_publication.pdf\n",
"Processing file websearch_7_00022_scientific_publication.pdf\n",
"Processing file 3923078-Paper-1.pdf\n",
"Processing file websearch_7_00025_scientific_publication.pdf\n",
"Processing file websearch_7_00010_scientific_publication.pdf\n",
"Processing file 4247250-JACOBSON-Original-Article.pdf\n",
"Processing file websearch_7_00006_scientific_publication.pdf\n",
"Processing file 3559070-McCance-Katz-Et-Al-2017-the-American-Journal-on.pdf\n",
"Processing file websearch_7_00011_scientific_publication.pdf\n",
"Processing file websearch_7_00024_scientific_publication.pdf\n",
"Processing file websearch_7_00007_scientific_publication.pdf\n",
"Processing file websearch_7_00000_scientific_publication.pdf\n",
"Processing file websearch_7_00023_scientific_publication.pdf\n",
"Processing file 21085273-wood-schulman-article.pdf\n",
"Processing file websearch_7_00016_scientific_publication.pdf\n",
"Processing file 2189923-lancet-article-from-1989.pdf\n",
"Processing file 7223327-Forensic-Article.pdf\n",
"Processing file websearch_7_00009_scientific_publication.pdf\n",
"Processing file websearch_7_00005_scientific_publication.pdf\n",
"Processing file 6429770-Anderson2012-Article.pdf\n",
"Processing file websearch_7_00013_scientific_publication.pdf\n",
"Processing file websearch_7_00026_scientific_publication.pdf\n",
"Processing file websearch_7_00021_scientific_publication.pdf\n",
"Processing file websearch_7_00018_scientific_publication.pdf\n",
"Processing file websearch_7_00014_scientific_publication.pdf\n",
"Processing file websearch_7_00002_scientific_publication.pdf\n",
"Processing file websearch_7_00019_scientific_publication.pdf\n",
"Processing file websearch_7_00015_scientific_publication.pdf\n",
"Processing file websearch_7_00020_scientific_publication.pdf\n",
"Processing file websearch_7_00003_scientific_publication.pdf\n",
"Processing file 5980147-Venu-B-Article-1.pdf\n",
"Processing file 20519853-2007-recommendations-for-medical-management-of-adult-lead-exposure.pdf\n",
"Processing file 4343429-Meltdown-paper.pdf\n",
"Processing file 785243-paper.pdf\n",
"Processing file websearch_7_00008_scientific_publication.pdf\n",
"Processing file websearch_7_00004_scientific_publication.pdf\n",
"Processing file websearch_7_00012_scientific_publication.pdf\n",
"Saving data for scientific publication in data/scientific publication.parquet\n",
" filepath width height \\\n",
"0 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
"1 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
"2 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
"3 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
"4 data/scientific publication/6425893-Oxford-Jou... 1700 2200 \n",
".. ... ... ... \n",
"532 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
"533 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
"534 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
"535 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
"536 data/scientific publication/websearch_7_00012_... 1700 2200 \n",
"\n",
" bytes \\\n",
"0 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
"1 b'\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe\\xfe... \n",
"2 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
"3 b'\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd\\xfd... \n",
"4 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
".. ... \n",
"532 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
"533 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
"534 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
"535 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
"536 b'\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff... \n",
"\n",
" label \n",
"0 scientific publication \n",
"1 scientific publication \n",
"2 scientific publication \n",
"3 scientific publication \n",
"4 scientific publication \n",
".. ... \n",
"532 scientific publication \n",
"533 scientific publication \n",
"534 scientific publication \n",
"535 scientific publication \n",
"536 scientific publication-last \n",
"\n",
"[537 rows x 5 columns]\n",
"Processing folder data/handwritten\n",
"Processing file 3223324-Scan-20111118-195759.pdf\n",
"Processing file 20743971-handwritten-statement.pdf\n",
"Processing file 21030538-doj-donoghue-notes.pdf\n",
"Processing file 519547-sere-psychologist-and-bush-admin-torture.pdf\n",
"Processing file 6240196-Zenith-Rathfelder-Notes.pdf\n",
"Processing file 1630443-nick-stoneman-notes.pdf\n",
"Processing file 4182029-HANDWRITTEN-COMPILATION-OF-COLLECTION.pdf\n",
"Processing file 528421-handwritten-notes-on-financial-offer-from-orders.pdf\n",
"Processing file 5498003-1996-08-11-Handwritten-Law-Enforcement-Notes.pdf\n",
"Processing file 202587-doc21.pdf\n",
"Processing file 5190901-WasteWS-07062018-145153-Handwritten-Letter-NO-OCR.pdf\n",
"Processing file 625990-thatcher-mt-notes.pdf\n",
"Processing file 782204-loeser-will.pdf\n",
"Processing file 2189928-handwritten-agreement.pdf\n",
"Processing file 1223362-gauthe-clinical-notes.pdf\n",
"Processing file 4190352-MEMORANDUM-HIDALGO-BALMES-HANDWRITTEN.pdf\n",
"Processing file 6939323-7-8-11-83-Handwritten-Notes.pdf\n",
"Processing file 2996996-TL-Handwritten-Fax-Labeled-Confidential.pdf\n",
"Processing file 3468379-DPD-Use-of-Force-Handwritten-Notes.pdf\n",
"Processing file 4188317-LIST-HANDWRITTEN-SUMMARIES-NOTES.pdf\n",
"Processing file 339816-jeff-long-notes.pdf\n",
"Processing file 3864522-Handwritten-letter-in-a-defense-sentencing-memo.pdf\n",
"Processing file 4183043-HANDWRITTEN-MEMO-SUBJ-ARRIVAL-COVERT-KEY-WEST.pdf\n",
"Processing file 5002049-Doc-10-09-2018-12-41-31.pdf\n",
"Processing file 4780536-Jury-Notes-Manafort-Trial.pdf\n",
"Processing file 2300328-gov-lepage-handwritten-note-on-lmf.pdf\n",
"Processing file 3002469-Investigation-Notes-Witness-Statements.pdf\n",
"Processing file 4193148-11111707.pdf\n",
"Processing file 4598117-2002-Handwritten-Notes-of-City-Meeting.pdf\n",
"Processing file 4444642-Letter-Chase-Nicholson-s-grandmother.pdf\n",
"Processing file 2519466-10-10-13-handwritten-list-of-malfunctioning-doors.pdf\n",
"Processing file 2940920-2002-09-12-Letter-Handwritten-Tony-Blair-to.pdf\n",
"Processing file 4187525-HANDWRITTEN-NOTE-CUBAN-ACTIVITIES-IN-ARGENTINA.pdf\n",
"Processing file 3462227-Notes-2.pdf\n",
"Processing file 2791337-FBI-Handwritten-Notes.pdf\n",
"Processing file 354332-notes.pdf\n",
"Processing file 4182092-HANDWRITTEN-PAPER-CONCERNING-PROJECT-ZRRIFLE.pdf\n",
"Processing file 3766497-Detective-Notes.pdf\n",
"Processing file 6749500-LEOPOLD-FILES-Bruce-Jessen-Handwritten-Notes.pdf\n",
"Processing file 3220458-Handwritten-Note.pdf\n",
"Processing file 4188296-HANDWRITTEN-LIST-OF-14-NAMES.pdf\n",
"Processing file 804992-fairhope-city-council-resolution-10-14-13.pdf\n",
"Processing file 2997471-Handwritten-Notes-to-and-From-Gov-LePage.pdf\n",
"Processing file 4598082-Letter-to-Judge-Schroeder.pdf\n",
"Processing file 3877562-Mikulcik-Interview-Notes.pdf\n",
"Processing file 479570-emily-dickinsons-handwritten-poem.pdf\n",
"Processing file 21108937-pv-handwritten-motion-for-trial-transcript-3215.pdf\n",
"Processing file 6020949-Aretha-Franklin-Handwritten-Will-1.pdf\n",
"Processing file 6464722-Fn-22-03252.pdf\n",
"Processing file 21117526-pv-handwritten-pre-sentencing-memorandum-21312.pdf\n",
"Processing file 705120-mingle-notes.pdf\n",
"Processing file 1213307-rialto-unified-holocaust-essays-set-13-part-05.pdf\n",
"Processing file 4189369-HANDWRITTEN-SHEET-WITH-REPORT-NUMBERS.pdf\n",
"Processing file 202586-doc20.pdf\n",
"Processing file 4184368-OPS-NOTES-ON-AMBIDDY.pdf\n",
"Processing file 393660-fibroscopic-notes-original.pdf\n",
"Processing file 4182778-HANDWRITTEN-NOTES.pdf\n",
"Processing file 4187708-DOCUMENT-3-HANDWRITTEN-NOTE.pdf\n",
"Processing file 702325-snitch-list.pdf\n",
"Processing file 4180597-HANDWRITTEN-MEMO-RE-IMPLICATION-OF-ALINE-MOSBY.pdf\n",
"Processing file 1236260-sidney-holters-note.pdf\n",
"Processing file 3223330-Scan-20111118-200635.pdf\n",
"Processing file 3223289-Scan-20111116-171028.pdf\n",
"Processing file 21043867-2021-01-25-mo-sen-eigel-handwritten-note-sb-66-allows-people-to-run-over-protestors.pdf\n",
"Processing file 3233254-Earl-Bradley-Letter-102516.pdf\n",
"Processing file 3227804-Cornell-Handwritten-Letters.pdf\n",
"Processing file 3862956-Handwritten-Note-of-Meeting-With-Howard-Hill.pdf\n",
"Processing file 803848-elvis-presleys-letter-to-richard-nixon.pdf\n",
"Processing file 3223331-Scan-20111118-200718.pdf\n",
"Processing file 202599-doc33.pdf\n",
"Processing file 5690139-Child-s-Australia-Day-Letter.pdf\n",
"Processing file 4182800-HANDWRITTEN-MEMO-ON-HUNT-AND-HIS-USE-OF-A-PEN-NAME.pdf\n",
"Processing file 3223399-Scan-20111125-141919.pdf\n",
"Processing file 3521430-IMG-4827.pdf\n",
"Processing file 4420170-Mullkoff-Note.pdf\n",
"Processing file 4185023-HANDWRITTEN-NOTE-RE-FILING-DOCUMENTS-IN-DIAZ-L-S.pdf\n",
"Processing file 2746630-Cardenas-notes.pdf\n",
"Processing file 1086683-hand-written-notes.pdf\n",
"Processing file 5190857-WasteWS-07062018-120826-HANDWRITTEN-NO-OCR.pdf\n",
"Processing file 4189400-HANDWRITTEN-CARDS-RESEARCH-DEPARTMENT-MEMORANDUM.pdf\n",
"Processing file 3223321-Scan-20111118-195438.pdf\n",
"Processing file 4328038-Correspondence-From-John-Crowley-to-Scott.pdf\n",
"Processing file 5018573-Chad-Notes.pdf\n",
"Processing file 21109051-pv-handwritten-letter-requesting-new-federal-defender-counsel-41212.pdf\n",
"Processing file 5316881-Journal-Entries.pdf\n",
"Processing file 4191532-OFFICE-NOTES-RE-AGENTS.pdf\n",
"Processing file 1213274-rialto-unified-holocaust-essays-set-01-part-01.pdf\n",
"Processing file 4490103-Gergely-Interview-Notes.pdf\n",
"Processing file 2708465-Brandon-Astor-Jones-Timeline-HW.pdf\n",
"Processing file 3462228-notes-3.pdf\n",
"Processing file 528418-handwritten-note-from-april-3-2001.pdf\n",
"Processing file 4185802-HANDWRITTEN-NOTES-MARY-CHECKING-FOR-PERTINENT.pdf\n",
"Processing file 3011003-Richard-Bain-s-handwritten-account-of-Sept-4-2012.pdf\n",
"Processing file 5002048-Doc-10-09-2018-12-42-54.pdf\n",
"Processing file 1372097-applebee-valuch-notes.pdf\n",
"Processing file 705119-davies-notes.pdf\n",
"Processing file 3237222-Notes-1216.pdf\n",
"Processing file 3718215-PP-D0414.pdf\n",
"Processing file 4187089-LECTURE-AT-FARM.pdf\n",
"Processing file 3223531-Scan-20111202-194427.pdf\n",
"Processing file 4450893-Lithuanian-Extradition-Request-for-Release.pdf\n",
"Processing file 4193146-11111705.pdf\n",
"Processing file 526429-east-coast-rapist-suspects-apology-letter.pdf\n",
"Processing file 4181294-HANDWRITTEN-LIST-DDO-FILE-REQUESTS.pdf\n",
"Processing file 5764377-Handwritten-Notes.pdf\n",
"Processing file 4183714-HANDWRITTEN-NOTE-RE-RICHARD-GIBSON-ISSUE.pdf\n",
"Processing file 6393576-Lawson-Letter.pdf\n",
"Processing file 1283645-dearest-celeste.pdf\n",
"Processing file 2825042-Mathes-Handwritten-Notes.pdf\n",
"Processing file 202597-doc31.pdf\n",
"Processing file 21011336-foxs-hand-drawn-map.pdf\n",
"Processing file 339815-jon-fagg-notes.pdf\n",
"Processing file 1336598-hartmannnotes.pdf\n",
"Processing file 2650353-1983-10-22-F4-Handritten.pdf\n",
"Processing file 282179-july-14-1995-mladic-diary-handwritten.pdf\n",
"Processing file 2271360-hartfield-handwritten-writ.pdf\n",
"Processing file 4178930-HANDWRITTEN-DOC-SUMMARIES-ILLEGIBLE.pdf\n",
"Processing file 5096174-Handwritten-Minutes.pdf\n",
"Processing file 6382284-Letter-Ferrell-Scott.pdf\n",
"Processing file 2474354-oland-statement-written.pdf\n",
"Processing file 1306277-4-9-22-11-hanah-cho.pdf\n",
"Processing file 3223449-Scan-20111125-145718.pdf\n",
"Processing file 2517459-bill-clintons-handwritten-speech-from-the.pdf\n",
"Processing file 3223329-Scan-20111118-200524.pdf\n",
"Processing file 4179732-INVENTORY-HANDWRITTEN.pdf\n",
"Processing file 3223447-Scan-20111125-145513.pdf\n",
"Processing file 786278-frederick-pabsts-handwritten-will.pdf\n",
"Processing file 1378369-bricker-doc.pdf\n",
"Processing file 1371011-witness-40-journal-entry.pdf\n",
"Processing file 4191654-HANDWRITTEN-NOTES-ON-CIA-AND-FBI-DOCUMENTS.pdf\n",
"Processing file 803812-iggy-pops-letter-to-a-fan-1995.pdf\n",
"Processing file 3223444-Scan-20111125-145344.pdf\n",
"Processing file 4193147-11111706.pdf\n",
"Processing file 2504031-joseph-brennick-handwritten-memos.pdf\n",
"Processing file 21108969-pvs-amended-handwritten-41-page-motion-for-reconsideration-en-banc-11317.pdf\n",
"Processing file 5190870-WasteWS-07062018-121609-HANDWRITTEN-NO-OCR-Pdf.pdf\n",
"Processing file 3223420-Scan-20111125-142900.pdf\n",
"Processing file 555087-vh-defrock-letter.pdf\n",
"Processing file 5002051-Doc-10-09-2018-12-39-47.pdf\n",
"Processing file 5634528-181217-Flynn-Fbi-Notes.pdf\n",
"Processing file 5025013-J-R-Thomas-response.pdf\n",
"Processing file 3223527-Scan-20111202-190234.pdf\n",
"Processing file 4184490-HANDWRITTEN-BERNARD-BARKER-CHRONOLOGY.pdf\n",
"Processing file 4911037-Manuel-Orrego-Savala-Letter.pdf\n",
"Processing file 6571779-Frederick-Veal-Interview-Notes.pdf\n",
"Processing file 1096680-alexandra-hollinghurst-notes.pdf\n",
"Processing file 5780462-GGGW-v-Schwitzer-Handwritten-Ex-Staffer-Stmt.pdf\n",
"Processing file 321762-gina-hutchinson-journal-entry.pdf\n",
"Processing file 566855-inslee-letter.pdf\n",
"Processing file 2811636-Callis-Handwritten-Letter-1999-13-09.pdf\n",
"Processing file 803827-a-mothers-letter-to-the-foundling-asylum.pdf\n",
"Processing file 4704949-Bechtel-Notes.pdf\n",
"Processing file 6771659-Preston-Handwritten-Response.pdf\n",
"Processing file 776417-helios-notes.pdf\n",
"Processing file 4191914-FORM-MEMORANDUM-FOR-THE-RECORD-HARVEY-LEE-OSWALD.pdf\n",
"Processing file 3462229-Notes-1.pdf\n",
"Processing file 21030480-doj-notes.pdf\n",
"Processing file 1658584-1admission-notes-redacted.pdf\n",
"Processing file 1668278-doc066.pdf\n",
"Processing file 5332373-1-AEA-2080-NOTES-HANDWRITTEN-CHRONOLOGY-EVENTS.pdf\n",
"Processing file 528420-handwritten-notes-of-meeting-with-cori-june-26.pdf\n",
"Processing file 2777703-AV-Interview-Notes.pdf\n",
"Processing file 2163779-skm-c224e15070214420-pdf-handwritten-notes-about.pdf\n",
"Processing file 266726-boogaard-journal.pdf\n"
]
}
],
"source": [
"process_training_data()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (hydra)",
"language": "python",
"name": "hydra"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|