File size: 5,118 Bytes
97047b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import json
import random
import gradio as gr
import pymupdf
def parse_pdf(file_path: str):
    """Parse a trip-report PDF into a JSON file and a short text summary.

    The PDF's text blocks are walked page by page with a small state machine.
    A block containing "Fidelis Agent" marks the table header; the block
    right after it is skipped, and the following blocks are consumed as the
    successive cells of each table row.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(metadata, output_path)`` tuple. ``metadata`` is a three-line
        summary (report run date/time, transportation provider name, number
        of parsed rows). ``output_path`` is the path of the JSON file written
        to the current working directory; it holds the keys
        ``report_run_date_time``, ``transportation_provider_name`` and
        ``items`` (the list of parsed rows).

    Raises:
        ValueError: If a block does not have the expected number of fields
            or lines, or a numeric field cannot be converted to ``int``.
        IndexError: If the "Report Run Date Time" block has fewer than three
            lines.
    """
    # Horizontal thresholds (in page coordinates) used to tell cells apart.
    # NOTE(review): presumably 700 is the left edge of the "Fidelis Agent"
    # column and 719 lies inside it -- confirm against a sample report.
    agent_column_x0 = 700
    merged_block_x1 = 719

    # extractBLOCKS() returns plain tuples, so everything needed can be read
    # up front and the document closed before parsing starts.
    with pymupdf.open(file_path) as doc:
        pages = [page.get_textpage().extractBLOCKS() for page in doc]

    report_run_date_time = None
    transportation_provider_name = None
    rows = []
    row = {}

    for blocks in pages:
        # Every page starts outside the table until its header is seen.
        state = None
        for block in blocks:
            x0, _, x1, _, text = block[:5]
            text = text.strip()
            row_done = False

            if text.startswith("Report Run Date Time"):
                lines = text.split("\n")
                report_run_date_time = lines[1]
                transportation_provider_name = (
                    lines[2].replace("Transportation Provider Name:", "").strip()
                )
            elif "Fidelis Agent" in text:
                # Table header found; the next block is skipped.
                state = "page_no"
            elif state == "page_no":
                state = "row_1"
            elif state == "row_1":
                lines = text.split("\n")
                if len(lines) == 1:
                    type_, case_no, plan_id, member_id = lines[0].split(" ")
                else:
                    # The type sits on its own line above the three ids.
                    type_ = lines[0]
                    case_no, plan_id, member_id = lines[1].split(" ")
                row["type"] = type_
                row["case_no"] = case_no
                row["plan_id"] = plan_id
                row["member_id"] = member_id
                state = "row_2"
            elif state == "row_2":
                row["pickup_info"] = text
                state = "row_3"
            elif state == "row_3":
                row["pickup_time"] = (
                    text.replace("Pickup:", "").strip().replace("\n", " ")
                )
                state = "row_4"
            elif state == "row_4":
                lines = text.split("\n")
                # Explicit check instead of ``assert`` so it survives ``-O``.
                if len(lines) not in (3, 4):
                    raise ValueError(
                        "Expected 3 or 4 lines in the trip details block, "
                        f"got {len(lines)}: {text!r}"
                    )
                row["num_of_one_way_trips"] = int(lines[0])
                row["vehicle_type"] = lines[1]
                row["num_of_riders"] = int(lines[2])
                # The authorization number is only present as a fourth line.
                row["auth_number"] = lines[3] if len(lines) == 4 else None
                state = "row_5"
            elif state == "row_5":
                row["dest_info"] = text
                state = "row_6"
            elif state == "row_6":
                if x0 >= agent_column_x0:
                    # No comments cell: this block is already the agent.
                    row["special_needs_and_comments"] = None
                    row["fidelis_agent"] = text
                    row_done = True
                elif x1 > merged_block_x1:
                    # Comments and agent arrived as one block; the agent is
                    # its last line.
                    *comment_lines, agent = text.split("\n")
                    row["special_needs_and_comments"] = "\n".join(comment_lines)
                    row["fidelis_agent"] = agent
                    row_done = True
                else:
                    row["special_needs_and_comments"] = text
                    state = "row_7"
            elif state == "row_7":
                if x0 >= agent_column_x0:
                    row["fidelis_agent"] = text
                    row_done = True
                else:
                    # Second comments block: appended with no separator.
                    row["special_needs_and_comments"] += text
                    state = "row_8"
            elif state == "row_8":
                # After two comment blocks only the agent cell may follow.
                if x0 < agent_column_x0:
                    raise ValueError(
                        "Expected the agent block after the comments, "
                        f"got a block starting at x0={x0}: {text!r}"
                    )
                row["fidelis_agent"] = text
                row_done = True

            if row_done:
                rows.append(row)
                row = {}
                state = "row_1"

    metadata = (
        f"Report Run Date Time: {report_run_date_time}\n"
        f"Transportation Provider Name: {transportation_provider_name}\n"
        f"Number of Items: {len(rows)}\n"
    )
    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # This key used to be misspelled "itmes".
        "items": rows,
    }
    output_path = f"output_{random.randint(0, 1000000):08d}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4)
    return metadata, output_path
def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The interface takes one uploaded PDF file and shows two outputs: a
    textbox with the metadata summary and a button to download the JSON file.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Gradio expects one of the strings "never", "auto" or "manual" here;
        # "never" disables flagging, which is what the boolean False meant.
        allow_flagging="never",
    )
    app.launch()


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|