|
import json |
|
import random |
|
|
|
import gradio as gr |
|
import pymupdf |
|
|
|
|
|
def parse_pdf(file_path: str):
    """Extract trip records from a report PDF and write them to a JSON file.

    The text of every page is read block by block through PyMuPDF.  A small
    state machine walks the blocks of a page: the block containing
    "Fidelis Agent" marks the table header, the block right after it is
    skipped, and the following blocks are consumed as the successive fields
    of one record until the agent block closes that record.

    Args:
        file_path: Path of the PDF to parse.

    Returns:
        A ``(metadata, output_path)`` tuple: a short human-readable summary
        and the path of the JSON file that was written.

    Raises:
        ValueError: If a block does not have the layout the state machine
            expects (unexpected number of fields, non-numeric counters, or a
            misplaced agent block).
    """
    report_run_date_time = None
    transportation_provider_name = None

    rows = []
    row = {}

    # Open the document as a context manager so the underlying file handle is
    # released even when parsing fails part-way through.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text_page = page.get_textpage()

            # The table header has to be seen again on every page before any
            # record is read from that page.
            state = None
            for block in text_page.extractBLOCKS():
                # Only the horizontal extent and the text of a block are used.
                x0, _y0, x1, _y1, text = block[:5]
                text = text.strip()
                row_complete = False

                if text.startswith("Report Run Date Time"):
                    lines = text.split("\n")
                    report_run_date_time = lines[1]
                    transportation_provider_name = (
                        lines[2].replace("Transportation Provider Name:", "").strip()
                    )
                elif "Fidelis Agent" in text:
                    state = "page_no"
                elif state == "page_no":
                    # Skip the single block that follows the header block.
                    state = "row_1"
                elif state == "row_1":
                    lines = text.split("\n")
                    if len(lines) == 1:
                        type_, case_no, plan_id, member_id = lines[0].split(" ")
                    else:
                        type_ = lines[0]
                        case_no, plan_id, member_id = lines[1].split(" ")
                    row["type"] = type_
                    row["case_no"] = case_no
                    row["plan_id"] = plan_id
                    row["member_id"] = member_id
                    state = "row_2"
                elif state == "row_2":
                    row["pickup_info"] = text
                    state = "row_3"
                elif state == "row_3":
                    row["pickup_time"] = (
                        text.replace("Pickup:", "").strip().replace("\n", " ")
                    )
                    state = "row_4"
                elif state == "row_4":
                    lines = text.split("\n")
                    # Explicit check instead of ``assert`` so it still runs
                    # when Python is started with -O.
                    if len(lines) not in (3, 4):
                        raise ValueError(
                            f"expected 3 or 4 lines in trip details block, got {len(lines)}"
                        )
                    row["num_of_one_way_trips"] = int(lines[0])
                    row["vehicle_type"] = lines[1]
                    row["num_of_riders"] = int(lines[2])
                    # The authorization number line is optional.
                    row["auth_number"] = lines[3] if len(lines) == 4 else None
                    state = "row_5"
                elif state == "row_5":
                    row["dest_info"] = text
                    state = "row_6"
                elif state == "row_6":
                    # NOTE(review): 700 / 719 are presumably the horizontal
                    # position of the agent column in this report layout;
                    # confirm against a sample PDF.
                    if x0 >= 700:
                        # Block starts in the agent area: no comments.
                        row["special_needs_and_comments"] = None
                        row["fidelis_agent"] = text
                        row_complete = True
                    elif x1 > 719:
                        # One block holds both the comments and, on its last
                        # line, the agent.
                        lines = text.split("\n")
                        row["special_needs_and_comments"] = "\n".join(lines[:-1])
                        row["fidelis_agent"] = lines[-1]
                        row_complete = True
                    else:
                        row["special_needs_and_comments"] = text
                        state = "row_7"
                elif state == "row_7":
                    if x0 >= 700:
                        row["fidelis_agent"] = text
                        row_complete = True
                    else:
                        # Comments continue in a second block; it is appended
                        # without a separator.
                        row["special_needs_and_comments"] += text
                        state = "row_8"
                elif state == "row_8":
                    if x0 < 700:
                        raise ValueError(
                            f"expected agent block at x0 >= 700, got x0={x0}"
                        )
                    row["fidelis_agent"] = text
                    row_complete = True

                if row_complete:
                    rows.append(row)
                    row = {}
                    state = "row_1"

    metadata = (
        f"Report Run Date Time: {report_run_date_time}\n"
        f"Transportation Provider Name: {transportation_provider_name}\n"
        f"Number of Items: {len(rows)}\n"
    )

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # NOTE(review): "itmes" looks like a typo for "items"; kept as is
        # because consumers of the JSON may already rely on this key.
        "itmes": rows,
    }

    # NOTE(review): a random number does not guarantee a unique name, so an
    # earlier output file can be overwritten.
    output_path = f"output_{random.randint(0, 1000000):08d}.json"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4)

    return metadata, output_path
|
|
|
|
|
def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The interface takes one uploaded PDF file and shows the metadata summary
    in a text box together with a button for downloading the generated JSON.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Gradio documents the flagging mode as one of the strings "auto",
        # "manual" or "never"; "never" is the documented way to turn flagging
        # off, whereas a boolean is not one of the documented values.
        allow_flagging="never",
    )
    app.launch()


if __name__ == "__main__":
    # Start the UI only when the file is executed as a script.
    main()
|
|