# PDFParser / app.py
# vivym's picture
# v1
# 97047b9
import json
import random
import tempfile

import gradio as gr
import pymupdf
# Horizontal thresholds that separate the trailing columns of a row.
# NOTE(review): presumably PDF points tuned to one specific report layout -- confirm.
_AGENT_MIN_X0 = 700  # a block starting at or right of this x is taken as the agent cell
_MERGED_MIN_X1 = 719  # a block reaching past this x is taken as comments + agent in one block


def _parse_identifiers(text: str) -> dict:
    """Split the first block of a row into type, case number, plan ID and member ID.

    The block is either a single line holding four space-separated fields, or
    a first line holding the type followed by a line with the remaining three.

    Raises:
        ValueError: if the number of space-separated fields does not match.
    """
    lines = text.split("\n")
    if len(lines) == 1:
        type_, case_no, plan_id, member_id = lines[0].split(" ")
    else:
        type_ = lines[0]
        case_no, plan_id, member_id = lines[1].split(" ")
    return {
        "type": type_,
        "case_no": case_no,
        "plan_id": plan_id,
        "member_id": member_id,
    }


def _parse_trip_details(text: str) -> dict:
    """Parse the trips / vehicle / riders / optional authorization-number block.

    Raises:
        ValueError: if the block does not have 3 or 4 lines, or a count is not
            an integer.
    """
    lines = text.split("\n")
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if len(lines) not in (3, 4):
        raise ValueError(
            f"expected 3 or 4 lines in the trip-details block, got {len(lines)}: {text!r}"
        )
    return {
        "num_of_one_way_trips": int(lines[0]),
        "vehicle_type": lines[1],
        "num_of_riders": int(lines[2]),
        "auth_number": lines[3] if len(lines) == 4 else None,
    }


def parse_pdf(file_path: str):
    """Extract the report header and trip rows from a PDF and dump them to JSON.

    Every page is read as a sequence of text blocks and fed through a small
    state machine. On each page, parsing starts after a block containing
    "Fidelis Agent"; the block that follows it is skipped, and from then on
    consecutive blocks are consumed as the cells of one row after another.
    The last one to three blocks of a row carry the optional special
    needs/comments and the agent name, and are told apart by their horizontal
    position.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        A ``(metadata, output_path)`` tuple: a human-readable summary string
        and the path of the JSON file that was written.

    Raises:
        ValueError: if a block does not have the expected shape.
    """
    report_run_date_time = None
    transportation_provider_name = None
    rows = []
    row = {}

    # The context manager closes the document even when parsing fails.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text_page = page.get_textpage()
            state = None  # parsing restarts on every page
            for block in text_page.extractBLOCKS():
                # Only the horizontal extent and the text of a block are used.
                x0, _, x1, _, text = block[:5]
                text = text.strip()
                row_complete = False

                if text.startswith("Report Run Date Time"):
                    lines = text.split("\n")
                    if len(lines) < 3:
                        raise ValueError(
                            f"expected at least 3 lines in the report header block: {text!r}"
                        )
                    report_run_date_time = lines[1]
                    transportation_provider_name = (
                        lines[2].replace("Transportation Provider Name:", "").strip()
                    )
                elif "Fidelis Agent" in text:
                    state = "page_no"
                elif state == "page_no":
                    # The block right after the "Fidelis Agent" block is skipped.
                    state = "row_1"
                elif state == "row_1":
                    row.update(_parse_identifiers(text))
                    state = "row_2"
                elif state == "row_2":
                    row["pickup_info"] = text
                    state = "row_3"
                elif state == "row_3":
                    row["pickup_time"] = (
                        text.replace("Pickup:", "").strip().replace("\n", " ")
                    )
                    state = "row_4"
                elif state == "row_4":
                    row.update(_parse_trip_details(text))
                    state = "row_5"
                elif state == "row_5":
                    row["dest_info"] = text
                    state = "row_6"
                elif state == "row_6":
                    if x0 >= _AGENT_MIN_X0:
                        # No comments cell: this block is already the agent.
                        row["special_needs_and_comments"] = None
                        row["fidelis_agent"] = text
                        row_complete = True
                    elif x1 > _MERGED_MIN_X1:
                        # Comments and agent arrived as one block; the agent
                        # is its last line.
                        *comment_lines, agent = text.split("\n")
                        row["special_needs_and_comments"] = "\n".join(comment_lines)
                        row["fidelis_agent"] = agent
                        row_complete = True
                    else:
                        row["special_needs_and_comments"] = text
                        state = "row_7"
                elif state == "row_7":
                    if x0 >= _AGENT_MIN_X0:
                        row["fidelis_agent"] = text
                        row_complete = True
                    else:
                        # A second comments block is appended as-is (no
                        # separator) to the first one.
                        row["special_needs_and_comments"] += text
                        state = "row_8"
                elif state == "row_8":
                    if x0 < _AGENT_MIN_X0:
                        raise ValueError(
                            f"expected the agent block at x0 >= {_AGENT_MIN_X0}, "
                            f"got x0={x0}: {text!r}"
                        )
                    row["fidelis_agent"] = text
                    row_complete = True

                if row_complete:
                    rows.append(row)
                    row = {}
                    state = "row_1"

    metadata = (
        f"Report Run Date Time: {report_run_date_time}\n"
        f"Transportation Provider Name: {transportation_provider_name}\n"
        f"Number of Items: {len(rows)}\n"
    )

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # NOTE(review): "itmes" looks like a typo of "items"; kept unchanged
        # because consumers of the JSON may already depend on this key.
        "itmes": rows,
    }

    # A uniquely named temporary file avoids the name collisions (and silent
    # overwrites) that a random number in the working directory can produce.
    with tempfile.NamedTemporaryFile(
        "w", prefix="output_", suffix=".json", delete=False, encoding="utf-8"
    ) as f:
        json.dump(json_data, f, indent=4)
        output_path = f.name

    return metadata, output_path
def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The interface takes one uploaded file and shows the metadata summary in a
    text box together with a download button for the generated JSON file.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Gradio documents the string values "never" / "auto" / "manual" for
        # this option; "never" is the documented way to disable flagging.
        allow_flagging="never",
    )
    app.launch()


# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()