v1
Browse files
Signed-off-by: Ming Yang <[email protected]>
- .gitignore +1 -0
- app.py +149 -0
- requirements.txt +2 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
output*.json
|
app.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import pymupdf
|
6 |
+
|
7 |
+
|
8 |
+
def parse_pdf(file_path: str):
    """Parse a report PDF into rows and export them to a JSON file.

    The PDF is read block by block with PyMuPDF.  A small state machine
    walks the text blocks of every page: after the block containing
    "Fidelis Agent" and the block that follows it, consecutive blocks are
    interpreted as the cells of one row (``row_1`` .. ``row_8``).  A row is
    finished when its Fidelis agent cell has been read.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        A ``(metadata, output_path)`` tuple: ``metadata`` is a short
        human-readable summary string and ``output_path`` is the path of
        the JSON file written to the current working directory.

    Raises:
        ValueError: If a block does not have the layout the state machine
            expects (wrong number of lines or fields, non-numeric counts,
            or an agent block at an unexpected horizontal position).
    """
    report_run_date_time = None
    transportation_provider_name = None

    rows = []
    row = {}
    # Open the document as a context manager so it is closed even when
    # parsing raises part-way through.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text_page = page.get_textpage()

            # The state machine restarts on every page; rows are only read
            # after the block containing "Fidelis Agent" has been seen.
            state = None
            for block in text_page.extractBLOCKS():
                x0, y0, x1, y1, text = block[:5]
                text = text.strip()

                if text.startswith("Report Run Date Time"):
                    lines = text.split("\n")
                    report_run_date_time = lines[1]
                    transportation_provider_name = (
                        lines[2].replace("Transportation Provider Name:", "").strip()
                    )
                elif "Fidelis Agent" in text:
                    state = "page_no"
                elif state == "page_no":
                    # The block right after the "Fidelis Agent" block is skipped.
                    state = "row_1"
                elif state == "row_1":
                    lines = text.split("\n")
                    if len(lines) == 1:
                        type_, case_no, plan_id, member_id = lines[0].split(" ")
                    else:
                        # The type sits on its own line above the ids.
                        type_ = lines[0]
                        case_no, plan_id, member_id = lines[1].split(" ")
                    row["type"] = type_
                    row["case_no"] = case_no
                    row["plan_id"] = plan_id
                    row["member_id"] = member_id
                    state = "row_2"
                elif state == "row_2":
                    row["pickup_info"] = text
                    state = "row_3"
                elif state == "row_3":
                    text = text.replace("Pickup:", "")
                    text = text.strip()
                    text = text.replace("\n", " ")
                    row["pickup_time"] = text
                    state = "row_4"
                elif state == "row_4":
                    lines = text.split("\n")
                    # Explicit check instead of ``assert`` so the validation
                    # still runs under ``python -O``.
                    if len(lines) not in (3, 4):
                        raise ValueError(
                            f"expected 3 or 4 lines in trip block, got {len(lines)}: {text!r}"
                        )
                    row["num_of_one_way_trips"] = int(lines[0])
                    row["vehicle_type"] = lines[1]
                    row["num_of_riders"] = int(lines[2])
                    # The fourth line (authorization number) is optional.
                    row["auth_number"] = lines[3] if len(lines) == 4 else None
                    state = "row_5"
                elif state == "row_5":
                    row["dest_info"] = text
                    state = "row_6"
                elif state == "row_6":
                    if x0 >= 700:
                        # Block starts at x >= 700: treated as the agent
                        # cell, i.e. the row has no comments.
                        row["special_needs_and_comments"] = None
                        row["fidelis_agent"] = text
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    elif x1 > 719:
                        # Block extends past x = 719: comments and agent
                        # share one block, with the agent on the last line.
                        lines = text.split("\n")
                        row["special_needs_and_comments"] = "\n".join(lines[:-1])
                        row["fidelis_agent"] = lines[-1]
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    else:
                        row["special_needs_and_comments"] = text
                        state = "row_7"
                elif state == "row_7":
                    if x0 >= 700:
                        row["fidelis_agent"] = text
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    else:
                        # A second comment block is appended without a separator.
                        row["special_needs_and_comments"] += text
                        state = "row_8"
                elif state == "row_8":
                    # After two comment blocks the next block must be the agent.
                    if x0 < 700:
                        raise ValueError(
                            f"expected agent block at x0 >= 700, got x0={x0}: {text!r}"
                        )
                    row["fidelis_agent"] = text
                    rows.append(row)
                    row = {}
                    state = "row_1"

    metadata = f"""\
Report Run Date Time: {report_run_date_time}
Transportation Provider Name: {transportation_provider_name}
Number of Items: {len(rows)}
"""

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # Key was previously misspelled "itmes".
        "items": rows,
    }

    while True:
        output_path = f"output_{random.randint(0, 1000000):08d}.json"
        try:
            # Exclusive-create mode: if the random name is already taken,
            # pick another one instead of overwriting an earlier export.
            f = open(output_path, "x", encoding="utf-8")
        except FileExistsError:
            continue
        with f:
            json.dump(json_data, f, indent=4)
        break

    return metadata, output_path
|
133 |
+
|
134 |
+
|
135 |
+
def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The interface takes one uploaded PDF file and shows the metadata
    summary in a textbox together with a download button for the
    generated JSON file.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Gradio 4.x takes a string here ("never", "auto" or "manual");
        # the boolean form is deprecated.
        allow_flagging="never",
    )
    app.launch()


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
gradio==4.39.0
|
2 |
+
pymupdf==1.24.9
|