vivym committed on
Commit
97047b9
·
1 Parent(s): fd2c8c9

Signed-off-by: Ming Yang <[email protected]>

Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +149 -0
  3. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ output*.json
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
+ import gradio as gr
5
+ import pymupdf
6
+
7
+
8
def parse_pdf(file_path: str):
    """Parse a transportation report PDF and write its rows to a JSON file.

    The text blocks of every page are walked in the order returned by
    ``TextPage.extractBLOCKS()`` and fed through a small state machine. A
    block containing "Fidelis Agent" marks the table header; the block after
    it is skipped (the ``page_no`` state), and the following blocks are
    consumed as the successive cells of each table row.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        A ``(metadata, output_path)`` tuple: ``metadata`` is a short
        human-readable summary (run date/time, provider name, row count) and
        ``output_path`` is the path of the JSON file that was written.

    Raises:
        ValueError: If a block does not have the expected number of lines or
            fields, or if the block expected to hold the agent name is not
            positioned in the agent column.
    """
    # Horizontal thresholds used to recognise the trailing "Fidelis Agent"
    # cell (presumably PDF points for this specific report layout; confirm
    # against a sample report before changing).
    agent_min_x0 = 700  # a block starting at/after this x holds only the agent
    merged_min_x1 = 719  # a block reaching past this x holds comments + agent

    report_run_date_time = None
    transportation_provider_name = None

    rows = []
    row = {}

    def finish_row(fidelis_agent):
        """Store the agent on the current row, emit it and start a new one."""
        nonlocal row
        row["fidelis_agent"] = fidelis_agent
        rows.append(row)
        row = {}

    # Use the document as a context manager so the underlying file handle is
    # released even when parsing fails part-way through.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text_page = page.get_textpage()

            # The state machine restarts on every page: nothing is consumed
            # until the page's table header has been seen again.
            state = None
            for block in text_page.extractBLOCKS():
                x0, _, x1, _, text = block[:5]
                text = text.strip()

                if text.startswith("Report Run Date Time"):
                    # Report header block: line 1 is the run date/time and
                    # line 2 carries the provider name after its label.
                    lines = text.split("\n")
                    report_run_date_time = lines[1]
                    transportation_provider_name = (
                        lines[2]
                        .replace("Transportation Provider Name:", "")
                        .strip()
                    )
                elif "Fidelis Agent" in text:
                    state = "page_no"
                elif state == "page_no":
                    # Skip the single block that follows the table header.
                    state = "row_1"
                elif state == "row_1":
                    # Either one line "type case_no plan_id member_id", or
                    # the type on its own line followed by the three ids.
                    lines = text.split("\n")
                    if len(lines) == 1:
                        type_, case_no, plan_id, member_id = lines[0].split(" ")
                    else:
                        type_ = lines[0]
                        case_no, plan_id, member_id = lines[1].split(" ")
                    row["type"] = type_
                    row["case_no"] = case_no
                    row["plan_id"] = plan_id
                    row["member_id"] = member_id
                    state = "row_2"
                elif state == "row_2":
                    row["pickup_info"] = text
                    state = "row_3"
                elif state == "row_3":
                    pickup_time = text.replace("Pickup:", "").strip()
                    row["pickup_time"] = pickup_time.replace("\n", " ")
                    state = "row_4"
                elif state == "row_4":
                    lines = text.split("\n")
                    # Explicit check instead of `assert`, which is stripped
                    # when Python runs with -O.
                    if len(lines) not in (3, 4):
                        raise ValueError(
                            "Expected 3 or 4 lines in the trips/vehicle/riders "
                            f"block, got {len(lines)}: {text!r}"
                        )
                    row["num_of_one_way_trips"] = int(lines[0])
                    row["vehicle_type"] = lines[1]
                    row["num_of_riders"] = int(lines[2])
                    # The authorization number is optional (fourth line).
                    row["auth_number"] = lines[3] if len(lines) == 4 else None
                    state = "row_5"
                elif state == "row_5":
                    row["dest_info"] = text
                    state = "row_6"
                elif state == "row_6":
                    if x0 >= agent_min_x0:
                        # No comments cell: this block is the agent itself.
                        row["special_needs_and_comments"] = None
                        finish_row(text)
                        state = "row_1"
                    elif x1 > merged_min_x1:
                        # Comments and agent were extracted as one block;
                        # the agent is its last line.
                        lines = text.split("\n")
                        row["special_needs_and_comments"] = "\n".join(lines[:-1])
                        finish_row(lines[-1])
                        state = "row_1"
                    else:
                        row["special_needs_and_comments"] = text
                        state = "row_7"
                elif state == "row_7":
                    if x0 >= agent_min_x0:
                        finish_row(text)
                        state = "row_1"
                    else:
                        # Second comments block: appended without a
                        # separator, as before.
                        row["special_needs_and_comments"] += text
                        state = "row_8"
                elif state == "row_8":
                    if x0 < agent_min_x0:
                        raise ValueError(
                            "Expected the Fidelis Agent block at "
                            f"x0 >= {agent_min_x0}, got x0={x0}: {text!r}"
                        )
                    finish_row(text)
                    state = "row_1"

    metadata = (
        f"Report Run Date Time: {report_run_date_time}\n"
        f"Transportation Provider Name: {transportation_provider_name}\n"
        f"Number of Items: {len(rows)}\n"
    )

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # NOTE(review): "itmes" looks like a typo for "items". It is kept
        # because it is part of the emitted JSON schema; rename it only
        # together with every consumer of these files.
        "itmes": rows,
    }

    # Create the file exclusively ("x") and retry on a name collision so two
    # concurrent requests can never overwrite each other's output.
    while True:
        output_path = f"output_{random.randint(0, 1000000):08d}.json"
        try:
            with open(output_path, "x", encoding="utf-8") as f:
                json.dump(json_data, f, indent=4)
        except FileExistsError:
            continue
        break

    return metadata, output_path
133
+
134
+
135
def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The UI takes one uploaded file and shows the metadata summary in a
    textbox, with a button to download the generated JSON file.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Gradio 4 expects a string here; the boolean False is deprecated and
        # is only coerced to "never" after emitting a warning.
        allow_flagging="never",
    )
    app.launch()
146
+
147
+
148
# Start the app only when this file is executed as a script, so importing
# the module does not launch a server.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio==4.39.0
2
+ pymupdf==1.24.9