File size: 5,118 Bytes
97047b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import json
import random
import tempfile

import gradio as gr
import pymupdf


def parse_pdf(file_path: str):
    """Extract trip rows from a report PDF and write them to a JSON file.

    Every page is read as a sequence of text blocks (in the order
    ``TextPage.extractBLOCKS()`` yields them) and fed through a small state
    machine.  A block containing "Fidelis Agent" arms the machine, the block
    after it is skipped, and from then on consecutive blocks are interpreted
    as the successive fields of one row until the agent block closes the row.

    Args:
        file_path: Path of the PDF to parse, passed straight to
            ``pymupdf.open``.

    Returns:
        A ``(metadata, output_path)`` tuple: a short human-readable summary
        and the path of the JSON file that holds the parsed rows.

    Raises:
        ValueError: If a block does not have the layout the state machine
            expects (wrong number of lines or fields, non-numeric counts, or
            a trailing block that is not in the agent column).
    """
    # Horizontal thresholds used to tell the "agent" block apart from the
    # comments block.  NOTE(review): presumably PDF points tuned to this
    # report's column layout -- confirm against a sample report.
    agent_only_min_x0 = 700
    merged_block_min_x1 = 719

    report_run_date_time = None
    transportation_provider_name = None

    rows = []
    row = {}
    # The context manager guarantees the document is closed even when a
    # malformed block raises part-way through.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text_page = page.get_textpage()

            # Parsing restarts on every page: nothing is consumed until the
            # page's own "Fidelis Agent" block has been seen.
            state = None
            for block in text_page.extractBLOCKS():
                # Only the horizontal extent and the text are needed.
                x0, _, x1, _, text = block[:5]
                text = text.strip()

                if text.startswith("Report Run Date Time"):
                    # Assumes the timestamp and provider name are the 2nd and
                    # 3rd lines of this block -- TODO confirm for all reports.
                    lines = text.split("\n")
                    report_run_date_time = lines[1]
                    transportation_provider_name = lines[2].replace("Transportation Provider Name:", "").strip()
                elif "Fidelis Agent" in text:
                    state = "page_no"
                elif state == "page_no":
                    # The block right after the "Fidelis Agent" block is skipped.
                    state = "row_1"
                elif state == "row_1":
                    lines = text.split("\n")
                    if len(lines) == 1:
                        # All four fields on a single line.
                        type_, case_no, plan_id, member_id = lines[0].split(" ")
                    else:
                        # Type on its own line, remaining fields on the next.
                        type_ = lines[0]
                        case_no, plan_id, member_id = lines[1].split(" ")
                    row["type"] = type_
                    row["case_no"] = case_no
                    row["plan_id"] = plan_id
                    row["member_id"] = member_id
                    state = "row_2"
                elif state == "row_2":
                    row["pickup_info"] = text
                    state = "row_3"
                elif state == "row_3":
                    pickup_time = text.replace("Pickup:", "").strip().replace("\n", " ")
                    row["pickup_time"] = pickup_time
                    state = "row_4"
                elif state == "row_4":
                    lines = text.split("\n")
                    # Explicit check instead of ``assert`` so validation
                    # still runs when Python is started with -O.
                    if len(lines) not in (3, 4):
                        raise ValueError(
                            f"Expected 3 or 4 lines in trip block, got {len(lines)}: {text!r}"
                        )
                    row["num_of_one_way_trips"] = int(lines[0])
                    row["vehicle_type"] = lines[1]
                    row["num_of_riders"] = int(lines[2])
                    # The fourth line is optional.
                    row["auth_number"] = lines[3] if len(lines) == 4 else None
                    state = "row_5"
                elif state == "row_5":
                    row["dest_info"] = text
                    state = "row_6"
                elif state == "row_6":
                    if x0 >= agent_only_min_x0:
                        # Block sits in the agent column: no comments at all.
                        row["special_needs_and_comments"] = None
                        row["fidelis_agent"] = text
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    elif x1 > merged_block_min_x1:
                        # Block reaches into the agent column: its last line
                        # is taken as the agent, the lines before it as comments.
                        lines = text.split("\n")
                        row["special_needs_and_comments"] = "\n".join(lines[:-1])
                        row["fidelis_agent"] = lines[-1]
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    else:
                        # Comments only; the agent (or more comments) follows.
                        row["special_needs_and_comments"] = text
                        state = "row_7"
                elif state == "row_7":
                    if x0 >= agent_only_min_x0:
                        row["fidelis_agent"] = text
                        rows.append(row)
                        row = {}
                        state = "row_1"
                    else:
                        # Second comments block: appended without a separator.
                        row["special_needs_and_comments"] += text
                        state = "row_8"
                elif state == "row_8":
                    # After two comment blocks the next one must be the agent.
                    if x0 < agent_only_min_x0:
                        raise ValueError(
                            f"Expected agent block at x0 >= {agent_only_min_x0}, got x0={x0}: {text!r}"
                        )
                    row["fidelis_agent"] = text
                    rows.append(row)
                    row = {}
                    state = "row_1"

    metadata = f"""\
Report Run Date Time: {report_run_date_time}
Transportation Provider Name: {transportation_provider_name}
Number of Items: {len(rows)}
    """

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # NOTE(review): "itmes" looks like a typo for "items"; kept unchanged
        # so existing consumers of the JSON keep working.
        "itmes": rows,
    }

    # A uniquely named temp file avoids the collisions a random number in the
    # working directory can produce when several requests run concurrently.
    with tempfile.NamedTemporaryFile(
        "w", prefix="output_", suffix=".json", delete=False, encoding="utf-8"
    ) as f:
        json.dump(json_data, f, indent=4)
        output_path = f.name

    return metadata, output_path


def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it.

    The interface takes one uploaded file and shows the two values
    ``parse_pdf`` returns: the metadata summary in a textbox and the JSON
    path behind a download button.
    """
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # Gradio documents the string values "never"/"auto"/"manual" for this
        # option; "never" is the string form of the former boolean False.
        allow_flagging="never",
    )
    app.launch()


# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()