#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
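"""Chunker for manuals: runs OCR and layout analysis over a PDF, groups the
text under inferred section titles, and merges sections into token-bounded
chunks for indexing."""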

import re

from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize_table, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
from rag.utils import num_tokens_from_string


class Pdf(PdfParser):
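    """PDF parser for manuals; tags itself as ParserType.MANUAL for the
    PdfParser base class."""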
    def __init__(self):
        self.model_speciess = ParserType.MANUAL.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        from timeit import default_timer as timer
        start = timer()
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished.")
        print("OCR:", timer() - start)

        self._layouts_rec(zoomin)
        callback(0.65, "Layout analysis finished.")
        print("layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.67, "Table analysis finished.")
        self._text_merge()
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()
        self._filter_forpages()
        callback(0.68, "Text merging finished.")

        # clean mess
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())

        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
                for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """Only PDF is supported."""
    pdf_parser = None

    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognize = kwargs.get("parser_config", {}).get("layout_recognize", True)
        pdf_parser = Pdf() if layout_recognize else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
        if sections and len(sections[0]) < 3:
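            # Pad 2-tuples (e.g. from PlainParser) with a dummy position so
            # downstream code can rely on (text, layout, positions) triples.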
            sections = [(t, l, [[0] * 5]) for t, l in sections]

    else:
        raise NotImplementedError("file type not supported yet (only PDF is supported)")
    doc = {
        "docnm_kwd": filename
    }
    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    # is it English
    eng = lang.lower() == "english"  # pdf_parser.is_english

    # Set the pivot level from the title hierarchy,
    # then merge the sections that fall between two pivots.
    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
        max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
        most_level = max(0, max_lvl - 1)
        levels = []
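        # Match each section against the outline by character-bigram overlap
        # (shared bigrams over the larger set); > 0.8 counts as a match.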
        for txt, _, _ in sections:
            for t, lvl in pdf_parser.outlines:
                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
                tks_ = set([txt[i] + txt[i + 1]
                            for i in range(min(len(t), len(txt) - 1))])
                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
                    levels.append(lvl)
                    break
            else:
                levels.append(max_lvl + 1)

    else:
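        # No usable outline: infer title levels from bullet/numbering patterns.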
        bull = bullets_category([txt for txt, _, _ in sections])
        most_level, levels = title_frequency(
            bull, [(txt, l) for txt, l, poss in sections])

    assert len(sections) == len(levels)
    sec_ids = []
    sid = 0
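    # A new section starts whenever the title level changes to a value at or
    # above the pivot (lvl <= most_level).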
    for i, lvl in enumerate(levels):
        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
            sid += 1
        sec_ids.append(sid)

    sections = [(txt, sec_ids[i], poss)
                for i, (txt, _, poss) in enumerate(sections)]
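    # Tables are appended with sec_id -1 so they can merge into any chunk;
    # page numbers are rebased relative to from_page.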
    for (img, rows), poss in tbls:
        if not rows: continue
        sections.append((rows if isinstance(rows, str) else rows[0], -1,
                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

    def tag(pn, left, right, top, bottom):
        if pn + left + right + top + bottom == 0:
            return ""
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format(pn, left, right, top, bottom)
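    # e.g. tag(3, 10.0, 200.0, 50.0, 80.0) -> "@@3\t10.0\t200.0\t50.0\t80.0##"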

    chunks = []
    last_sid = -2
    tk_cnt = 0
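    # Pieces are ordered by (page, top, left) of their first position.
    # Merge policy: always absorb tiny chunks (< 32 tokens); otherwise keep
    # appending up to 1024 tokens while the section id is unchanged or the
    # piece is a table (sec_id == -1).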
    for txt, sec_id, poss in sorted(sections, key=lambda x: (
            x[-1][0][0], x[-1][0][3], x[-1][0][1])):
        poss = "\t".join([tag(*pos) for pos in poss])
        if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
            if chunks:
                chunks[-1] += "\n" + txt + poss
                tk_cnt += num_tokens_from_string(txt)
                continue
        chunks.append(txt + poss)
        tk_cnt = num_tokens_from_string(txt)
        if sec_id > -1:
            last_sid = sec_id

    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res


if __name__ == "__main__":
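    # Smoke test: pass a PDF path as the first argument.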
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)