KevinHuSh committed
Commit 072f9dd · Parent: e32ef75

Add app to rag module: presentation & laws (#43)

api/utils/file_utils.py CHANGED
@@ -150,4 +150,4 @@ def filename_type(filename):
         return FileType.AURAL.value

    if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
-        return FileType.VISUAL
+        return FileType.VISUAL.value
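For context on this one-line fix: the neighboring branch already returns `FileType.AURAL.value`, so the `VISUAL` branch now returns the enum's value as well, keeping `filename_type` returning plain strings. A minimal sketch of why that matters, assuming `FileType` is a standard `enum.Enum` (the concrete member values below are illustrative, not from the repo):

```python
from enum import Enum

class FileType(Enum):
    VISUAL = "visual"   # illustrative value
    AURAL = "aural"     # illustrative value

assert FileType.VISUAL.value == "visual"  # plain string: serializes cleanly
assert FileType.VISUAL != "visual"        # the bare enum member compares unequal
```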
rag/app/__init__.py ADDED
@@ -0,0 +1,48 @@
+import re
+
+
+def callback__(progress, msg, func):
+    if not func: return
+    func(progress, msg)
+
+
+BULLET_PATTERN = [[
+    r"第[零一二三四五六七八九十百]+编",
+    r"第[零一二三四五六七八九十百]+章",
+    r"第[零一二三四五六七八九十百]+节",
+    r"第[零一二三四五六七八九十百]+条",
+    r"[\((][零一二三四五六七八九十百]+[\))]",
+], [
+    r"[0-9]{,3}[\. 、]",
+    r"[0-9]{,2}\.[0-9]{,2}",
+    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
+    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
+], [
+    r"[零一二三四五六七八九十百]+[ 、]",
+    r"[\((][零一二三四五六七八九十百]+[\))]",
+    r"[\((][0-9]{,2}[\))]",
+], [
+    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
+    r"Chapter (I+V?|VI*|XI|IX|X)",
+    r"Section [0-9]+",
+    r"Article [0-9]+"
+]
+]
+
+
+def bullets_category(sections):
+    global BULLET_PATTERN
+    hits = [0] * len(BULLET_PATTERN)
+    for i, pro in enumerate(BULLET_PATTERN):
+        for sec in sections:
+            for p in pro:
+                if re.match(p, sec):
+                    hits[i] += 1
+                    break
+    maximum = 0
+    res = -1
+    for i, h in enumerate(hits):
+        if h <= maximum: continue
+        res = i
+        maximum = h
+    return res
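Taken together, `callback__` is a null-safe progress hook and `bullets_category` votes across the pattern groups: each group in `BULLET_PATTERN` scores one point per section that matches any of its regexes, and the index of the highest-scoring group is returned (`-1` when nothing matches). A quick usage sketch; the sample headings are invented for illustration:

```python
from rag.app import bullets_category, BULLET_PATTERN

sections = ["第一章 总则", "第一条 为了保护民事主体的合法权益", "第二条 本法调整平等主体之间的关系"]
idx = bullets_category(sections)    # here the Chinese statute group wins
print(idx, BULLET_PATTERN[idx][1])  # its chapter-level pattern
```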
rag/app/laws.py ADDED
@@ -0,0 +1,192 @@
+import copy
+import re
+from io import BytesIO
+from docx import Document
+import numpy as np
+from rag.app import callback__, bullets_category, BULLET_PATTERN
+from rag.nlp import huqie
+from rag.parser.pdf_parser import HuParser
+
+
+class Docx(object):
+    def __init__(self):
+        pass
+
+    def __clean(self, line):
+        line = re.sub(r"\u3000", " ", line).strip()
+        return line
+
+    def __call__(self, filename, binary=None):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        lines = [self.__clean(p.text) for p in self.doc.paragraphs]
+        return [l for l in lines if l]
+
+
+class Pdf(HuParser):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
+                   "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_paddle(zoomin)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
+                   "Page {}~{}: Layout analysis finished".format(from_page, min(to_page, self.total_page)), callback)
+        print("paddle layouts:", timer() - start)
+        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
+        # is it English?
+        eng = 0
+        for b in bxs:
+            if re.match(r"[a-zA-Z]", b["text"].strip()):
+                eng += 1
+        if eng / len(bxs) > 0.8:
+            eng = True
+        else:
+            eng = False
+        # Merge vertically
+        i = 0
+        while i + 1 < len(bxs):
+            b = bxs[i]
+            b_ = bxs[i + 1]
+            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
+                bxs.pop(i)
+                continue
+            concatting_feats = [
+                b["text"].strip()[-1] in ",;:'\",、‘“;:",
+                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
+                b_["text"].strip()[0] in "。;?!?”)),,、:",
+            ]
+            # features for not concatenating
+            feats = [
+                b.get("layoutno", 0) != b_.get("layoutno", 0),
+                b["text"].strip()[-1] in "。?!?",
+                eng and b["text"].strip()[-1] in ".!?",
+                b["page_number"] == b_["page_number"] and b_["top"] - \
+                    b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
+                b["page_number"] < b_["page_number"] and abs(
+                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
+            ]
+            if any(feats) and not any(concatting_feats):
+                i += 1
+                continue
+            # merge up and down
+            b["bottom"] = b_["bottom"]
+            b["text"] += b_["text"]
+            b["x0"] = min(b["x0"], b_["x0"])
+            b["x1"] = max(b["x1"], b_["x1"])
+            bxs.pop(i + 1)
+
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page / 2,
+                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        return [b["text"] + self._line_tag(b, zoomin) for b in bxs]
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    pdf_parser = None
+    sections = []
+    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+        for txt in Docx()(filename, binary):
+            sections.append(txt)
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        for txt in pdf_parser(filename if not binary else binary,
+                              from_page=from_page, to_page=to_page, callback=callback):
+            sections.append(txt)
+    if re.search(r"\.txt$", filename, re.IGNORECASE):
+        txt = ""
+        if binary: txt = binary.decode("utf-8")
+        else:
+            with open(filename, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l: break
+                    txt += l
+        sections = txt.split("\n")
+        sections = [l for l in sections if l]
+
+    # is it English?
+    eng = 0
+    for sec in sections:
+        if re.match(r"[a-zA-Z]", sec.strip()):
+            eng += 1
+    if eng / len(sections) > 0.8:
+        eng = True
+    else:
+        eng = False
+    # Remove 'Contents' part
+    i = 0
+    while i < len(sections):
+        if not re.match(r"(Contents|目录|目次)$", re.sub(r"( | |\u3000)+", "", sections[i].split("@@")[0])):
+            i += 1
+            continue
+        sections.pop(i)
+        if i >= len(sections): break
+        prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
+        while not prefix:
+            sections.pop(i)
+            if i >= len(sections): break
+            prefix = sections[i].strip()[:3] if not eng else " ".join(sections[i].strip().split(" ")[:2])
+        sections.pop(i)
+        if i >= len(sections) or not prefix: break
+        for j in range(i, min(i + 128, len(sections))):
+            if not re.match(prefix, sections[j]):
+                continue
+            for k in range(i, j): sections.pop(i)
+            break
+
+    bull = bullets_category(sections)
+    projs = [len(BULLET_PATTERN[bull])] * len(sections)
+    for i, sec in enumerate(sections):
+        for j, p in enumerate(BULLET_PATTERN[bull]):
+            if re.match(p, sec.strip()):
+                projs[i] = j
+                break
+    readed = [0] * len(sections)
+    cks = []
+    for pr in range(len(BULLET_PATTERN[bull]) - 1, 1, -1):
+        for i in range(len(sections)):
+            if readed[i] or projs[i] < pr:
+                continue
+            # climb to the parent, grandparent, ... headings
+            p = projs[i]
+            readed[i] = 1
+            ck = [sections[i]]
+            for j in range(i - 1, -1, -1):
+                if projs[j] >= p: continue
+                ck.append(sections[j])
+                readed[j] = 1
+                p = projs[j]
+                if p == 0: break
+            cks.append(ck[::-1])
+
+    res = []
+    # wrap up as Elasticsearch documents
+    for ck in cks:
+        print("\n-".join(ck))
+        ck = "\n".join(ck)
+        d = copy.deepcopy(doc)
+        if pdf_parser:
+            d["image"] = pdf_parser.crop(ck)
+            ck = pdf_parser.remove_tag(ck)
+        d["content_ltks"] = huqie.qie(ck)
+        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+        res.append(d)
+    return res
+
+
+if __name__ == "__main__":
+    import sys
+    chunk(sys.argv[1])
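`laws.chunk` routes on the extension (docx, pdf, txt), drops a detected table of contents, classifies the numbering style via `bullets_category`, and then emits each deep section together with its ancestor headings as one chunk. A hedged driver sketch; the file path is hypothetical, and the PDF branch additionally needs the OCR and layout models that `HuParser` loads:

```python
from rag.app import laws

def progress(prog, msg):
    # receives the (progress, msg) pairs forwarded by callback__
    print("{:6.1f}%  {}".format(prog * 100, msg))

docs = laws.chunk("civil_code.pdf", from_page=0, to_page=8, callback=progress)
for d in docs[:3]:
    print(d["docnm_kwd"], "->", d["content_ltks"][:60])
```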
rag/app/presentation.py ADDED
@@ -0,0 +1,127 @@
+import copy
+import re
+from io import BytesIO
+from pptx import Presentation
+
+from rag.app import callback__
+from rag.nlp import huqie
+from rag.parser.pdf_parser import HuParser
+
+
+class Ppt(object):
+    def __init__(self):
+        super().__init__()
+
+    def __extract(self, shape):
+        if shape.shape_type == 19:
+            tb = shape.table
+            rows = []
+            for i in range(1, len(tb.rows)):
+                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+            return "\n".join(rows)
+
+        if shape.has_text_frame:
+            return shape.text_frame.text
+
+        if shape.shape_type == 6:
+            texts = []
+            for p in shape.shapes:
+                t = self.__extract(p)
+                if t: texts.append(t)
+            return "\n".join(texts)
+
+    def __call__(self, fnm, from_page, to_page, callback=None):
+        ppt = Presentation(fnm) if isinstance(
+            fnm, str) else Presentation(
+            BytesIO(fnm))
+        txts = []
+        self.total_page = len(ppt.slides)
+        for i, slide in enumerate(ppt.slides[from_page: to_page]):
+            texts = []
+            for shape in slide.shapes:
+                txt = self.__extract(shape)
+                if txt: texts.append(txt)
+            txts.append("\n".join(texts))
+            callback__((i + 1) / self.total_page / 2, "", callback)
+
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
+                   "Page {}~{}: Text extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+        import aspose.slides as slides
+        import aspose.pydrawing as drawing
+        imgs = []
+        with slides.Presentation(BytesIO(fnm)) as presentation:
+            for i, slide in enumerate(presentation.slides[from_page: to_page]):
+                buffered = BytesIO()
+                slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
+                imgs.append(buffered.getvalue())
+        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page,
+                   "Page {}~{}: Image extraction finished".format(from_page, min(to_page, self.total_page)), callback)
+
+        return [(txts[i], imgs[i]) for i in range(len(txts))]
+
+
+class Pdf(HuParser):
+    def __init__(self):
+        super().__init__()
+
+    def __garbage(self, txt):
+        txt = txt.lower().strip()
+        if re.match(r"[0-9\.,%/-]+$", txt): return True
+        if len(txt) < 3: return True
+        return False
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
+        callback__((min(to_page, self.total_page) - from_page) / self.total_page, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)), callback)
+        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
+        res = []
+        #################### More precisely ###################
+        # self._layouts_paddle(zoomin)
+        # self._text_merge()
+        # pages = {}
+        # for b in self.boxes:
+        #     if self.__garbage(b["text"]): continue
+        #     if b["page_number"] not in pages: pages[b["page_number"]] = []
+        #     pages[b["page_number"]].append(b["text"])
+        # for i, lines in pages.items():
+        #     res.append(("\n".join(lines), self.page_images[i - 1]))
+        # return res
+        #######################################################
+
+        for i in range(len(self.boxes)):
+            lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
+            res.append((lines, self.page_images[i]))
+        return res
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    res = []
+    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
+        for txt, img in Ppt()(filename if not binary else binary, from_page, to_page, callback):
+            d = copy.deepcopy(doc)
+            d["content_ltks"] = huqie.qie(txt)
+            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+            d["image"] = img
+            res.append(d)
+        return res
+    if re.search(r"\.pdf$", filename, re.IGNORECASE):
+        for txt, img in Pdf()(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback):
+            d = copy.deepcopy(doc)
+            d["content_ltks"] = huqie.qie(txt)
+            d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+            d["image"] = img
+            res.append(d)
+        return res
+    callback__(-1, "This kind of presentation document is not supported yet!", callback)
+
+
+if __name__ == "__main__":
+    import sys
+    print(chunk(sys.argv[1]))
+
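`presentation.chunk` emits one document per slide (or PDF page), pairing the extracted text with a JPEG thumbnail in `d["image"]`. Note that `Ppt.__call__` feeds `fnm` straight into `BytesIO` for the aspose thumbnail pass, so passing raw bytes via `binary` is the safe way to call it. A sketch with a hypothetical deck:

```python
from rag.app import presentation

# "deck.pptx" is hypothetical; python-pptx and aspose.slides must be installed.
with open("deck.pptx", "rb") as f:
    docs = presentation.chunk("deck.pptx", binary=f.read())

for d in docs:
    print(len(d["image"]), "bytes of thumbnail;", d["content_ltks"][:50])
```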
rag/nlp/huchunk.py CHANGED
@@ -352,11 +352,6 @@ class ExcelChunker(HuChunker):


 class PptChunker(HuChunker):

-    @dataclass
-    class Fields:
-        text_chunks: List = None
-        table_chunks: List = None
-
     def __init__(self):
         super().__init__()

rag/parser/pdf_parser.py CHANGED
@@ -370,7 +370,7 @@ class HuParser:
         res.append(lts)
         return res

-    def __table_transformer_job(self, ZM):
+    def _table_transformer_job(self, ZM):
         logging.info("Table processing...")
         imgs, pos = [], []
         tbcnt = [0]
@@ -416,6 +416,50 @@ class HuParser:
             pg.append(it)
         self.tb_cpns.extend(pg)

+        def gather(kwd, fzy=10, ption=0.6):
+            eles = self.sort_Y_firstly(
+                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
+            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
+            return self.sort_Y_firstly(eles, 0)
+
+        # add R, H, C, SP tags to boxes within a table layout
+        headers = gather(r".*header$")
+        rows = gather(r".* (row|header)")
+        spans = gather(r".*spanning")
+        clmns = sorted([r for r in self.tb_cpns if re.match(
+            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
+        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
+        for b in self.boxes:
+            if b.get("layout_type", "") != "table":
+                continue
+            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
+            if ii is not None:
+                b["R"] = ii
+                b["R_top"] = rows[ii]["top"]
+                b["R_bott"] = rows[ii]["bottom"]
+
+            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
+            if ii is not None:
+                b["H_top"] = headers[ii]["top"]
+                b["H_bott"] = headers[ii]["bottom"]
+                b["H_left"] = headers[ii]["x0"]
+                b["H_right"] = headers[ii]["x1"]
+                b["H"] = ii
+
+            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
+            if ii is not None:
+                b["C"] = ii
+                b["C_left"] = clmns[ii]["x0"]
+                b["C_right"] = clmns[ii]["x1"]
+
+            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
+            if ii is not None:
+                b["H_top"] = spans[ii]["top"]
+                b["H_bott"] = spans[ii]["bottom"]
+                b["H_left"] = spans[ii]["x0"]
+                b["H_right"] = spans[ii]["x1"]
+                b["SP"] = ii
+
     def __ocr_paddle(self, pagenum, img, chars, ZM=3):
         bxs = self.ocr.ocr(np.array(img), cls=True)[0]
         if not bxs:
@@ -453,7 +497,7 @@ class HuParser:

         self.boxes.append(bxs)

-    def __layouts_paddle(self, ZM):
+    def _layouts_paddle(self, ZM):
         assert len(self.page_images) == len(self.boxes)
         # Tag layout type
         boxes = []
@@ -524,7 +568,24 @@ class HuParser:

         self.boxes = boxes

-    def __text_merge(self, garbage):
+        garbage = set()
+        for k in self.garbages.keys():
+            self.garbages[k] = Counter(self.garbages[k])
+            for g, c in self.garbages[k].items():
+                if c > 1:
+                    garbage.add(g)
+
+        logging.debug("GARBAGE:" + ",".join(garbage))
+        self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage]
+
+        # cumulative Y
+        for i in range(len(self.boxes)):
+            self.boxes[i]["top"] += \
+                self.page_cum_height[self.boxes[i]["page_number"] - 1]
+            self.boxes[i]["bottom"] += \
+                self.page_cum_height[self.boxes[i]["page_number"] - 1]
+
+    def _text_merge(self):
         # merge adjacent boxes
         bxs = self.boxes

@@ -537,6 +598,7 @@ class HuParser:
             tt = b.get("text", "").strip()
             return tt and any([tt.find(t.strip()) == 0 for t in txts])

+        # horizontally merge adjacent boxes with the same layout
         i = 0
         while i < len(bxs) - 1:
             b = bxs[i]
@@ -567,7 +629,8 @@ class HuParser:
             i += 1
         self.boxes = bxs

-        # count boxes in the same row
+    def _concat_downward(self):
+        # count boxes in the same row as a feature
         for i in range(len(self.boxes)):
             mh = self.mean_height[self.boxes[i]["page_number"] - 1]
             self.boxes[i]["in_row"] = 0
@@ -583,49 +646,6 @@ class HuParser:
                     break
                 j += 1

-        def gather(kwd, fzy=10, ption=0.6):
-            eles = self.sort_Y_firstly(
-                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
-            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
-            return self.sort_Y_firstly(eles, 0)
-
-        headers = gather(r".*header$")
-        rows = gather(r".* (row|header)")
-        spans = gather(r".*spanning")
-        clmns = sorted([r for r in self.tb_cpns if re.match(
-            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
-        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
-        for b in self.boxes:
-            if b.get("layout_type", "") != "table":
-                continue
-            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
-            if ii is not None:
-                b["R"] = ii
-                b["R_top"] = rows[ii]["top"]
-                b["R_bott"] = rows[ii]["bottom"]
-
-            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
-            if ii is not None:
-                b["H_top"] = headers[ii]["top"]
-                b["H_bott"] = headers[ii]["bottom"]
-                b["H_left"] = headers[ii]["x0"]
-                b["H_right"] = headers[ii]["x1"]
-                b["H"] = ii
-
-            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
-            if ii is not None:
-                b["C"] = ii
-                b["C_left"] = clmns[ii]["x0"]
-                b["C_right"] = clmns[ii]["x1"]
-
-            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
-            if ii is not None:
-                b["H_top"] = spans[ii]["top"]
-                b["H_bott"] = spans[ii]["bottom"]
-                b["H_left"] = spans[ii]["x0"]
-                b["H_right"] = spans[ii]["x1"]
-                b["SP"] = ii
-
         # concat between rows
         boxes = deepcopy(self.boxes)
         blocks = []
@@ -633,8 +653,6 @@ class HuParser:
         chunks = []

         def dfs(up, dp):
-            if not up["text"].strip() or up["text"].strip() in garbage:
-                return
             chunks.append(up)
             i = dp
             while i < min(dp + 12, len(boxes)):
@@ -658,8 +676,7 @@ class HuParser:
                     i += 1
                     continue

-                if not down["text"].strip() \
-                        or down["text"].strip() in garbage:
+                if not down["text"].strip():
                     i += 1
                     continue
@@ -1444,18 +1461,19 @@ class HuParser:
                 return j
         return

+    def _line_tag(self, bx, ZM):
+        pn = [bx["page_number"]]
+        top = bx["top"] - self.page_cum_height[pn[0] - 1]
+        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
+        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
+            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
+            pn.append(pn[-1] + 1)
+
+        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+            .format("-".join([str(p) for p in pn]),
+                    bx["x0"], bx["x1"], top, bott)
+
     def __filterout_scraps(self, boxes, ZM):
-        def line_tag(bx):
-            pn = [bx["page_number"]]
-            top = bx["top"] - self.page_cum_height[pn[0] - 1]
-            bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
-            while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
-                bott -= self.page_images[pn[-1] - 1].size[1] / ZM
-                pn.append(pn[-1] + 1)
-
-            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-                .format("-".join([str(p) for p in pn]),
-                        bx["x0"], bx["x1"], top, bott)

         def width(b):
             return b["x1"] - b["x0"]
@@ -1520,14 +1538,14 @@ class HuParser:
             boxes.pop(0)
             mw = np.mean(widths)
             if mj or mw / pw >= 0.35 or mw > 200:
-                res.append("\n".join([c["text"] + line_tag(c) for c in lines]))
+                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
             else:
                 logging.debug("REMOVED: " +
                               "<<".join([c["text"] for c in lines]))

         return "\n\n".join(res)

-    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
+    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
         self.lefted_chars = []
         self.mean_height = []
         self.mean_width = []
@@ -1537,22 +1555,25 @@ class HuParser:
         self.page_layout = []
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
-            self.page_images = [p.to_image(resolution=72*zoomin).annotated for i,p in enumerate(self.pdf.pages[:299])]
-            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in range(len(self.page_images))]
+            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
+                                enumerate(self.pdf.pages[page_from:page_to])]
+            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in
+                               range(len(self.page_images))]
+            self.total_page = len(self.pdf.pages)
         except Exception as e:
             self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
             self.page_images = []
             self.page_chars = []
             mat = fitz.Matrix(zoomin, zoomin)
-            for page in self.pdf:
-                pix = page.getPixmap(matrix = mat)
+            self.total_page = len(self.pdf)
+            for page in self.pdf[page_from:page_to]:
+                pix = page.getPixmap(matrix=mat)
                 img = Image.frombytes("RGB", [pix.width, pix.height],
                                       pix.samples)
                 self.page_images.append(img)
                 self.page_chars.append([])

         logging.info("Images converted.")
-
         for i, img in enumerate(self.page_images):
             chars = self.page_chars[i]
             self.mean_height.append(
@@ -1561,40 +1582,26 @@ class HuParser:
             self.mean_width.append(
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
-            if i > 0:
-                if not chars:
-                    self.page_cum_height.append(img.size[1] / zoomin)
-                else:
-                    self.page_cum_height.append(
-                        np.max([c["bottom"] for c in chars]))
+            self.page_cum_height.append(img.size[1] / zoomin)
+            # if i > 0:
+            #     if not chars:
+            #         self.page_cum_height.append(img.size[1] / zoomin)
+            #     else:
+            #         self.page_cum_height.append(
+            #             np.max([c["bottom"] for c in chars]))
             self.__ocr_paddle(i + 1, img, chars, zoomin)
-        self.__layouts_paddle(zoomin)

         self.page_cum_height = np.cumsum(self.page_cum_height)
-        assert len(self.page_cum_height) == len(self.page_images)
+        assert len(self.page_cum_height) == len(self.page_images) + 1

-        garbage = set()
-        for k in self.garbages.keys():
-            self.garbages[k] = Counter(self.garbages[k])
-            for g, c in self.garbages[k].items():
-                if c > 1:
-                    garbage.add(g)
-
-        logging.debug("GARBAGE:" + ",".join(garbage))
-        self.boxes = [b for b in self.boxes if b["text"] not in garbage]
-
-        # cumulative Y
-        for i in range(len(self.boxes)):
-            self.boxes[i]["top"] += \
-                self.page_cum_height[self.boxes[i]["page_number"] - 1]
-            self.boxes[i]["bottom"] += \
-                self.page_cum_height[self.boxes[i]["page_number"] - 1]
-
-        self.__table_transformer_job(zoomin)
-        self.__text_merge(garbage)
+    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
+        self.__images__(fnm, zoomin)
+        self._layouts_paddle(zoomin)
+        self._table_transformer_job(zoomin)
+        self._text_merge()
+        self._concat_downward()
         self.__filter_forpages()
         tbls = self.__extract_table_figure(need_image, zoomin, return_html)
-
         return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
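Net effect of the `HuParser` changes: the monolithic `__call__` is split into stages, the formerly name-mangled helpers that subclasses need are renamed to single-underscore (`_table_transformer_job`, `_layouts_paddle`, `_text_merge`, `_concat_downward`, `_line_tag`), and page rendering plus OCR move into `__images__` with a page window and a `total_page` count. A sketch, in the spirit of `rag/app/laws.Pdf` rather than code from this diff, of a subclass that runs only the cheap stages over a few pages:

```python
from rag.parser.pdf_parser import HuParser

class QuickPdf(HuParser):
    """Sketch only: OCR a page window and stop after layout tagging."""

    def __call__(self, fnm, from_page=0, to_page=8, zoomin=3):
        self.__images__(fnm, zoomin, from_page, to_page)  # render pages + OCR
        self._layouts_paddle(zoomin)  # tag layout types, drop recurring garbage
        return [b["text"] for b in self.boxes]
```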
rag/settings.py CHANGED
@@ -35,3 +35,4 @@ LoggerFactory.LEVEL = 10
 es_logger = getLogger("es")
 minio_logger = getLogger("minio")
 cron_logger = getLogger("cron_logger")
+chunk_logger = getLogger("chunk_logger")
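The new `chunk_logger` gives the chunking apps a log channel of their own, alongside the existing es, minio, and cron loggers. Presumed usage, for illustration:

```python
from rag.settings import chunk_logger

chunk_logger.info("laws.chunk produced {} chunks".format(42))
```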