Kevin Hu committed
Commit 0129457 · 1 Parent(s): b797251

Upgrades Document Layout Analysis model. (#4054)


### What problem does this PR solve?

#4052

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/db/services/task_service.py CHANGED

@@ -247,8 +247,8 @@ def queue_tasks(doc: dict, bucket: str, name: str):
         task["progress"] = 0.0
 
     prev_tasks = TaskService.get_tasks(doc["id"])
+    ck_num = 0
     if prev_tasks:
-        ck_num = 0
         for task in tsks:
             ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
         TaskService.filter_delete([Task.doc_id == doc["id"]])
@@ -258,7 +258,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
                 chunk_ids.extend(task["chunk_ids"].split())
         if chunk_ids:
             settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
-        DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
+    DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
 
     bulk_insert_into_db(Task, tsks, True)
     DocumentService.begin2parse(doc["id"])
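These two hunks hoist the `ck_num` initialization above the `if prev_tasks:` guard and dedent the final `chunk_num` write out of it, so the document's chunk count is refreshed even when there are no previous tasks to reuse chunks from. A runnable toy sketch of the resulting control flow (names echo the diff; the helper below is illustrative, not the real API):

```python
# Toy version of the fixed flow in queue_tasks(): the counter is
# initialized before the guard and written after it, so a document with
# no previous tasks still gets an explicit chunk_num of 0.
def refresh_chunk_num(prev_tasks, new_tasks, reuse):
    ck_num = 0                  # previously initialized inside the guard
    if prev_tasks:
        for task in new_tasks:
            ck_num += reuse(task, prev_tasks)
    return ck_num               # previously written only inside the guard

assert refresh_chunk_num([], [{}], lambda t, p: 1) == 0          # no reuse possible
assert refresh_chunk_num([{"id": 1}], [{}, {}], lambda t, p: 1) == 2
```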
conf/infinity_mapping.json CHANGED

@@ -16,6 +16,8 @@
     "content_with_weight": {"type": "varchar", "default": ""},
     "content_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
     "content_sm_ltks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
+    "authors_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
+    "authors_sm_tks": {"type": "varchar", "default": "", "analyzer": "whitespace"},
     "page_num_int": {"type": "varchar", "default": ""},
     "top_int": {"type": "varchar", "default": ""},
     "position_int": {"type": "varchar", "default": ""},
deepdoc/vision/__init__.py CHANGED

@@ -15,9 +15,10 @@ import pdfplumber
 
 from .ocr import OCR
 from .recognizer import Recognizer
-from .layout_recognizer import LayoutRecognizer
+from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
 from .table_structure_recognizer import TableStructureRecognizer
 
+
 def init_in_out(args):
     from PIL import Image
     import os
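The alias keeps the package's public surface stable: any caller importing `LayoutRecognizer` from `deepdoc.vision` transparently gets the new YOLOv10-based implementation. A quick sketch (the caller code is illustrative, not from this diff):

```python
from deepdoc.vision import LayoutRecognizer

# Resolves to LayoutRecognizer4YOLOv10; the subclass forces domain to
# "layout" in __init__, so existing construction patterns keep working.
layouter = LayoutRecognizer("layout")
```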
deepdoc/vision/layout_recognizer.py CHANGED

@@ -14,11 +14,14 @@ import os
 import re
 from collections import Counter
 from copy import deepcopy
+
+import cv2
 import numpy as np
 from huggingface_hub import snapshot_download
 
 from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import Recognizer
+from deepdoc.vision.operators import nms
 
 
 class LayoutRecognizer(Recognizer):
@@ -149,3 +152,88 @@ class LayoutRecognizer(Recognizer):
 
         ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
         return ocr_res, page_layout
+
+
+class LayoutRecognizer4YOLOv10(LayoutRecognizer):
+    labels = [
+        "title",
+        "Text",
+        "Reference",
+        "Figure",
+        "Figure caption",
+        "Table",
+        "Table caption",
+        "Table caption",
+        "Equation",
+        "Figure caption",
+    ]
+
+    def __init__(self, domain):
+        domain = "layout"
+        super().__init__(domain)
+        self.auto = False
+        self.scaleFill = False
+        self.scaleup = True
+        self.stride = 32
+        self.center = True
+
+    def preprocess(self, image_list):
+        inputs = []
+        new_shape = self.input_shape  # height, width
+        for img in image_list:
+            shape = img.shape[:2]  # current shape [height, width]
+            # Scale ratio (new / old)
+            r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+            # Compute padding
+            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+            dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+            dw /= 2  # divide padding into 2 sides
+            dh /= 2
+            ww, hh = new_unpad
+            img = np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.float32)
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+            top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
+            left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
+            img = cv2.copyMakeBorder(
+                img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
+            )  # add border
+            img /= 255.0
+            img = img.transpose(2, 0, 1)
+            img = img[np.newaxis, :, :, :].astype(np.float32)
+            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})
+
+        return inputs
+
+    def postprocess(self, boxes, inputs, thr):
+        thr = 0.08
+        boxes = np.squeeze(boxes)
+        scores = boxes[:, 4]
+        boxes = boxes[scores > thr, :]
+        scores = scores[scores > thr]
+        if len(boxes) == 0:
+            return []
+        class_ids = boxes[:, -1].astype(int)
+        boxes = boxes[:, :4]
+        boxes[:, 0] -= inputs["scale_factor"][2]
+        boxes[:, 2] -= inputs["scale_factor"][2]
+        boxes[:, 1] -= inputs["scale_factor"][3]
+        boxes[:, 3] -= inputs["scale_factor"][3]
+        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
+                                inputs["scale_factor"][1]])
+        boxes = np.multiply(boxes, input_shape, dtype=np.float32)
+
+        unique_class_ids = np.unique(class_ids)
+        indices = []
+        for class_id in unique_class_ids:
+            class_indices = np.where(class_ids == class_id)[0]
+            class_boxes = boxes[class_indices, :]
+            class_scores = scores[class_indices]
+            class_keep_boxes = nms(class_boxes, class_scores, 0.45)
+            indices.extend(class_indices[class_keep_boxes])
+
+        return [{
+            "type": self.label_list[class_ids[i]].lower(),
+            "bbox": [float(t) for t in boxes[i].tolist()],
+            "score": float(scores[i])
+        } for i in indices]
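A few notes on the new class. The repeated "Table caption" and "Figure caption" entries in `labels` map distinct model class ids onto the same label, since `postprocess` indexes the list by class id. `postprocess` also hard-codes `thr = 0.08`, overriding the caller-supplied threshold, and runs NMS per class, so for example a figure box never suppresses an overlapping caption box. `preprocess` is a standard YOLO-style letterbox: scale so the image fits, pad the remainder with gray (114), and stash the per-axis ratios and offsets in `scale_factor` so `postprocess` can undo both. A worked round trip with hypothetical sizes (an 800×1000 page into an assumed 640×640 `input_shape`; the real shape comes from the ONNX model):

```python
# Letterbox round trip, mirroring preprocess()/postprocess() above.
h, w = 800, 1000                               # original page (h, w), hypothetical
r = min(640 / h, 640 / w)                      # 0.64, shared scale ratio
ww, hh = round(w * r), round(h * r)            # 640, 512 resized size
dw, dh = (640 - ww) / 2, (640 - hh) / 2        # 0.0, 64.0 padding per side

# A detection in letterboxed coordinates maps back exactly as
# postprocess() does: shift out the padding, then rescale per axis.
x1, y1, x2, y2 = 100.0, 164.0, 420.0, 364.0
sx, sy = w / ww, h / hh                        # scale_factor[0], scale_factor[1]
back = ((x1 - dw) * sx, (y1 - dh) * sy, (x2 - dw) * sx, (y2 - dh) * sy)
print(back)  # (156.25, 156.25, 656.25, 468.75) in original page pixels
```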
deepdoc/vision/operators.py CHANGED

@@ -709,3 +709,29 @@ def preprocess(im, preprocess_ops):
     for operator in preprocess_ops:
         im, im_info = operator(im, im_info)
     return im, im_info
+
+
+def nms(bboxes, scores, iou_thresh):
+    import numpy as np
+    x1 = bboxes[:, 0]
+    y1 = bboxes[:, 1]
+    x2 = bboxes[:, 2]
+    y2 = bboxes[:, 3]
+    areas = (y2 - y1) * (x2 - x1)
+
+    indices = []
+    index = scores.argsort()[::-1]
+    while index.size > 0:
+        i = index[0]
+        indices.append(i)
+        x11 = np.maximum(x1[i], x1[index[1:]])
+        y11 = np.maximum(y1[i], y1[index[1:]])
+        x22 = np.minimum(x2[i], x2[index[1:]])
+        y22 = np.minimum(y2[i], y2[index[1:]])
+        w = np.maximum(0, x22 - x11 + 1)
+        h = np.maximum(0, y22 - y11 + 1)
+        overlaps = w * h
+        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
+        idx = np.where(ious <= iou_thresh)[0]
+        index = index[idx + 1]
+    return indices
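The helper is the classic greedy IoU suppression: keep the highest-scoring box, drop everything that overlaps it beyond the threshold, repeat. A tiny self-check (toy boxes, values made up):

```python
import numpy as np
from deepdoc.vision.operators import nms

# Two near-duplicate boxes plus one disjoint box; the lower-scoring
# duplicate should be suppressed at an IoU threshold of 0.45.
boxes = np.array([[0, 0, 100, 100],
                  [5, 5, 105, 105],
                  [200, 200, 300, 300]], dtype=np.float32)
scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)
print(nms(boxes, scores, 0.45))  # keeps indices 0 and 2
```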