KevinHuSh committed on
Commit 3772f42 · 1 Parent(s): aa396c5

add ocr and recognizer demo, update README (#74)

api/apps/conversation_app.py CHANGED
@@ -58,7 +58,7 @@ def set_conversation():
     conv = {
         "id": get_uuid(),
         "dialog_id": req["dialog_id"],
-        "name": "New conversation",
+        "name": req.get("name", "New conversation"),
         "message": [{"role": "assistant", "content": dia.prompt_config["prologue"]}]
     }
     ConversationService.save(**conv)
@@ -102,7 +102,7 @@ def rm():
 def list_convsersation():
     dialog_id = request.args["dialog_id"]
     try:
-        convs = ConversationService.query(dialog_id=dialog_id)
+        convs = ConversationService.query(dialog_id=dialog_id, order_by=ConversationService.model.create_time, reverse=True)
         convs = [d.to_dict() for d in convs]
         return get_json_result(data=convs)
     except Exception as e:
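With this change the client no longer has to send a conversation name, and listed conversations come back newest first. A minimal sketch of request bodies that set_conversation() now accepts (field values are purely illustrative):

```python
# Illustrative request bodies for set_conversation(); only "dialog_id" is required.
payload_named   = {"dialog_id": "d0c5f3a1", "name": "Contract review chat"}
payload_default = {"dialog_id": "d0c5f3a1"}  # "name" falls back to "New conversation"
```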
api/utils/file_utils.py CHANGED
@@ -185,5 +185,11 @@ def thumbnail(filename, blob):
         pass
 
 
+def traversal_files(base):
+    for root, ds, fs in os.walk(base):
+        for f in fs:
+            fullname = os.path.join(root, f)
+            yield fullname
+
 
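The new traversal_files helper is a thin wrapper over os.walk that yields every file path under a directory; a minimal usage sketch (the directory name is hypothetical):

```python
from api.utils.file_utils import traversal_files

# Collect all PDF paths under a (hypothetical) input directory.
pdfs = [p for p in traversal_files("./my_docs") if p.lower().endswith(".pdf")]
print(f"found {len(pdfs)} PDFs")
```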
deepdoc/README.md CHANGED
@@ -11,7 +11,36 @@ English | [简体中文](./README_zh.md)
 
 With a bunch of documents from various domains with various formats and along with diverse retrieval requirements,
 an accurate analysis becomes a very challenge task. *Deep*Doc is born for that purpose.
-There 2 parts in *Deep*Doc so far: vision and parser.
+There are 2 parts in *Deep*Doc so far: vision and parser.
+You can run the following test programs if you're interested in our results of OCR, layout recognition and TSR.
+```bash
+python deepdoc/vision/t_ocr.py -h
+usage: t_ocr.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR]
+
+options:
+  -h, --help            show this help message and exit
+  --inputs INPUTS       Directory where to store images or PDFs, or a file path to a single image or PDF
+  --output_dir OUTPUT_DIR
+                        Directory where to store the output images. Default: './ocr_outputs'
+```
+```bash
+python deepdoc/vision/t_recognizer.py -h
+usage: t_recognizer.py [-h] --inputs INPUTS [--output_dir OUTPUT_DIR] [--threshold THRESHOLD] [--mode {layout,tsr}]
+
+options:
+  -h, --help            show this help message and exit
+  --inputs INPUTS       Directory where to store images or PDFs, or a file path to a single image or PDF
+  --output_dir OUTPUT_DIR
+                        Directory where to store the output images. Default: './layouts_outputs'
+  --threshold THRESHOLD
+                        A threshold to filter out detections. Default: 0.5
+  --mode {layout,tsr}   Task mode: layout recognition or table structure recognition
+```
+
+Our models are served on HuggingFace. If you have trouble downloading HuggingFace models, this might help!
+```bash
+export HF_ENDPOINT=https://hf-mirror.com
+```
 
 <a name="2"></a>
 ## 2. Vision
@@ -19,9 +48,14 @@ There 2 parts in *Deep*Doc so far: vision and parser.
 We use vision information to resolve problems as human being.
 - OCR. Since a lot of documents presented as images or at least be able to transform to image,
 OCR is a very essential and fundamental or even universal solution for text extraction.
-
+```bash
+python deepdoc/vision/t_ocr.py --inputs=path_to_images_or_pdfs --output_dir=path_to_store_result
+```
+The inputs could be a directory of images or PDFs, or a single image or PDF.
+You can look into the folder 'path_to_store_result', which contains images that demonstrate the positions of results,
+and txt files which contain the OCR text.
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
-<img src="https://lh6.googleusercontent.com/2xdiSjaGWkZ71YdORc71Ujf7jCHmO6G-6ONklzGiUYEh3QZpjPo6MQ9eqEFX20am_cdW4Ck0YRraXEetXWnM08kJd99yhik13Cy0_YKUAq2zVGR15LzkovRAmK9iT4o3hcJ8dTpspaJKUwt6R4gN7So" width="300"/>
+<img src="https://github.com/infiniflow/ragflow/assets/12318111/f25bee3d-aaf7-4102-baf5-d5208361d110" width="900"/>
 </div>
 
 - Layout recognition. Documents from different domain may have various layouts,
@@ -39,11 +73,18 @@ We use vision information to resolve problems as human being.
   - Footer
   - Reference
   - Equation
+
+Have a try on the following command to see the layout detection results.
+```bash
+python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=layout --output_dir=path_to_store_result
+```
+The inputs could be a directory of images or PDFs, or a single image or PDF.
+You can look into the folder 'path_to_store_result', which contains images that demonstrate the detection results, as follows:
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
-<img src="https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/ppstructure/docs/layout/layout.png?raw=true" width="900"/>
+<img src="https://github.com/infiniflow/ragflow/assets/12318111/07e0f625-9b28-43d0-9fbb-5bf586cd286f" width="1000"/>
 </div>
 
-- Table Structure Recognition(TSR). Data table is a frequently used structure present data including numbers or text.
+- Table Structure Recognition(TSR). Data table is a frequently used structure to present data including numbers or text.
 And the structure of a table might be very complex, like hierarchy headers, spanning cells and projected row headers.
 Along with TSR, we also reassemble the content into sentences which could be well comprehended by LLM.
 We have five labels for TSR task:
@@ -52,8 +93,15 @@ We use vision information to resolve problems as human being.
   - Column header
   - Projected row header
   - Spanning cell
+
+Have a try on the following command to see the table structure recognition results.
+```bash
+python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result
+```
+The inputs could be a directory of images or PDFs, or a single image or PDF.
+You can look into the folder 'path_to_store_result', which contains both images and html pages that demonstrate the detection results, as follows:
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
-<img src="https://user-images.githubusercontent.com/10793386/139559159-cd23c972-8731-48ed-91df-f3f27e9f4d79.jpg" width="900"/>
+<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
 </div>
 
 <a name="3"></a>
@@ -71,4 +119,4 @@ The résumé is a very complicated kind of document. A résumé which is compose
 with various layouts could be resolved into structured data composed of nearly a hundred of fields.
 We haven't opened the parser yet, as we open the processing method after parsing procedure.
 
-
+
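The README's mirror tip applies to the model weights too: when no local model_dir is supplied, Recognizer now falls back to snapshot_download(repo_id="InfiniFlow/deepdoc") (see the recognizer.py change below). A sketch of prefetching the weights through the mirror:

```python
import os
# The endpoint must be set before huggingface_hub is imported (it is read at import time).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download

# Prefetch the deepdoc ONNX models so the demos can run without a first-use download.
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
print(model_dir)
```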
deepdoc/vision/__init__.py CHANGED
@@ -1,4 +1,49 @@
+
 from .ocr import OCR
 from .recognizer import Recognizer
 from .layout_recognizer import LayoutRecognizer
 from .table_structure_recognizer import TableStructureRecognizer
+
+def init_in_out(args):
+    from PIL import Image
+    import fitz
+    import os
+    import traceback
+    from api.utils.file_utils import traversal_files
+    images = []
+    outputs = []
+
+    if not os.path.exists(args.output_dir):
+        os.mkdir(args.output_dir)
+
+    def pdf_pages(fnm, zoomin=3):
+        nonlocal outputs, images
+        pdf = fitz.open(fnm)
+        mat = fitz.Matrix(zoomin, zoomin)
+        for i, page in enumerate(pdf):
+            pix = page.get_pixmap(matrix=mat)
+            img = Image.frombytes("RGB", [pix.width, pix.height],
+                                  pix.samples)
+            images.append(img)
+            outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
+
+    def images_and_outputs(fnm):
+        nonlocal outputs, images
+        if fnm.split(".")[-1].lower() == "pdf":
+            pdf_pages(fnm)
+            return
+        try:
+            images.append(Image.open(fnm))
+            outputs.append(os.path.split(fnm)[-1])
+        except Exception as e:
+            traceback.print_exc()
+
+    if os.path.isdir(args.inputs):
+        for fnm in traversal_files(args.inputs):
+            images_and_outputs(fnm)
+    else:
+        images_and_outputs(args.inputs)
+
+    for i in range(len(outputs)): outputs[i] = os.path.join(args.output_dir, outputs[i])
+
+    return images, outputs
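init_in_out only needs an object with inputs and output_dir attributes, so it can also be driven without argparse; a minimal sketch (paths are illustrative):

```python
from types import SimpleNamespace
from deepdoc.vision import init_in_out

# Any object with .inputs and .output_dir works; argparse is not required.
args = SimpleNamespace(inputs="./sample.pdf", output_dir="./ocr_outputs")
images, outputs = init_in_out(args)   # one PIL image and one output path per PDF page
print(len(images), outputs[:2])
```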
deepdoc/vision/layout_recognizer.py CHANGED
@@ -1,17 +1,26 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 import os
 import re
 from collections import Counter
 from copy import deepcopy
-
 import numpy as np
-
 from api.utils.file_utils import get_project_base_directory
-from .recognizer import Recognizer
+from deepdoc.vision import Recognizer
 
 
 class LayoutRecognizer(Recognizer):
-    def __init__(self, domain):
-        self.layout_labels = [
+    labels = [
         "_background_",
         "Text",
         "Title",
@@ -24,7 +33,8 @@ class LayoutRecognizer(Recognizer):
         "Reference",
         "Equation",
     ]
-        super().__init__(self.layout_labels, domain,
+    def __init__(self, domain):
+        super().__init__(self.labels, domain,
                          os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
     def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.7, batch_size=16):
@@ -37,7 +47,7 @@ class LayoutRecognizer(Recognizer):
             return any([re.search(p, b["text"]) for p in patt])
 
         layouts = super().__call__(image_list, thr, batch_size)
-        # save_results(image_list, layouts, self.layout_labels, output_dir='output/', threshold=0.7)
+        # save_results(image_list, layouts, self.labels, output_dir='output/', threshold=0.7)
         assert len(image_list) == len(ocr_res)
         # Tag layout type
         boxes = []
@@ -117,3 +127,5 @@ class LayoutRecognizer(Recognizer):
         ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
         return ocr_res, page_layout
 
+
+
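Promoting the label list to a class attribute is what lets the new t_recognizer.py read it without constructing the model; a short illustration:

```python
from deepdoc.vision import LayoutRecognizer

# Labels are now available on the class itself, so no ONNX session is created here.
print(LayoutRecognizer.labels)  # ['_background_', 'Text', 'Title', ..., 'Equation']
```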
deepdoc/vision/recognizer.py CHANGED
@@ -17,7 +17,6 @@ from copy import deepcopy
 import onnxruntime as ort
 from huggingface_hub import snapshot_download
 
-from . import seeit
 from .operators import *
 from rag.settings import cron_logger
 
@@ -36,7 +35,7 @@ class Recognizer(object):
 
         """
         if not model_dir:
-            model_dir = snapshot_download(repo_id="InfiniFlow/ocr")
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
 
         model_file_path = os.path.join(model_dir, task_name + ".onnx")
         if not os.path.exists(model_file_path):
@@ -46,6 +45,9 @@ class Recognizer(object):
             self.ort_sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
         else:
             self.ort_sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
+        self.input_names = [node.name for node in self.ort_sess.get_inputs()]
+        self.output_names = [node.name for node in self.ort_sess.get_outputs()]
+        self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4]
         self.label_list = label_list
 
     @staticmethod
@@ -275,23 +277,131 @@ class Recognizer(object):
         return max_overlaped_i
 
     def preprocess(self, image_list):
-        preprocess_ops = []
-        for op_info in [
-            {'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'},
-            {'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'},
-            {'type': 'Permute'},
-            {'stride': 32, 'type': 'PadStride'}
-        ]:
-            new_op_info = op_info.copy()
-            op_type = new_op_info.pop('type')
-            preprocess_ops.append(eval(op_type)(**new_op_info))
-
         inputs = []
-        for im_path in image_list:
-            im, im_info = preprocess(im_path, preprocess_ops)
-            inputs.append({"image": np.array((im,)).astype('float32'), "scale_factor": np.array((im_info["scale_factor"],)).astype('float32')})
+        if "scale_factor" in self.input_names:
+            preprocess_ops = []
+            for op_info in [
+                {'interp': 2, 'keep_ratio': False, 'target_size': [800, 608], 'type': 'LinearResize'},
+                {'is_scale': True, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'type': 'StandardizeImage'},
+                {'type': 'Permute'},
+                {'stride': 32, 'type': 'PadStride'}
+            ]:
+                new_op_info = op_info.copy()
+                op_type = new_op_info.pop('type')
+                preprocess_ops.append(eval(op_type)(**new_op_info))
+
+            for im_path in image_list:
+                im, im_info = preprocess(im_path, preprocess_ops)
+                inputs.append({"image": np.array((im,)).astype('float32'),
+                               "scale_factor": np.array((im_info["scale_factor"],)).astype('float32')})
+        else:
+            hh, ww = self.input_shape
+            for img in image_list:
+                h, w = img.shape[:2]
+                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+                img = cv2.resize(np.array(img).astype('float32'), (ww, hh))
+                # Scale input pixel values to 0 to 1
+                img /= 255.0
+                img = img.transpose(2, 0, 1)
+                img = img[np.newaxis, :, :, :].astype(np.float32)
+                inputs.append({self.input_names[0]: img, "scale_factor": [w/ww, h/hh]})
         return inputs
 
+    def postprocess(self, boxes, inputs, thr):
+        if "scale_factor" in self.input_names:
+            bb = []
+            for b in boxes:
+                clsid, bbox, score = int(b[0]), b[2:], b[1]
+                if score < thr:
+                    continue
+                if clsid >= len(self.label_list):
+                    cron_logger.warning(f"bad category id")
+                    continue
+                bb.append({
+                    "type": self.label_list[clsid].lower(),
+                    "bbox": [float(t) for t in bbox.tolist()],
+                    "score": float(score)
+                })
+            return bb
+
+        def xywh2xyxy(x):
+            # [x, y, w, h] to [x1, y1, x2, y2]
+            y = np.copy(x)
+            y[:, 0] = x[:, 0] - x[:, 2] / 2
+            y[:, 1] = x[:, 1] - x[:, 3] / 2
+            y[:, 2] = x[:, 0] + x[:, 2] / 2
+            y[:, 3] = x[:, 1] + x[:, 3] / 2
+            return y
+
+        def compute_iou(box, boxes):
+            # Compute xmin, ymin, xmax, ymax for both boxes
+            xmin = np.maximum(box[0], boxes[:, 0])
+            ymin = np.maximum(box[1], boxes[:, 1])
+            xmax = np.minimum(box[2], boxes[:, 2])
+            ymax = np.minimum(box[3], boxes[:, 3])
+
+            # Compute intersection area
+            intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
+
+            # Compute union area
+            box_area = (box[2] - box[0]) * (box[3] - box[1])
+            boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+            union_area = box_area + boxes_area - intersection_area
+
+            # Compute IoU
+            iou = intersection_area / union_area
+
+            return iou
+
+        def iou_filter(boxes, scores, iou_threshold):
+            sorted_indices = np.argsort(scores)[::-1]
+
+            keep_boxes = []
+            while sorted_indices.size > 0:
+                # Pick the last box
+                box_id = sorted_indices[0]
+                keep_boxes.append(box_id)
+
+                # Compute IoU of the picked box with the rest
+                ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
+
+                # Remove boxes with IoU over the threshold
+                keep_indices = np.where(ious < iou_threshold)[0]
+
+                # print(keep_indices.shape, sorted_indices.shape)
+                sorted_indices = sorted_indices[keep_indices + 1]
+
+            return keep_boxes
+
+        boxes = np.squeeze(boxes).T
+        # Filter out object confidence scores below threshold
+        scores = np.max(boxes[:, 4:], axis=1)
+        boxes = boxes[scores > thr, :]
+        scores = scores[scores > thr]
+        if len(boxes) == 0: return []
+
+        # Get the class with the highest confidence
+        class_ids = np.argmax(boxes[:, 4:], axis=1)
+        boxes = boxes[:, :4]
+        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]])
+        boxes = np.multiply(boxes, input_shape, dtype=np.float32)
+        boxes = xywh2xyxy(boxes)
+
+        unique_class_ids = np.unique(class_ids)
+        indices = []
+        for class_id in unique_class_ids:
+            class_indices = np.where(class_ids == class_id)[0]
+            class_boxes = boxes[class_indices, :]
+            class_scores = scores[class_indices]
+            class_keep_boxes = iou_filter(class_boxes, class_scores, 0.2)
+            indices.extend(class_indices[class_keep_boxes])
+
+        return [{
+            "type": self.label_list[class_ids[i]].lower(),
+            "bbox": [float(t) for t in boxes[i].tolist()],
+            "score": float(scores[i])
+        } for i in indices]
+
     def __call__(self, image_list, thr=0.7, batch_size=16):
         res = []
         imgs = []
@@ -306,22 +416,14 @@ class Recognizer(object):
             end_index = min((i + 1) * batch_size, len(imgs))
             batch_image_list = imgs[start_index:end_index]
             inputs = self.preprocess(batch_image_list)
+            print("preprocess")
             for ins in inputs:
-                bb = []
-                for b in self.ort_sess.run(None, ins)[0]:
-                    clsid, bbox, score = int(b[0]), b[2:], b[1]
-                    if score < thr:
-                        continue
-                    if clsid >= len(self.label_list):
-                        cron_logger.warning(f"bad category id")
-                        continue
-                    bb.append({
-                        "type": self.label_list[clsid].lower(),
-                        "bbox": [float(t) for t in bbox.tolist()],
-                        "score": float(score)
-                    })
+                bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
                 res.append(bb)
 
         #seeit.save_results(image_list, res, self.label_list, threshold=thr)
 
         return res
+
+
+
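The new postprocess path for YOLO-style heads filters detections per class with the IoU routine above; a tiny self-contained numpy check of the same IoU arithmetic on two hand-made boxes (values are illustrative):

```python
import numpy as np

# Box A and box B in [x1, y1, x2, y2]; B overlaps the right half of A.
box = np.array([0.0, 0.0, 10.0, 10.0])
boxes = np.array([[5.0, 0.0, 15.0, 10.0]])

xmin = np.maximum(box[0], boxes[:, 0]); ymin = np.maximum(box[1], boxes[:, 1])
xmax = np.minimum(box[2], boxes[:, 2]); ymax = np.minimum(box[3], boxes[:, 3])
inter = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)      # 50
union = (box[2] - box[0]) * (box[3] - box[1]) + \
        (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - inter  # 150
print(inter / union)  # [0.333...]; above the 0.2 threshold used by iou_filter, so one box would be suppressed
```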
deepdoc/vision/t_ocr.py ADDED
@@ -0,0 +1,47 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os, sys
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../')))
+import numpy as np
+import argparse
+from deepdoc.vision import OCR, init_in_out
+from deepdoc.vision.seeit import draw_box
+
+def main(args):
+    ocr = OCR()
+    images, outputs = init_in_out(args)
+
+    for i, img in enumerate(images):
+        bxs = ocr(np.array(img))
+        bxs = [(line[0], line[1][0]) for line in bxs]
+        bxs = [{
+            "text": t,
+            "bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]],
+            "type": "ocr",
+            "score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
+        img = draw_box(images[i], bxs, ["ocr"], 1.)
+        img.save(outputs[i], quality=95)
+        with open(outputs[i] + ".txt", "w+") as f: f.write("\n".join([o["text"] for o in bxs]))
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--inputs',
+                        help="Directory where to store images or PDFs, or a file path to a single image or PDF",
+                        required=True)
+    parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './ocr_outputs'",
+                        default="./ocr_outputs")
+    args = parser.parse_args()
+    main(args)
deepdoc/vision/t_recognizer.py ADDED
@@ -0,0 +1,173 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os, sys
+import re
+
+import numpy as np
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../')))
+
+import argparse
+from api.utils.file_utils import get_project_base_directory
+from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
+from deepdoc.vision.seeit import draw_box
+
+
+def main(args):
+    images, outputs = init_in_out(args)
+    if args.mode.lower() == "layout":
+        labels = LayoutRecognizer.labels
+        detr = Recognizer(labels, "layout.paper", os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
+    if args.mode.lower() == "tsr":
+        labels = TableStructureRecognizer.labels
+        detr = TableStructureRecognizer()
+        ocr = OCR()
+
+    layouts = detr(images, float(args.threshold))
+    for i, lyt in enumerate(layouts):
+        if args.mode.lower() == "tsr":
+            #lyt = [t for t in lyt if t["type"] == "table column"]
+            html = get_table_html(images[i], lyt, ocr)
+            with open(outputs[i]+".html", "w+") as f: f.write(html)
+            lyt = [{
+                "type": t["label"],
+                "bbox": [t["x0"], t["top"], t["x1"], t["bottom"]],
+                "score": t["score"]
+            } for t in lyt]
+        img = draw_box(images[i], lyt, labels, float(args.threshold))
+        img.save(outputs[i], quality=95)
+        print("save result to: " + outputs[i])
+
+
+def get_table_html(img, tb_cpns, ocr):
+    boxes = ocr(np.array(img))
+    boxes = Recognizer.sort_Y_firstly(
+        [{"x0": b[0][0], "x1": b[1][0],
+          "top": b[0][1], "text": t[0],
+          "bottom": b[-1][1],
+          "layout_type": "table",
+          "page_number": 0} for b, t in boxes if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
+        np.mean([b[-1][1]-b[0][1] for b,_ in boxes]) / 3
+    )
+
+    def gather(kwd, fzy=10, ption=0.6):
+        nonlocal boxes
+        eles = Recognizer.sort_Y_firstly(
+            [r for r in tb_cpns if re.match(kwd, r["label"])], fzy)
+        eles = Recognizer.layouts_cleanup(boxes, eles, 5, ption)
+        return Recognizer.sort_Y_firstly(eles, 0)
+
+    headers = gather(r".*header$")
+    rows = gather(r".* (row|header)")
+    spans = gather(r".*spanning")
+    clmns = sorted([r for r in tb_cpns if re.match(
+        r"table column$", r["label"])], key=lambda x: x["x0"])
+    clmns = Recognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
+    for b in boxes:
+        ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
+        if ii is not None:
+            b["R"] = ii
+            b["R_top"] = rows[ii]["top"]
+            b["R_bott"] = rows[ii]["bottom"]
+
+        ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
+        if ii is not None:
+            b["H_top"] = headers[ii]["top"]
+            b["H_bott"] = headers[ii]["bottom"]
+            b["H_left"] = headers[ii]["x0"]
+            b["H_right"] = headers[ii]["x1"]
+            b["H"] = ii
+
+        ii = Recognizer.find_overlapped_with_threashold(b, clmns, thr=0.3)
+        if ii is not None:
+            b["C"] = ii
+            b["C_left"] = clmns[ii]["x0"]
+            b["C_right"] = clmns[ii]["x1"]
+
+        ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
+        if ii is not None:
+            b["H_top"] = spans[ii]["top"]
+            b["H_bott"] = spans[ii]["bottom"]
+            b["H_left"] = spans[ii]["x0"]
+            b["H_right"] = spans[ii]["x1"]
+            b["SP"] = ii
+    html = """
+    <html>
+    <head>
+    <style>
+      ._table_1nkzy_11 {
+        margin: auto;
+        width: 70%%;
+        padding: 10px;
+      }
+      ._table_1nkzy_11 p {
+        margin-bottom: 50px;
+        border: 1px solid #e1e1e1;
+      }
+
+      caption {
+        color: #6ac1ca;
+        font-size: 20px;
+        height: 50px;
+        line-height: 50px;
+        font-weight: 600;
+        margin-bottom: 10px;
+      }
+
+      ._table_1nkzy_11 table {
+        width: 100%%;
+        border-collapse: collapse;
+      }
+
+      th {
+        color: #fff;
+        background-color: #6ac1ca;
+      }
+
+      td:hover {
+        background: #c1e8e8;
+      }
+
+      tr:nth-child(even) {
+        background-color: #f2f2f2;
+      }
+
+      ._table_1nkzy_11 th,
+      ._table_1nkzy_11 td {
+        text-align: center;
+        border: 1px solid #ddd;
+        padding: 8px;
+      }
+    </style>
+    </head>
+    <body>
+    %s
+    </body>
+    </html>
+    """ % TableStructureRecognizer.construct_table(boxes, html=True)
+    return html
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--inputs',
+                        help="Directory where to store images or PDFs, or a file path to a single image or PDF",
+                        required=True)
+    parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './layouts_outputs'",
+                        default="./layouts_outputs")
+    parser.add_argument('--threshold', help="A threshold to filter out detections. Default: 0.5", default=0.5)
+    parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"],
+                        default="layout")
+    args = parser.parse_args()
+    main(args)
deepdoc/vision/table_structure_recognizer.py CHANGED
@@ -1,3 +1,15 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 import logging
 import os
 import re
@@ -12,15 +24,16 @@ from .recognizer import Recognizer
 
 
 class TableStructureRecognizer(Recognizer):
+    labels = [
+        "table",
+        "table column",
+        "table row",
+        "table column header",
+        "table projected row header",
+        "table spanning cell",
+    ]
+
     def __init__(self):
-        self.labels = [
-            "table",
-            "table column",
-            "table row",
-            "table column header",
-            "table projected row header",
-            "table spanning cell",
-        ]
         super().__init__(self.labels, "tsr",
                          os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
 
@@ -79,7 +92,8 @@ class TableStructureRecognizer(Recognizer):
             return True
         return False
 
-    def __blockType(self, b):
+    @staticmethod
+    def blockType(b):
         patt = [
             ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
             (r"^(20|19)[0-9]{2}年$", "Dt"),
@@ -109,11 +123,12 @@ class TableStructureRecognizer(Recognizer):
 
         return "Ot"
 
-    def construct_table(self, boxes, is_english=False, html=False):
+    @staticmethod
+    def construct_table(boxes, is_english=False, html=False):
         cap = ""
         i = 0
         while i < len(boxes):
-            if self.is_caption(boxes[i]):
+            if TableStructureRecognizer.is_caption(boxes[i]):
                 cap += boxes[i]["text"]
                 boxes.pop(i)
                 i -= 1
@@ -122,14 +137,15 @@ class TableStructureRecognizer(Recognizer):
         if not boxes:
             return []
         for b in boxes:
-            b["btype"] = self.__blockType(b)
+            b["btype"] = TableStructureRecognizer.blockType(b)
         max_type = Counter([b["btype"] for b in boxes]).items()
         max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
         logging.debug("MAXTYPE: " + max_type)
 
         rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
         rowh = np.min(rowh) if rowh else 0
-        boxes = self.sort_R_firstly(boxes, rowh / 2)
+        boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
+        #for b in boxes:print(b)
         boxes[0]["rn"] = 0
         rows = [[boxes[0]]]
         btm = boxes[0]["bottom"]
@@ -150,9 +166,9 @@ class TableStructureRecognizer(Recognizer):
         colwm = np.min(colwm) if colwm else 0
         crosspage = len(set([b["page_number"] for b in boxes])) > 1
         if crosspage:
-            boxes = self.sort_X_firstly(boxes, colwm / 2, False)
+            boxes = Recognizer.sort_X_firstly(boxes, colwm / 2, False)
         else:
-            boxes = self.sort_C_firstly(boxes, colwm / 2)
+            boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
         boxes[0]["cn"] = 0
         cols = [[boxes[0]]]
         right = boxes[0]["x1"]
@@ -313,16 +329,18 @@ class TableStructureRecognizer(Recognizer):
                 hdset.add(i)
 
         if html:
-            return [self.__html_table(cap, hdset,
-                                      self.__cal_spans(boxes, rows,
-                                                       cols, tbl, True)
-                                      )]
+            return TableStructureRecognizer.__html_table(cap, hdset,
+                                                         TableStructureRecognizer.__cal_spans(boxes, rows,
+                                                                                              cols, tbl, True)
+                                                         )
 
-        return self.__desc_table(cap, hdset,
-                                 self.__cal_spans(boxes, rows, cols, tbl, False),
-                                 is_english)
+        return TableStructureRecognizer.__desc_table(cap, hdset,
+                                                     TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl,
+                                                                                          False),
+                                                     is_english)
 
-    def __html_table(self, cap, hdset, tbl):
+    @staticmethod
+    def __html_table(cap, hdset, tbl):
         # constrcut HTML
         html = "<table>"
         if cap:
@@ -339,8 +357,8 @@ class TableStructureRecognizer(Recognizer):
             txt = ""
             if arr:
                 h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10)
-                txt = "".join([c["text"]
-                               for c in self.sort_Y_firstly(arr, h)])
+                txt = " ".join([c["text"]
+                                for c in Recognizer.sort_Y_firstly(arr, h)])
             txts.append(txt)
             sp = ""
             if arr[0].get("colspan"):
@@ -366,7 +384,8 @@ class TableStructureRecognizer(Recognizer):
         html += "\n</table>"
         return html
 
-    def __desc_table(self, cap, hdr_rowno, tbl, is_english):
+    @staticmethod
+    def __desc_table(cap, hdr_rowno, tbl, is_english):
         # get text of every colomn in header row to become header text
         clmno = len(tbl[0])
         rowno = len(tbl)
@@ -469,7 +488,8 @@ class TableStructureRecognizer(Recognizer):
             row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
         return row_txt
 
-    def __cal_spans(self, boxes, rows, cols, tbl, html=True):
+    @staticmethod
+    def __cal_spans(boxes, rows, cols, tbl, html=True):
         # caculate span
         clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
                 for cln in cols]
@@ -553,4 +573,3 @@ class TableStructureRecognizer(Recognizer):
                 tbl[rowspan[0]][colspan[0]] = arr
 
         return tbl
-
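Making labels a class attribute and construct_table/blockType staticmethods mirrors the LayoutRecognizer change: t_recognizer.py above renders table HTML via TableStructureRecognizer.construct_table(boxes, html=True) without touching instance state. A minimal illustration of the class-level access (assuming the repo root is on PYTHONPATH):

```python
from deepdoc.vision import TableStructureRecognizer

# Class-level access: neither line instantiates the recognizer or loads the ONNX model.
print(TableStructureRecognizer.labels)                     # the six table-structure labels
print(callable(TableStructureRecognizer.construct_table))  # True: usable as a plain function now
```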