pierreguillou committed on
Commit
bfde413
·
1 Parent(s): c7019e3

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +227 -0
files/functions.py CHANGED
@@ -178,6 +178,233 @@ id2label_layoutxlm = model_layoutxlm.config.id2label
178
  label2id_layoutxlm = model_layoutxlm.config.label2id
179
  num_labels_layoutxlm = len(id2label_layoutxlm)
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  ## PDF processing
182
 
183
  # get filename and images of PDF pages
 
178
  label2id_layoutxlm = model_layoutxlm.config.label2id
179
  num_labels_layoutxlm = len(id2label_layoutxlm)
180
 
181
+ ## General
182
+
183
+ # get text and bounding boxes from an image
184
+ # https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract
185
+ # https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655
186
def get_data_paragraph(results, factor, conf_min=0):
    """Group OCR words into lines and paragraphs, with texts and bounding boxes.

    Args:
        results: dict of parallel lists indexed by word, read through the keys
            'block_num', 'par_num', 'line_num', 'left', 'top', 'width',
            'height', 'conf' and 'text'.
            NOTE(review): presumably the dict output of Tesseract's
            image_to_data -- confirm against the caller.
        factor: value every box coordinate is divided by before being
            truncated to int (undoes a previous image rescaling).
        conf_min: words whose confidence is below this value are dropped.

    Returns:
        A 7-tuple:
            texts_lines: text of every line, in reading order.
            texts_pars: text of every paragraph (its lines joined by a space).
            texts_lines_par: for every paragraph, the list of its line texts.
            row_indexes: for every paragraph, the row of its first line when
                one extra row is counted after each paragraph.
            par_boxes: [xmin, ymin, xmax, ymax] of every paragraph.
            line_boxes: [xmin, ymin, xmax, ymax] of every line.
            lines_par_boxes: for every paragraph, the list of its line boxes.
    """
    # block_num -> par_num -> line_num -> [(text, left, top, width, height), ...]
    data = {}
    fields = ('block_num', 'par_num', 'line_num',
              'left', 'top', 'width', 'height', 'conf', 'text')
    for block_num, par_num, line_num, left, top, width, height, conf, text in zip(
            *(results[field] for field in fields)):
        # Skip empty / whitespace-only entries and low-confidence words.
        # float() lets confidences reported as strings compare with conf_min.
        if not (text == '' or text.isspace()) and float(conf) >= conf_min:
            lines = data.setdefault(block_num, {}).setdefault(par_num, {})
            lines.setdefault(line_num, []).append((text, left, top, width, height))

    # Flatten blocks: one entry per paragraph, each a list of lines (lists of words).
    paragraphs = [
        list(lines.values())
        for block in data.values()
        for lines in block.values()
    ]

    # Texts of lines, grouped by paragraph.
    texts_lines, texts_pars, texts_lines_par, row_indexes = [], [], [], []
    row_index = 0
    for par in paragraphs:
        lines_par = [' '.join(word[0] for word in line) for line in par]
        row_indexes.append(row_index)
        texts_lines.extend(lines_par)
        texts_lines_par.append(lines_par)
        texts_pars.append(' '.join(lines_par))
        # One row per line plus one separator row after the paragraph.
        row_index += len(lines_par) + 1

    # Boxes of lines and paragraphs.
    par_boxes, line_boxes, lines_par_boxes = [], [], []
    for par in paragraphs:
        raw_boxes = []
        for line in par:
            first_word, last_word = line[0], line[-1]
            # A line box runs from the top-left corner of its first word to
            # the bottom-right corner of its last word.
            raw_boxes.append((
                first_word[1],
                first_word[2],
                last_word[1] + last_word[3],
                last_word[2] + last_word[4],
            ))

        scaled_boxes = [[int(value / factor) for value in box] for box in raw_boxes]
        # Independent copies so that editing one output never alters the other.
        line_boxes.extend(list(box) for box in scaled_boxes)
        lines_par_boxes.append(scaled_boxes)

        # The paragraph box is the envelope of its (unscaled) line boxes.
        xmins, ymins, xmaxs, ymaxs = zip(*raw_boxes)
        par_boxes.append([
            int(min(xmins) / factor),
            int(min(ymins) / factor),
            int(max(xmaxs) / factor),
            int(max(ymaxs) / factor),
        ])

    return texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes
278
+
279
+ # rescale image to get 300dpi
280
def set_image_dpi_resize(image):
    """Downscale an image to at most 1024 px wide and save it as a 300 dpi PNG.

    The image is never upscaled: the scale factor is capped at 1.

    :param image: An image exposing ``size``, ``resize`` and ``save``
        (NOTE(review): presumably a PIL image -- confirm against caller)
    :return: ``(factor, temp_filename)`` -- the scale factor that was applied
        and the path of the saved temporary file. The file is created with
        ``delete=False``, so removing it is left to the caller.
    """
    length_x, width_y = image.size
    factor = min(1, float(1024.0 / length_x))
    size = int(factor * length_x), int(factor * width_y)
    image_resize = image.resize(size, Image.LANCZOS)

    # Only the path is needed. Closing the handle straight away avoids leaking
    # a file descriptor and lets save() reopen the path on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix='1.png') as temp_file:
        temp_filename = temp_file.name

    image_resize.save(temp_filename, dpi=(300, 300))
    return factor, temp_filename
295
+
296
+ # it is important that each bounding box should be in (upper left, lower right) format.
297
+ # source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129
298
def upperleft_to_lowerright(bbox):
    """Return ``bbox`` reordered so that it reads (upper left, lower right).

    Args:
        bbox: sequence of exactly four coordinates ``(x0, y0, x1, y1)``.

    Returns:
        ``[x0, y0, x1, y1]`` with ``x0 <= x1`` and ``y0 <= y1``; each axis is
        swapped independently when its two values come in the wrong order.
    """
    x0, y0, x1, y1 = bbox
    if x1 < x0:
        x0, x1 = x1, x0
    if y1 < y0:
        y0, y1 = y1, y0
    return [x0, y0, x1, y1]
307
+
308
+ # convert bounding boxes from (left, top, width, height) format to (left, top, left+width, top+height) format.
309
def convert_box(bbox):
    """Convert a (left, top, width, height) box into corner coordinates.

    Args:
        bbox: sequence of exactly four values ``(left, top, width, height)``.

    Returns:
        ``[left, top, left + width, top + height]``.
    """
    left, top, width, height = bbox
    return [left, top, left + width, top + height]
312
+
313
+ # LiLT model gets 1000x1000 pixels images (boxes are scaled to the 0-1000 range)
314
def normalize_box(bbox, width, height):
    """Scale a pixel box to the 0-1000 coordinate range.

    Args:
        bbox: box whose first four items are ``(x0, y0, x1, y1)`` in pixels.
        width: width the x coordinates are relative to.
        height: height the y coordinates are relative to.

    Returns:
        ``[x0, y0, x1, y1]`` where each value is ``1000 * coordinate / size``
        truncated to int.
    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    return [
        int(1000 * (x0 / width)),
        int(1000 * (y0 / height)),
        int(1000 * (x1 / width)),
        int(1000 * (y1 / height)),
    ]
321
+
322
+ # LiLT model gets 1000x1000 pixels images (boxes are scaled back from the 0-1000 range)
323
def denormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 coordinate range back to pixels.

    Args:
        bbox: box whose first four items are ``(x0, y0, x1, y1)`` on the
            0-1000 scale.
        width: target width for the x coordinates.
        height: target height for the y coordinates.

    Returns:
        ``[x0, y0, x1, y1]`` where each value is ``size * coordinate / 1000``
        truncated to int.
    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    return [
        int(width * (x0 / 1000)),
        int(height * (y0 / 1000)),
        int(width * (x1 / 1000)),
        int(height * (y1 / 1000)),
    ]
330
+
331
+ # get back original size
332
def original_box(box, original_width, original_height, coco_width, coco_height):
    """Rescale a box from a (coco_width x coco_height) frame to the original size.

    Args:
        box: box whose first four items are ``(x0, y0, x1, y1)`` expressed in
            the ``coco_width`` x ``coco_height`` frame.
        original_width: width of the target frame.
        original_height: height of the target frame.
        coco_width: width of the frame ``box`` is expressed in.
        coco_height: height of the frame ``box`` is expressed in.

    Returns:
        ``[x0, y0, x1, y1]`` in the original frame, each truncated to int.
    """
    x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
    return [
        int(original_width * (x0 / coco_width)),
        int(original_height * (y0 / coco_height)),
        int(original_width * (x1 / coco_width)),
        int(original_height * (y1 / coco_height)),
    ]
339
+
340
def get_blocks(bboxes_block, categories, texts):
    """Collapse per-item block boxes into one entry per block.

    ``bboxes_block``, ``categories`` and ``texts`` are parallel sequences: item
    ``i`` has text ``texts[i]``, category ``categories[i]`` and belongs to the
    block whose box is ``bboxes_block[i]``.

    Args:
        bboxes_block: block box of every item (each box a sequence of numbers).
        categories: category id of every item; used as a key of the
            module-level ``id2label`` mapping.
        texts: text of every item.

    Returns:
        ``(bboxes_block_list, category_block_list, text_block_list)``:
            bboxes_block_list: one box per run of consecutive identical boxes.
            category_block_list: category of the first item carrying that box.
            text_block_list: texts of all items carrying that box, with
                newlines removed and whitespace stripped, joined by a space
                for "Text", "Caption" and "Footnote" blocks and by a newline
                for every other label.
    """
    # Every index at which each distinct box occurs, collected in one pass
    # instead of rescanning the whole list for each block.
    indexes_by_box = {}
    for index, bbox_block in enumerate(bboxes_block):
        indexes_by_box.setdefault(tuple(bbox_block), []).append(index)

    # Keep one box per run of consecutive identical boxes.
    bboxes_block_list, bbox_block_prec = [], []
    for bbox_block in bboxes_block:
        if bbox_block != bbox_block_prec:
            bboxes_block_list.append(bbox_block)
        bbox_block_prec = bbox_block

    # Converted once: these arrays do not change inside the loop.
    categories_array = np.array(categories, dtype=object)
    texts_array = np.array(texts, dtype=object)
    space_joined_labels = ("Text", "Caption", "Footnote")

    category_block_list, text_block_list = [], []
    for bbox_block in bboxes_block_list:
        bbox_block_indexes = indexes_by_box[tuple(bbox_block)]

        category_block = categories_array[bbox_block_indexes].tolist()[0]
        category_block_list.append(category_block)

        text_block = [
            text.replace("\n", "").strip()
            for text in texts_array[bbox_block_indexes].tolist()
        ]
        # Running-text blocks are merged into one paragraph; the other
        # labels keep one item per line.
        separator = ' ' if id2label[category_block] in space_joined_labels else '\n'
        text_block_list.append(separator.join(text_block))

    return bboxes_block_list, category_block_list, text_block_list
367
+
368
+ # function to sort bounding boxes
369
def get_sorted_boxes(bboxes):
    """Return the boxes sorted top-to-bottom, then left-to-right.

    Boxes are ordered by their second item (y) and, for equal y, by their
    first item (x). The sort is stable, so boxes sharing both values keep
    their input order. The input list is not modified and the returned list
    holds the very same box objects (no type conversion).

    Args:
        bboxes: iterable of boxes, each indexable with at least ``[0]`` (x)
            and ``[1]`` (y).

    Returns:
        A new list with the boxes in reading order.
    """
    return sorted(bboxes, key=itemgetter(1, 0))
387
+
388
+ # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
389
def sort_data(bboxes, categories, texts):
    """Sort boxes, categories and texts together in reading order.

    Items are ordered by the y of their box (second item) and then by its x
    (first item); ties keep their input order. Positions are sorted rather
    than boxes being looked up by value, so identical boxes each keep their
    own category and text.

    Args:
        bboxes: list of boxes, each indexable with ``[0]`` (x) and ``[1]`` (y).
        categories: category of every box, parallel to ``bboxes``.
        texts: text of every box, parallel to ``bboxes``.

    Returns:
        ``(sorted_bboxes, sorted_categories, sorted_texts)`` as lists.
    """
    sorted_bboxes_indexes = sorted(
        range(len(bboxes)),
        key=lambda index: (bboxes[index][1], bboxes[index][0]),
    )
    sorted_bboxes = [bboxes[index] for index in sorted_bboxes_indexes]
    sorted_categories = np.array(categories, dtype=object)[sorted_bboxes_indexes].tolist()
    sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()

    return sorted_bboxes, sorted_categories, sorted_texts
397
+
398
+ # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary)
399
def sort_data_wo_labels(bboxes, texts):
    """Sort boxes and texts together in reading order (no categories).

    Items are ordered by the y of their box (second item) and then by its x
    (first item); ties keep their input order. Positions are sorted rather
    than boxes being looked up by value, so identical boxes each keep their
    own text.

    Args:
        bboxes: list of boxes, each indexable with ``[0]`` (x) and ``[1]`` (y).
        texts: text of every box, parallel to ``bboxes``.

    Returns:
        ``(sorted_bboxes, sorted_texts)`` as lists.
    """
    sorted_bboxes_indexes = sorted(
        range(len(bboxes)),
        key=lambda index: (bboxes[index][1], bboxes[index][0]),
    )
    sorted_bboxes = [bboxes[index] for index in sorted_bboxes_indexes]
    sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()

    return sorted_bboxes, sorted_texts
406
+
407
+
408
  ## PDF processing
409
 
410
  # get filename and images of PDF pages