atatavana committed on
Commit a0df5e7 · 1 Parent(s): 456bb2f

Update app.py

Files changed (1)
  1. app.py +26 -292
app.py CHANGED
@@ -14,41 +14,11 @@ import PIL
  import torch
  import pandas as pd
  import numpy as np
- import pandas as pd
  import gradio as gr
  from tqdm import tqdm
- from PIL import Image as im
  from scipy import ndimage
- from difflib import SequenceMatcher
- from itertools import groupby
- from datasets import load_metric
- from datasets import load_dataset
- from datasets.features import ClassLabel
- from transformers import AutoProcessor
  from PIL import Image, ImageDraw, ImageFont
- from transformers import AutoModelForTokenClassification
- from transformers.data.data_collator import default_data_collator
- from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
- from transformers import LayoutLMv3ForTokenClassification,LayoutLMv3FeatureExtractor
-
- # define id2label
- id2label={0: 'container id', 1: 'seal number', 2: 'container quantity', 3: 'container type', 4: 'tare', 5: 'package quantity', 6: 'weight', 7: 'others'}
- custom_config = r'--oem 3 --psm 6'
- # lang='eng+deu+ita+chi_sim'
- lang='spa'
-
- label_ints = np.random.randint(0,len(PIL.ImageColor.colormap.items()),42)
- label_color_pil = [k for k,_ in PIL.ImageColor.colormap.items()]
- label_color = [label_color_pil[i] for i in label_ints]
- label2color = {}
- for k,v in id2label.items():
-     if v[:2] == '':
-         label2color['o']=label_color[k]
-     else:
-         label2color[v[2:]]=label_color[k]

- processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True,lang=lang)
- model = AutoModelForTokenClassification.from_pretrained("atatavana/layoutlm_manifesto_bigdataset")


  def unnormalize_box(bbox, width, height):
@@ -60,267 +30,29 @@ def unnormalize_box(bbox, width, height):
      height * (bbox[3] / 1000),
  ]

- def iob_to_label(label):
-     if label == 0:
-         return 'container id'
-     if label == 1:
-         return 'seal number'
-     if label == 2:
-         return 'container quantity'
-     if label == 3:
-         return 'container type'
-     if label == 4:
-         return 'tare'
-     if label == 5:
-         return 'package quantity'
-     if label == 6:
-         return 'weight'
-     if label == 7:
-         return 'others'
-
- # this method will detect if there is any intersect between two boxes or not
- def intersect(w, z):
-     x1 = max(w[0], z[0]) #190 | 881 | 10
-     y1 = max(w[1], z[1]) #90 | 49 | 273
-     x2 = min(w[2], z[2]) #406 | 406 | 1310
-     y2 = min(w[3], z[3]) #149 | 703 | 149
-     if (x1 > x2 or y1 > y2):
-         return 0
-     else:
-         # because sometimes in annotating, it is possible to overlap rows or columns by mistake
-         # for very small pixels, we check a threshold to delete them
-         area = (x2-x1) * (y2-y1)
-         if (area > 0): #500 is minumum accepted area
-             return [int(x1), int(y1), int(x2), int(y2)]
-         else:
-             return 0
-
-
- def process_image(image):
-     custom_config = r'--oem 3 --psm 6'
-     # lang='eng+deu+ita+chi_sim'
-     lang='spa'
-     width, height = image.size
-     feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True,lang=lang)
-     encoding_feature_extractor = feature_extractor(image, return_tensors="pt",truncation=True)
-     words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
-
-     custom_config = r'--oem 3 --psm 6'
-     # encode
-     inference_image = [image.convert("RGB")]
-     encoding = processor(inference_image , truncation=True, return_offsets_mapping=True, return_tensors="pt",
-                          padding="max_length", stride =128, max_length=512, return_overflowing_tokens=True)
-     offset_mapping = encoding.pop('offset_mapping')
-     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
-
-     # change the shape of pixel values
-     x = []
-     for i in range(0, len(encoding['pixel_values'])):
-         x.append(encoding['pixel_values'][i])
-     x = torch.stack(x)
-     encoding['pixel_values'] = x
-
-     # forward pass
-     outputs = model(**encoding)
-
-     # get predictions
-     predictions = outputs.logits.argmax(-1).squeeze().tolist()
-     token_boxes = encoding.bbox.squeeze().tolist()
-
-     # only keep non-subword predictions
-     preds = []
-     l_words = []
-     bboxes = []
-     token_section_num = []
-
-     if (len(token_boxes) == 512):
-         predictions = [predictions]
-         token_boxes = [token_boxes]
-
-     for i in range(0, len(token_boxes)):
-         for j in range(0, len(token_boxes[i])):
-             #print(np.asarray(token_boxes[i][j]).shape)
-             unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
-             #print('prediction: {} - box: {} - word:{}'.format(predictions[i][j], unnormal_box, processor.tokenizer.decode(encoding["input_ids"][i][j])))
-             if (np.asarray(token_boxes[i][j]).shape != (4,)):
-                 continue
-             elif (token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0):
-                 #print('zero found!')
-                 continue
-             # if bbox is available in the list, just we need to update text
-             elif (unnormal_box not in bboxes):
-                 preds.append(predictions[i][j])
-                 l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
-                 bboxes.append(unnormal_box)
-                 token_section_num.append(i)
-             else:
-                 # we have to update the word
-                 _index = bboxes.index(unnormal_box)
-                 if (token_section_num[_index] == i):
-                     # check if they're in a same section or not (documents with more than 512 tokens will divide to seperate
-                     # parts, so it's possible to have a word in both of the pages and we have to control that repetetive words
-                     # HERE: because they're in a same section, so we can merge them safely
-                     l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
-                 else:
-                     continue
-
-     return bboxes, preds, l_words, image
-
-
- def visualize_image(final_bbox, final_preds, l_words, image):
-
-     draw = ImageDraw.Draw(image)
-     font = ImageFont.load_default()
-
-     label2color = {'container id':'red', 'seal number':'blue', 'container quantity':'black', 'container type':'green', 'tare':'brown', 'package quantity':'purple', 'weight':'orange', 'others': 'white'}
-     l2l = {'container id':'red', 'seal number':'blue', 'container quantity':'black', 'container type':'green', 'tare':'brown', 'package quantity':'purple', 'weight':'orange', 'others': 'white'}
-     f_labels = {'container id':'red', 'seal number':'blue', 'container quantity':'black', 'container type':'green', 'tare':'brown', 'package quantity':'purple', 'weight':'orange', 'others': 'white'}
-
-     json_df = []
-
-     for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
-         predicted_label = iob_to_label(prediction).lower()
-         draw.rectangle(box, outline=label2color[predicted_label])
-         draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
-
-         json_dict = {}
-         json_dict['TEXT'] = l_words[ix]
-         json_dict['LABEL'] = f_labels[predicted_label]
-
-         json_df.append(json_dict)
-     return image, json_df
-
-
- def mergeCloseBoxes(pr, bb, wr, threshold):
-     idx = 0
-     final_bbox =[]
-     final_preds =[]
-     final_words=[]
-
-     for box, pred, word in zip(bb, pr, wr):
-         if (pred=='others'):
-             continue
-         else:
-             final_bbox.append(box)
-             final_preds.append(pred)
-             final_words.append(word)
-             for b, p, w in zip(bb, pr, wr):
-                 if (p == 'others'):
-                     continue
-                 elif (box==b): # we shouldn't check each item with itself
-                     continue
-                 else:
-                     XMIN, YMIN, XMAX, YMAX = box
-                     xmin, ymin, xmax, ymax = b
-                     intsc = intersect([XMIN, YMIN, XMAX+threshold, YMAX], [xmin-threshold, ymin, xmax, ymax])
-                     if (intsc != 0 and pred==p):
-                         #if(abs(XMAX - xmin) < treshold and abs(YMIN - ymin) < 10):
-                         if(box in final_bbox):
-                             final_bbox[idx]= [XMIN, min(YMIN, ymin), xmax, max(YMAX, ymax)]
-                             final_words[idx] = word + ' ' + w
-                             continue
-
-                         print('box: {}, label: {} is close to b:{} with this p:{}--> {}'.format(box, pred, b, p, word + ' ' + w))
-
-         idx = idx +1
-     return final_bbox, final_preds, final_words
-
- def createDataframe(preds, words):
-     df = pd.DataFrame(columns = ['container id' ,'seal number', 'container quantity', 'container type', 'package quantity', 'tare', 'weight'])
-     flag_label = preds[0]
-     #print(preds)
-     #print(words)
-     #print('@@@@@')
-     #print(flag_label)
-     row_number = -1
-     for i in range(len(preds)):
-         #print('i is: {}'.format(i))
-         if (preds[i] == flag_label):
-             row_number = row_number + 1
-             df.at[row_number, preds[i]] = words[i]
-             #print('row number is: {}'.format(row_number))
-             continue
-         else:
-             #print('row_number {} is <= of df.shape {}'.format(row_number, df.shape[0]))
-             #print(pd.isna(df[preds[i]].iloc[row_number]))
-             #print(pd.isna(df[preds[i]].iloc[row_number]))
-             if(pd.isna(df[preds[i]].iloc[row_number])):
-                 df.at[row_number, preds[i]] = words[i]
-             else:
-                 row_number = row_number + 1
-                 df.at[row_number, preds[i]] = words[i]
-
-     return df
-
- def isInside(w, z):
-     # return True if w is inside z, if z is inside w return false
-     if(w[0] >= z[0] and w[1] >= z[1] and w[2] <= z[2] and w[3] <= z[3]):
-         return True
-     return False
-
- def removeSimilarItems(final_bbox, final_preds, final_words):
-     _bb =[]
-     _pp=[]
-     _ww=[]
-     for i in range(len(final_bbox)):
-         _bb.append(final_bbox[i])
-         _pp.append(final_preds[i])
-         _ww.append(final_words[i])
-         for j in range(len(final_bbox)):
-             if (final_bbox[i] == final_bbox[j]):
-                 continue
-             elif (isInside(final_bbox[i], final_bbox[j]) and final_preds[i]==final_preds[j] ):
-                 # box i is inside box j, so we have to remove it
-                 #print('box[i]: {} is inside box[j]:{}'.format(final_bbox[i], final_bbox[j]))
-                 _bb = _bb[:-1]
-                 _pp = _pp[:-1]
-                 _ww = _ww[:-1]
-                 continue
-     return _bb, _pp, _ww
-
- #[45.604, 2309.811, 66.652, 2391.6839999999997]
-
- def process_form(preds, words, bboxes):
-
-     final_bbox, final_preds, final_words = mergeCloseBoxes(preds, bboxes, words, 70)
-     _bbox, _preds, _words = removeSimilarItems(final_bbox, final_preds, final_words)
-     # convert float list to int
-     _bbox = [[int(x) for x in item ] for item in _bbox]
-     # creat data object for sorting
-     data = []
-     for index in range(len(_bbox)):
-         data.append((_bbox[index], _preds[index], _words[index]))
-     # sorting by the height of the page
-     sorted_list = sorted(
-         data,
-         key=lambda x: x[0][1]
-     )
-     _bbox = [item[0] for item in sorted_list]
-     _preds = [item[1] for item in sorted_list]
-     _words = [item[2] for item in sorted_list]
-     return _bbox, _preds, _words
-
- def mergeImageVertical(a):
-     list_im = a
-     imgs = [ Image.open(i) for i in list_im ]
-     # pick the image which is the smallest, and resize the others to match it (can be arbitrary image shape here)
-     min_shape = sorted( [(np.sum(i.size), i.size ) for i in imgs])[0][1]
-     imgs_comb = np.hstack([i.resize(min_shape) for i in imgs])
-
-     # for a vertical stacking it is simple: use vstack
-     imgs_comb = np.vstack([i.resize(min_shape) for i in imgs])
-     imgs_comb = Image.fromarray( imgs_comb)
-     imgs_comb.save( 'Trifecta_vertical.jpg' )
-     return imgs_comb
+ def imageconversion(pdffile):
+     doc = fitz.open(pdffile)
+     page = doc.load_page(0)
+     zoom = 2 # zoom factor
+     mat = fitz.Matrix(zoom, zoom)
+     pix = page.get_pixmap(matrix = mat, dpi = 300)
+     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+     t = pix.save("page.jpg")
+     # img = removeBorders(image)
+     # noise_img = add_noise(np.array(image))
+     # image = Image.fromarray(noise_img)
+     return image


  def completepreprocess(pdffile):
+     t = imageconversion(pdffile)
+     image = t.convert("RGB")
+     width, height = image.size
+     if ocr_type == "PaddleOCR":
+         words, boxes = process_image_PaddleOCR(image, width, height)
+     elif ocr_type == "Pytesseract":
+         words, boxes = process_image_pytesseract(image, width, height)
      myDataFrame = pd.DataFrame()
      a=[]
      doc = fitz.open(pdffile)
@@ -348,23 +80,25 @@ def completepreprocess(pdffile):
      return im2,myDataFrame


- title = "Interactive demo: Manifesto Information Extraction model"
- description = "Manifesto Information Extraction - We use Microsoft’s LayoutLMv3 trained on the Manifesto dataset through csv's to predict the labels. To use it, simply upload a PDF or use the example PDF below and click ‘Submit’. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select ‘Open image in new tab’. Train = 63, Test = 15"
+ title = "OCR outputs"
+ description = ""

  css = """.output_image, .input_image {height: 600px !important}"""
  #examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
  # ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
  # ["744BJQ69.PDF"], ['tarros_2.jpg'],
- examples = [['3pages_messina.pdf'], ['messina2.jpg'], ['arkas1.jpg'], ['brointermed1.jpg'], ['brointermed2.pdf'], ['tarros_1.jpg'], ['tarros_3.jpg'], ['tarros_4.jpg']]

  iface = gr.Interface(fn=completepreprocess,
                       #inputs=gr.inputs.Image(type="pil",optional=True,label="upload file"),
-                      inputs=gr.File(label="PDF"),
+                      inputs=[
+                          gr.inputs.File(label="PDF"),
+                          gr.inputs.Dropdown(label="Select the Open Source OCR", choices=["PaddleOCR", "Pytesseract"]),
+                      ],
                       #inputs=gr.inputs.Image(type="pil")
                       outputs=[gr.outputs.Image(type="pil", label="annotated image"),"dataframe"] ,
                       title=title,
                       description=description,
-                      examples=examples,
+                      #examples=examples,
                       css=css,
                       analytics_enabled = True, enable_queue=True)
 
104