atatavana committed on
Commit a0df5e7 · 1 Parent(s): 456bb2f

Update app.py

Files changed (1)
  1. app.py +26 -292
app.py CHANGED
@@ -14,41 +14,11 @@ import PIL
  import torch
  import pandas as pd
  import numpy as np
- import pandas as pd
  import gradio as gr
  from tqdm import tqdm
- from PIL import Image as im
  from scipy import ndimage
- from difflib import SequenceMatcher
- from itertools import groupby
- from datasets import load_metric
- from datasets import load_dataset
- from datasets.features import ClassLabel
- from transformers import AutoProcessor
  from PIL import Image, ImageDraw, ImageFont
- from transformers import AutoModelForTokenClassification
- from transformers.data.data_collator import default_data_collator
- from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
- from transformers import LayoutLMv3ForTokenClassification,LayoutLMv3FeatureExtractor
-
- # define id2label
- id2label={0: 'container id', 1: 'seal number', 2: 'container quantity', 3: 'container type', 4: 'tare', 5: 'package quantity', 6: 'weight', 7: 'others'}
- custom_config = r'--oem 3 --psm 6'
- # lang='eng+deu+ita+chi_sim'
- lang='spa'
-
- label_ints = np.random.randint(0,len(PIL.ImageColor.colormap.items()),42)
- label_color_pil = [k for k,_ in PIL.ImageColor.colormap.items()]
- label_color = [label_color_pil[i] for i in label_ints]
- label2color = {}
- for k,v in id2label.items():
-     if v[:2] == '':
-         label2color['o']=label_color[k]
-     else:
-         label2color[v[2:]]=label_color[k]

- processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True,lang=lang)
- model = AutoModelForTokenClassification.from_pretrained("atatavana/layoutlm_manifesto_bigdataset")


  def unnormalize_box(bbox, width, height):
@@ -60,267 +30,29 @@ def unnormalize_box(bbox, width, height):
      height * (bbox[3] / 1000),
  ]

- def iob_to_label(label):
-     if label == 0:
-         return 'container id'
-     if label == 1:
-         return 'seal number'
-     if label == 2:
-         return 'container quantity'
-     if label == 3:
-         return 'container type'
-     if label == 4:
-         return 'tare'
-     if label == 5:
-         return 'package quantity'
-     if label == 6:
-         return 'weight'
-     if label == 7:
-         return 'others'
-
- # this method will detect if there is any intersect between two boxes or not
- def intersect(w, z):
-     x1 = max(w[0], z[0]) #190 | 881 | 10
-     y1 = max(w[1], z[1]) #90 | 49 | 273
-     x2 = min(w[2], z[2]) #406 | 406 | 1310
-     y2 = min(w[3], z[3]) #149 | 703 | 149
-     if (x1 > x2 or y1 > y2):
-         return 0
-     else:
-         # because sometimes in annotating, it is possible to overlap rows or columns by mistake
-         # for very small pixels, we check a threshold to delete them
-         area = (x2-x1) * (y2-y1)
-         if (area > 0): #500 is minumum accepted area
-             return [int(x1), int(y1), int(x2), int(y2)]
-         else:
-             return 0
-
-
- def process_image(image):
-     custom_config = r'--oem 3 --psm 6'
-     # lang='eng+deu+ita+chi_sim'
-     lang='spa'
-     width, height = image.size
-     feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True,lang=lang)
-     encoding_feature_extractor = feature_extractor(image, return_tensors="pt",truncation=True)
-     words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
-
-     custom_config = r'--oem 3 --psm 6'
-     # encode
-     inference_image = [image.convert("RGB")]
-     encoding = processor(inference_image , truncation=True, return_offsets_mapping=True, return_tensors="pt",
-                          padding="max_length", stride =128, max_length=512, return_overflowing_tokens=True)
-     offset_mapping = encoding.pop('offset_mapping')
-     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
-
-     # change the shape of pixel values
-     x = []
-     for i in range(0, len(encoding['pixel_values'])):
-         x.append(encoding['pixel_values'][i])
-     x = torch.stack(x)
-     encoding['pixel_values'] = x
-
-     # forward pass
-     outputs = model(**encoding)
-
-     # get predictions
-     predictions = outputs.logits.argmax(-1).squeeze().tolist()
-     token_boxes = encoding.bbox.squeeze().tolist()
-
-     # only keep non-subword predictions
-     preds = []
-     l_words = []
-     bboxes = []
-     token_section_num = []
-
-     if (len(token_boxes) == 512):
-         predictions = [predictions]
-         token_boxes = [token_boxes]
-
-     for i in range(0, len(token_boxes)):
-         for j in range(0, len(token_boxes[i])):
-             #print(np.asarray(token_boxes[i][j]).shape)
-             unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
-             #print('prediction: {} - box: {} - word:{}'.format(predictions[i][j], unnormal_box, processor.tokenizer.decode(encoding["input_ids"][i][j])))
-             if (np.asarray(token_boxes[i][j]).shape != (4,)):
-                 continue
-             elif (token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0):
-                 #print('zero found!')
-                 continue
-             # if bbox is available in the list, just we need to update text
-             elif (unnormal_box not in bboxes):
-                 preds.append(predictions[i][j])
-                 l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
-                 bboxes.append(unnormal_box)
-                 token_section_num.append(i)
-             else:
-                 # we have to update the word
-                 _index = bboxes.index(unnormal_box)
-                 if (token_section_num[_index] == i):
-                     # check if they're in a same section or not (documents with more than 512 tokens will divide to seperate
-                     # parts, so it's possible to have a word in both of the pages and we have to control that repetetive words
-                     # HERE: because they're in a same section, so we can merge them safely
-                     l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
-                 else:
-                     continue
-
-     return bboxes, preds, l_words, image
-
-
- def visualize_image(final_bbox, final_preds, l_words, image):
-
-     draw = ImageDraw.Draw(image)
-     font = ImageFont.load_default()
-
-     label2color = {'container id':'red', 'seal number':'blue', 'container quantity':'black', 'container type':'green', 'tare':'brown', 'package quantity':'purple', 'weight':'orange', 'others': 'white'}
-     l2l = {'container id':'red', 'seal number':'blue', 'container quantity':'black', 'container type':'green', 'tare':'brown', 'package quantity':'purple', 'weight':'orange', 'others': 'white'}
-     f_labels = {'container id':'red', 'seal number':'blue', 'container quantity':'black', 'container type':'green', 'tare':'brown', 'package quantity':'purple', 'weight':'orange', 'others': 'white'}
-
-     json_df = []
-
-     for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
-         predicted_label = iob_to_label(prediction).lower()
-         draw.rectangle(box, outline=label2color[predicted_label])
-         draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
-
-         json_dict = {}
-         json_dict['TEXT'] = l_words[ix]
-         json_dict['LABEL'] = f_labels[predicted_label]
-
-         json_df.append(json_dict)
-     return image, json_df
-
-
- def mergeCloseBoxes(pr, bb, wr, threshold):
-     idx = 0
-     final_bbox =[]
-     final_preds =[]
-     final_words=[]
-
-     for box, pred, word in zip(bb, pr, wr):
-         if (pred=='others'):
-             continue
-         else:
-             final_bbox.append(box)
-             final_preds.append(pred)
-             final_words.append(word)
-             for b, p, w in zip(bb, pr, wr):
-                 if (p == 'others'):
-                     continue
-                 elif (box==b): # we shouldn't check each item with itself
-                     continue
-                 else:
-                     XMIN, YMIN, XMAX, YMAX = box
-                     xmin, ymin, xmax, ymax = b
-                     intsc = intersect([XMIN, YMIN, XMAX+threshold, YMAX], [xmin-threshold, ymin, xmax, ymax])
-                     if (intsc != 0 and pred==p):
-                         #if(abs(XMAX - xmin) < treshold and abs(YMIN - ymin) < 10):
-                         if(box in final_bbox):
-                             final_bbox[idx]= [XMIN, min(YMIN, ymin), xmax, max(YMAX, ymax)]
-                             final_words[idx] = word + ' ' + w
-                             continue
-
-                         print('box: {}, label: {} is close to b:{} with this p:{}--> {}'.format(box, pred, b, p, word + ' ' + w))
-
-         idx = idx +1
-     return final_bbox, final_preds, final_words
-
- def createDataframe(preds, words):
-     df = pd.DataFrame(columns = ['container id' ,'seal number', 'container quantity', 'container type', 'package quantity', 'tare', 'weight'])
-     flag_label = preds[0]
-     #print(preds)
-     #print(words)
-     #print('@@@@@')
-     #print(flag_label)
-     row_number = -1
-     for i in range(len(preds)):
-         #print('i is: {}'.format(i))
-         if (preds[i] == flag_label):
-             row_number = row_number + 1
-             df.at[row_number, preds[i]] = words[i]
-             #print('row number is: {}'.format(row_number))
-             continue
-         else:
-             #print('row_number {} is <= of df.shape {}'.format(row_number, df.shape[0]))
-             #print(pd.isna(df[preds[i]].iloc[row_number]))
-             #print(pd.isna(df[preds[i]].iloc[row_number]))
-             if(pd.isna(df[preds[i]].iloc[row_number])):
-                 df.at[row_number, preds[i]] = words[i]
-             else:
-                 row_number = row_number + 1
-                 df.at[row_number, preds[i]] = words[i]
-
-     return df
-
- def isInside(w, z):
-     # return True if w is inside z, if z is inside w return false
-     if(w[0] >= z[0] and w[1] >= z[1] and w[2] <= z[2] and w[3] <= z[3]):
-         return True
-     return False
-
- def removeSimilarItems(final_bbox, final_preds, final_words):
-     _bb =[]
-     _pp=[]
-     _ww=[]
-     for i in range(len(final_bbox)):
-         _bb.append(final_bbox[i])
-         _pp.append(final_preds[i])
-         _ww.append(final_words[i])
-         for j in range(len(final_bbox)):
-             if (final_bbox[i] == final_bbox[j]):
-                 continue
-             elif (isInside(final_bbox[i], final_bbox[j]) and final_preds[i]==final_preds[j] ):
-                 # box i is inside box j, so we have to remove it
-                 #print('box[i]: {} is inside box[j]:{}'.format(final_bbox[i], final_bbox[j]))
-                 _bb = _bb[:-1]
-                 _pp = _pp[:-1]
-                 _ww = _ww[:-1]
-                 continue
-     return _bb, _pp, _ww
-
- #[45.604, 2309.811, 66.652, 2391.6839999999997]
-
- def process_form(preds, words, bboxes):
-
-     final_bbox, final_preds, final_words = mergeCloseBoxes(preds, bboxes, words, 70)
-     _bbox, _preds, _words = removeSimilarItems(final_bbox, final_preds, final_words)
-     # convert float list to int
-     _bbox = [[int(x) for x in item ] for item in _bbox]
-     # creat data object for sorting
-     data = []
-     for index in range(len(_bbox)):
-         data.append((_bbox[index], _preds[index], _words[index]))
-     # sorting by the height of the page
-     sorted_list = sorted(
-         data,
-         key=lambda x: x[0][1]
-     )
-     _bbox = [item[0] for item in sorted_list]
-     _preds = [item[1] for item in sorted_list]
-     _words = [item[2] for item in sorted_list]
-     return _bbox, _preds, _words
-
- def mergeImageVertical(a):
-     list_im = a
-     imgs = [ Image.open(i) for i in list_im ]
-     # pick the image which is the smallest, and resize the others to match it (can be arbitrary image shape here)
-     min_shape = sorted( [(np.sum(i.size), i.size ) for i in imgs])[0][1]
-     imgs_comb = np.hstack([i.resize(min_shape) for i in imgs])
-
-     # for a vertical stacking it is simple: use vstack
-     imgs_comb = np.vstack([i.resize(min_shape) for i in imgs])
-     imgs_comb = Image.fromarray( imgs_comb)
-     imgs_comb.save( 'Trifecta_vertical.jpg' )
-     return imgs_comb
+ def imageconversion(pdffile):
+     doc = fitz.open(pdffile)
+     page = doc.load_page(0)
+     zoom = 2 # zoom factor
+     mat = fitz.Matrix(zoom, zoom)
+     pix = page.get_pixmap(matrix = mat, dpi = 300)
+     image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+     t = pix.save("page.jpg")
+     # img = removeBorders(image)
+     # noise_img = add_noise(np.array(image))
+     # image = Image.fromarray(noise_img)
+     return image


  def completepreprocess(pdffile):
+     t = imageconversion(pdffile)
+     image = t.convert("RGB")
+     width, height = image.size
+     if ocr_type == "PaddleOCR":
+         words, boxes = process_image_PaddleOCR(image, width, height)
+     elif ocr_type == "Pytesseract":
+         words, boxes = process_image_pytesseract(image, width, height)
      myDataFrame = pd.DataFrame()
      a=[]
      doc = fitz.open(pdffile)
@@ -348,23 +80,25 @@ def completepreprocess(pdffile):
      return im2,myDataFrame


- title = "Interactive demo: Manifesto Information Extraction model"
- description = "Manifesto Information Extraction - We use Microsoft’s LayoutLMv3 trained on the Manifesto dataset through csv's to predict the labels. To use it, simply upload a PDF or use the example PDF below and click ‘Submit’. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select ‘Open image in new tab’. Train = 63, Test = 15"
+ title = "OCR outputs"
+ description = ""

  css = """.output_image, .input_image {height: 600px !important}"""
  #examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
  # ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
  # ["744BJQ69.PDF"], ['tarros_2.jpg'],
- examples = [['3pages_messina.pdf'], ['messina2.jpg'], ['arkas1.jpg'], ['brointermed1.jpg'], ['brointermed2.pdf'], ['tarros_1.jpg'], ['tarros_3.jpg'], ['tarros_4.jpg']]

  iface = gr.Interface(fn=completepreprocess,
                       #inputs=gr.inputs.Image(type="pil",optional=True,label="upload file"),
-                      inputs=gr.File(label="PDF"),
+                      inputs=[
+                          gr.inputs.File(label="PDF"),
+                          gr.inputs.Dropdown(label="Select the Open Source OCR", choices=["PaddleOCR", "Pytesseract"]),
+                      ],
                       #inputs=gr.inputs.Image(type="pil")
                       outputs=[gr.outputs.Image(type="pil", label="annotated image"),"dataframe"] ,
                       title=title,
                       description=description,
-                      examples=examples,
+                      #examples=examples,
                       css=css,
                       analytics_enabled = True, enable_queue=True)
 
104