Spaces:

atatavana
/

OCR_check

Build error

App Files Files Community

atatavana committed on Mar 20, 2023

Commit

358bc3a

1 Parent(s): fcc45ff

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -30

app.py CHANGED Viewed

@@ -18,6 +18,8 @@ import gradio as gr
 from tqdm import tqdm
 from scipy import ndimage
 from PIL import Image, ImageDraw, ImageFont
@@ -38,46 +40,54 @@ def imageconversion(pdffile):
   pix = page.get_pixmap(matrix = mat,dpi = 300)
   image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
   t=pix.save("page.jpg")
-  # img = removeBorders(image)
-  # noise_img = add_noise(np.array(image))
-  # image = Image.fromarray(noise_img)
   return image
 def completepreprocess(pdffile):
     t=imageconversion(pdffile)
     image = t.convert("RGB")
     width,height=image.size
     if ocr_type == "PaddleOCR":
-        words, boxes = process_image_PaddleOCR(image, width, height)
     elif ocr_type == "Pytesseract":
-        words, boxes = process_image_pytesseract(image, width, height)
-  myDataFrame = pd.DataFrame()
-  a=[]
-  doc = fitz.open(pdffile)
-  for i in range(0,len(doc)):
-    page = doc.load_page(i)
-    zoom = 2    # zoom factor
-    mat = fitz.Matrix(zoom, zoom)
-    pix = page.get_pixmap(matrix = mat,dpi = 200)
-    t=pix.save("page"+str(i)+".jpg")
-    images = Image.open("page"+str(i)+".jpg")
-    image = images.convert("RGB")
-    bbox, preds, words, image = process_image(image)
-    im, df = visualize_image(bbox, preds, words, image)
-    im1 = im.save("page"+str(i)+".jpg")
-    a.append("page"+str(i)+".jpg")
-    pred_list = []
-    for number in preds:
-      pred_list.append(iob_to_label(number))
-    _bbox, _preds, _words = process_form(pred_list, words, bbox)
-    print('page: ' + str(i) + '  ' + str(len(_preds))+ '  ' + str(len(_words)))
-    df = createDataframe(_preds, _words)
-    myDataFrame=myDataFrame.append(df)
-  im2=mergeImageVertical(a)
-  return im2,myDataFrame
 title = "OCR outputs"

 from tqdm import tqdm
 from scipy import ndimage
 from PIL import Image, ImageDraw, ImageFont
+import paddleocr
+from paddleocr import draw_ocr
   pix = page.get_pixmap(matrix = mat,dpi = 300)
   image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
   t=pix.save("page.jpg")
   return image
+def process_image_pytesseract(image,width,height):
+    width, height = image.size
+    #feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True,lang=lang)
+    #encoding_feature_extractor = feature_extractor(image, return_tensors="pt",truncation=True)
+    #words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
+    words, boxes, scores = [], [], []
+    return words,boxes,scores,image
+def process_image_PaddleOCR(image,width,height):
+    ocr = paddleocr.PaddleOCR(lang='en',use_gpu=False, use_angle_cls=True)
+    width, height = image.size
+    width_scale = 1000 / width
+    height_scale = 1000 / height
+    # Perform OCR on the image
+    results = ocr.ocr(np.array(image))
+    # Extract the words and bounding boxes from the OCR results
+    words = []
+    boxes = []
+    scores = []
+    for line in results:
+        for bbox in line:
+            words.append(bbox[1][0])
+            scores.append(bbox[1][1])
+            boxes.append(create_bounding_box1(bbox[0], width_scale, height_scale))
+    output_image = draw_ocr(image, boxes, words, scores, font_path='coolvetica rg.otf')
+    return words, boxes, scores, output_image
+def createDataframe(boxes, words, scores):
+    df = pd.DataFrame([boxes, words, scores], columns=['bbox','text', 'score'])
+    return df
 def completepreprocess(pdffile):
     t=imageconversion(pdffile)
     image = t.convert("RGB")
     width,height=image.size
     if ocr_type == "PaddleOCR":
+        words, boxes, scores, output_img = process_image_PaddleOCR(image, width, height)
     elif ocr_type == "Pytesseract":
+        words, boxes, scores, output_img = process_image_pytesseract(image, width, height)
+    dataframe = createDataframe(boxes, words, scores)
+    return output_img,myDataFrame
 title = "OCR outputs"