Update app.py
Browse files
app.py
CHANGED
@@ -18,6 +18,8 @@ import gradio as gr
|
|
18 |
from tqdm import tqdm
|
19 |
from scipy import ndimage
|
20 |
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
|
21 |
|
22 |
|
23 |
|
@@ -38,46 +40,54 @@ def imageconversion(pdffile):
|
|
38 |
pix = page.get_pixmap(matrix = mat,dpi = 300)
|
39 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
40 |
t=pix.save("page.jpg")
|
41 |
-
# img = removeBorders(image)
|
42 |
-
# noise_img = add_noise(np.array(image))
|
43 |
-
# image = Image.fromarray(noise_img)
|
44 |
return image
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
def completepreprocess(pdffile):
|
49 |
t=imageconversion(pdffile)
|
50 |
image = t.convert("RGB")
|
51 |
width,height=image.size
|
52 |
if ocr_type == "PaddleOCR":
|
53 |
-
words, boxes = process_image_PaddleOCR(image, width, height)
|
54 |
elif ocr_type == "Pytesseract":
|
55 |
-
words, boxes = process_image_pytesseract(image, width, height)
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
for i in range(0,len(doc)):
|
60 |
-
page = doc.load_page(i)
|
61 |
-
zoom = 2 # zoom factor
|
62 |
-
mat = fitz.Matrix(zoom, zoom)
|
63 |
-
pix = page.get_pixmap(matrix = mat,dpi = 200)
|
64 |
-
t=pix.save("page"+str(i)+".jpg")
|
65 |
-
images = Image.open("page"+str(i)+".jpg")
|
66 |
-
image = images.convert("RGB")
|
67 |
-
bbox, preds, words, image = process_image(image)
|
68 |
-
im, df = visualize_image(bbox, preds, words, image)
|
69 |
-
im1 = im.save("page"+str(i)+".jpg")
|
70 |
-
a.append("page"+str(i)+".jpg")
|
71 |
-
pred_list = []
|
72 |
-
for number in preds:
|
73 |
-
pred_list.append(iob_to_label(number))
|
74 |
-
_bbox, _preds, _words = process_form(pred_list, words, bbox)
|
75 |
-
print('page: ' + str(i) + ' ' + str(len(_preds))+ ' ' + str(len(_words)))
|
76 |
-
df = createDataframe(_preds, _words)
|
77 |
-
myDataFrame=myDataFrame.append(df)
|
78 |
-
|
79 |
-
im2=mergeImageVertical(a)
|
80 |
-
return im2,myDataFrame
|
81 |
|
82 |
|
83 |
title = "OCR outputs"
|
|
|
18 |
from tqdm import tqdm
|
19 |
from scipy import ndimage
|
20 |
from PIL import Image, ImageDraw, ImageFont
|
21 |
+
import paddleocr
|
22 |
+
from paddleocr import draw_ocr
|
23 |
|
24 |
|
25 |
|
|
|
40 |
pix = page.get_pixmap(matrix = mat,dpi = 300)
|
41 |
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
42 |
t=pix.save("page.jpg")
|
|
|
|
|
|
|
43 |
return image
|
44 |
|
45 |
+
def process_image_pytesseract(image, width, height):
    """Placeholder for the Pytesseract OCR backend.

    No OCR is performed yet: the function hands back empty results together
    with the untouched input image, mirroring the 4-tuple shape returned by
    the PaddleOCR backend.

    Args:
        image: Page image; only its ``size`` attribute is read.
        width: Overwritten from ``image.size``; the passed value is unused.
        height: Overwritten from ``image.size``; the passed value is unused.

    Returns:
        A tuple ``(words, boxes, scores, image)`` where the first three items
        are fresh empty lists and the last is the input image itself.
    """
    # TODO: implement the OCR step. An earlier sketch (since disabled) used
    # LayoutLMv3FeatureExtractor(apply_ocr=True, lang=lang) and read
    # ``.words`` / ``.boxes`` from its encoding.
    width, height = image.size
    return [], [], [], image
|
52 |
|
53 |
+
def process_image_PaddleOCR(image, width, height):
    """Run PaddleOCR on a page image and collect words, boxes and scores.

    Args:
        image: Page image; it is converted with ``np.array`` for OCR and its
            ``size`` attribute supplies the dimensions.
        width: Overwritten from ``image.size``; the passed value is unused.
        height: Overwritten from ``image.size``; the passed value is unused.

    Returns:
        A tuple ``(words, boxes, scores, output_image)``: the recognised
        strings, the boxes produced by ``create_bounding_box1`` (called with
        the ``1000 / width`` and ``1000 / height`` scale factors), the
        recognition confidences, and the annotated image returned by
        ``draw_ocr``.
    """
    ocr = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)
    width, height = image.size
    width_scale = 1000 / width
    height_scale = 1000 / height

    # Perform OCR on the image
    results = ocr.ocr(np.array(image))

    # Extract the words and bounding boxes from the OCR results
    words = []
    boxes = []
    scores = []
    # Some PaddleOCR releases return None (or a None per-page entry) when no
    # text is detected; iterating that would raise TypeError, so skip it.
    for line in results or []:
        if not line:
            continue
        for polygon, recognition in line:
            words.append(recognition[0])
            scores.append(recognition[1])
            boxes.append(create_bounding_box1(polygon, width_scale, height_scale))

    # NOTE(review): draw_ocr receives the boxes returned by
    # create_bounding_box1 (built with the 1000/size scale factors), not the
    # raw detector polygons -- confirm that is the intended drawing input.
    output_image = draw_ocr(image, boxes, words, scores, font_path='coolvetica rg.otf')
    return words, boxes, scores, output_image
|
74 |
+
|
75 |
+
def createDataframe(boxes, words, scores):
    """Build a table with one row per recognised word.

    Args:
        boxes: Sequence of bounding boxes, one per word.
        words: Sequence of recognised strings.
        scores: Sequence of recognition confidences.

    Returns:
        A ``pandas.DataFrame`` with columns ``bbox``, ``text`` and ``score``.

    Raises:
        ValueError: If the three sequences differ in length (raised by
            pandas).
    """
    # Build column-wise: passing [boxes, words, scores] as a list would treat
    # the three sequences as three *rows*, which fails (or mislabels data)
    # unless exactly three words were recognised.
    df = pd.DataFrame({'bbox': boxes, 'text': words, 'score': scores})
    return df
|
78 |
+
|
79 |
|
80 |
def completepreprocess(pdffile):
    """Convert a PDF to an image, run the selected OCR backend, tabulate results.

    The backend is chosen by the module-level ``ocr_type`` value
    (``"PaddleOCR"`` or ``"Pytesseract"``).

    Args:
        pdffile: The PDF input, passed straight to ``imageconversion``.

    Returns:
        A tuple ``(output_img, dataframe)``: the image returned by the OCR
        backend and the table built by ``createDataframe``.

    Raises:
        ValueError: If ``ocr_type`` names no supported backend.
    """
    t = imageconversion(pdffile)
    image = t.convert("RGB")
    width, height = image.size
    if ocr_type == "PaddleOCR":
        words, boxes, scores, output_img = process_image_PaddleOCR(image, width, height)
    elif ocr_type == "Pytesseract":
        words, boxes, scores, output_img = process_image_pytesseract(image, width, height)
    else:
        # Without this branch the names below would be unbound and the call
        # would fail later with a confusing UnboundLocalError.
        raise ValueError(f"Unsupported ocr_type: {ocr_type!r}")

    dataframe = createDataframe(boxes, words, scores)
    # Return the table that was just built; the previous code returned the
    # undefined name ``myDataFrame``.
    return output_img, dataframe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
|
93 |
title = "OCR outputs"
|