|
|
|
"""OCR check |
|
""" |
|
|
|
import os |
|
|
|
import os, glob, fitz |
|
import cv2 |
|
import os |
|
import PIL |
|
import pandas as pd |
|
import numpy as np |
|
import gradio as gr |
|
from tqdm import tqdm |
|
from scipy import ndimage |
|
from PIL import Image, ImageDraw, ImageFont |
|
import paddleocr |
|
from paddleocr import draw_ocr |
|
|
|
|
|
|
|
def unnormalize_box(bbox, width, height):
    """Scale a box from a 0-1000 normalized grid to absolute coordinates.

    Args:
        bbox: Indexable holding at least four numbers. Entries 0 and 2 are
            scaled by ``width``; entries 1 and 3 are scaled by ``height``
            (presumably ``x0, y0, x1, y1`` -- confirm with callers).
        width: Extent used for entries 0 and 2.
        height: Extent used for entries 1 and 3.

    Returns:
        list: Four scaled values, in the same order as the input entries.
    """
    # One extent per output slot, alternating horizontal / vertical.
    extents = (width, height, width, height)
    return [extent * (bbox[idx] / 1000) for idx, extent in enumerate(extents)]
|
|
|
def imageconversion(pdffile):
    """Render the first page of a document to an RGB PIL image at 300 DPI.

    As a side effect the rendered page is also written to ``page.jpg`` in the
    current working directory.

    Args:
        pdffile: Value handed straight to ``fitz.open`` (the caller forwards
            whatever the file input produced).

    Returns:
        PIL.Image.Image: The rendered first page in ``"RGB"`` mode.
    """
    # Use the document as a context manager so its handle is released even
    # if rendering fails; previously it was never closed.
    with fitz.open(pdffile) as doc:
        page = doc.load_page(0)
        # Only ``dpi`` is passed: per the PyMuPDF documentation a non-None
        # ``dpi`` makes ``get_pixmap`` ignore ``matrix``, so the zoom matrix
        # that used to be supplied alongside it had no effect.
        pix = page.get_pixmap(dpi=300)
        # ``frombytes`` builds the image from the pixmap's sample bytes
        # while the document is still open.
        image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        pix.save("page.jpg")
    return image
|
|
|
def process_image_pytesseract(image, width, height):
    """Placeholder Pytesseract backend: performs no recognition.

    Args:
        image: Object exposing a ``size`` pair; returned unchanged.
        width: Overwritten from ``image.size`` and otherwise unused.
        height: Overwritten from ``image.size`` and otherwise unused.

    Returns:
        tuple: ``(words, boxes, scores, image)`` where the first three items
        are fresh empty lists and the last is the input image itself.
    """
    # The passed-in dimensions are replaced by the image's own size, but
    # nothing below consumes them.
    width, height = image.size
    return [], [], [], image
|
|
|
def process_image_PaddleOCR(image, width, height):
    """Run PaddleOCR on an image and draw the detections on it.

    Args:
        image: Image to recognise. It is converted with ``np.array`` for the
            OCR call and passed unchanged to ``draw_ocr``.
        width: Unused; kept so the signature matches the other OCR backend.
        height: Unused; kept so the signature matches the other OCR backend.

    Returns:
        tuple: ``(words, boxes, scores, output_image)``. The three lists are
        parallel (one entry per detection) and ``output_image`` is whatever
        ``draw_ocr`` returns for them.
    """
    # NOTE(review): the engine is rebuilt on every call; consider sharing one
    # instance if start-up cost matters and concurrent use is confirmed safe.
    ocr = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)
    results = ocr.ocr(np.array(image))

    words = []
    boxes = []
    scores = []
    # PaddleOCR reports "no text found" as ``None`` (for the whole result or
    # for an individual image entry); iterating that used to raise TypeError.
    for line in results or []:
        if not line:
            continue
        for detection in line:
            # Each detection is indexed as [box, (text, score)].
            boxes.append(detection[0])
            words.append(detection[1][0])
            scores.append(detection[1][1])

    output_image = draw_ocr(image, boxes, words, scores, font_path='coolvetica rg.otf')
    return words, boxes, scores, output_image
|
|
|
def createDataframe(boxes, words, scores):
    """Tabulate OCR detections, one row per detection.

    Args:
        boxes: Iterable of box values for the ``bbox`` column.
        words: Iterable of values for the ``text`` column.
        scores: Iterable of values for the ``score`` column.

    Returns:
        pandas.DataFrame: Columns ``bbox``, ``text`` and ``score``. Rows are
        formed with ``zip``, so the shortest input decides the row count.
    """
    column_names = ['bbox', 'text', 'score']
    rows = list(zip(boxes, words, scores))
    return pd.DataFrame(rows, columns=column_names)
|
|
|
|
|
def completepreprocess(pdffile, ocr_type):
    """Render the uploaded document, OCR it, and tabulate the detections.

    Args:
        pdffile: Uploaded file, forwarded to ``imageconversion``.
        ocr_type: Backend name. ``"Pytesseract"`` selects the Pytesseract
            routine; any other value (including ``"PaddleOCR"``) selects
            PaddleOCR.

    Returns:
        tuple: ``(output_img, dataframe)`` -- the image returned by the chosen
        backend and the table built by ``createDataframe``.
    """
    image = imageconversion(pdffile).convert("RGB")
    width, height = image.size

    # PaddleOCR is both the explicit choice and the fallback, so only the
    # Pytesseract option needs to be singled out.
    if ocr_type == "Pytesseract":
        run_ocr = process_image_pytesseract
    else:
        run_ocr = process_image_PaddleOCR
    words, boxes, scores, output_img = run_ocr(image, width, height)

    return output_img, createDataframe(boxes, words, scores)
|
|
|
|
|
# Static text shown on the Gradio page.
title = "OCR outputs"
description = ""

# Pin both image panes to a fixed height.
css = """.output_image, .input_image {height: 600px !important}"""

# Sample rows offered with the form.
examples = [['3.jpg']]

# Components are created in the same order as before: the file input, the
# OCR selector, then the image output.
_input_components = [
    gr.inputs.File(label="PDF"),
    gr.inputs.Dropdown(label="Select the OCR", choices=["PaddleOCR", "Pytesseract"]),
]
_output_components = [
    gr.outputs.Image(type="pil", label="annotated image"),
    "dataframe",
]

iface = gr.Interface(
    fn=completepreprocess,
    inputs=_input_components,
    outputs=_output_components,
    title=title,
    description=description,
    examples=examples,
    css=css,
    analytics_enabled=True,
    enable_queue=True,
)

# The server is started as soon as this module is executed.
iface.launch(inline=False, debug=True)