# OCR_check / app.py
# atatavana's picture
# Update app.py
# 0b0a013
# -*- coding: utf-8 -*-
"""OCR check
"""
import os
#os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
import os, glob, fitz
import cv2
import os
import PIL
import pandas as pd
import numpy as np
import gradio as gr
from tqdm import tqdm
from scipy import ndimage
from PIL import Image, ImageDraw, ImageFont
import paddleocr
from paddleocr import draw_ocr
def unnormalize_box(bbox, width, height):
#print('shape is: ', np.asarray(bbox).shape, ' and box has values: ', bbox)
return [
width * (bbox[0] / 1000),
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]
def imageconversion(pdffile):
    """Render the first page of a PDF to an RGB PIL image.

    As a side effect, also writes the rendered page to "page.jpg" in the
    working directory (kept for parity with the original behavior).

    Args:
        pdffile: path to the PDF file to render.

    Returns:
        PIL.Image.Image of page 0.
    """
    doc = fitz.open(pdffile)
    page = doc.load_page(0)  # first page only
    zoom = 2  # 2x zoom factor for a higher-resolution OCR input
    mat = fitz.Matrix(zoom, zoom)
    # NOTE(review): supplying both a zoom matrix and dpi=300 is redundant —
    # PyMuPDF ignores the matrix when dpi is given; confirm which resolution
    # is actually intended.
    pix = page.get_pixmap(matrix=mat, dpi=300)
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    pix.save("page.jpg")  # side effect retained; return value of save() is unused
    doc.close()  # fix: release the document handle instead of leaking it
    return image
def process_image_pytesseract(image, width, height):
    """Placeholder for a pytesseract OCR backend.

    NOTE(review): this backend is not implemented — it returns empty
    word/box/score lists and the input image unchanged, so selecting
    "Pytesseract" in the UI yields an empty dataframe. TODO: wire up an
    actual pytesseract (or LayoutLMv3 feature-extractor) call.

    Args:
        image: PIL image to OCR.
        width: unused; kept for interface parity with the PaddleOCR backend.
        height: unused; same as above.

    Returns:
        (words, boxes, scores, image): three empty lists plus the input image.
    """
    # Fix: dropped the dead `width, height = image.size` line that shadowed
    # the parameters without being used.
    words, boxes, scores = [], [], []
    return words, boxes, scores, image
def process_image_PaddleOCR(image, width, height):
    """Run PaddleOCR on a PIL image and return recognized text with boxes.

    Args:
        image: PIL image to OCR.
        width: unused; kept for interface parity with the pytesseract backend.
        height: unused; same as above.
        (Fix: removed the dead width_scale/height_scale computations.)

    Returns:
        (words, boxes, scores, output_image) where output_image is the input
        annotated with the detections via paddleocr.draw_ocr.
    """
    # Fix: build the OCR engine once and reuse it across calls — constructing
    # PaddleOCR loads the detection/recognition models, which is expensive.
    engine = getattr(process_image_PaddleOCR, "_engine", None)
    if engine is None:
        engine = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)
        process_image_PaddleOCR._engine = engine
    # Perform OCR on the image.
    results = engine.ocr(np.array(image))
    # Extract the words, bounding boxes and confidences from the results.
    # Each detection is [box, (text, score)].
    words, boxes, scores = [], [], []
    for line in results:
        if not line:
            # Fix: PaddleOCR reports an empty page as None/[]; the original
            # nested loop would crash on it.
            continue
        for detection in line:
            words.append(detection[1][0])
            scores.append(detection[1][1])
            boxes.append(detection[0])
    output_image = draw_ocr(image, boxes, words, scores, font_path='coolvetica rg.otf')
    return words, boxes, scores, output_image
def createDataframe(boxes, words, scores):
    """Tabulate OCR output as a DataFrame with columns bbox, text, score.

    Args:
        boxes: list of bounding boxes.
        words: list of recognized text strings.
        scores: list of confidence scores.

    Returns:
        pandas.DataFrame with one row per detection.
    """
    # zip() keeps the original truncate-to-shortest behavior when the
    # input lists differ in length.
    records = [
        {"bbox": box, "text": word, "score": score}
        for box, word, score in zip(boxes, words, scores)
    ]
    return pd.DataFrame(records, columns=['bbox', 'text', 'score'])
def completepreprocess(pdffile, ocr_type):
    """Render a PDF's first page, OCR it, and tabulate the detections.

    Args:
        pdffile: path to the uploaded PDF.
        ocr_type: backend name, "PaddleOCR" or "Pytesseract"; any other
            value falls back to PaddleOCR.

    Returns:
        (annotated_image, dataframe) for display in the Gradio UI.
    """
    rendered = imageconversion(pdffile)
    rgb_image = rendered.convert("RGB")
    page_width, page_height = rgb_image.size
    # Select the OCR backend; unrecognized choices use PaddleOCR, matching
    # the original if/elif/else fallback.
    if ocr_type == "Pytesseract":
        backend = process_image_pytesseract
    else:
        backend = process_image_PaddleOCR
    words, boxes, scores, output_img = backend(rgb_image, page_width, page_height)
    dataframe = createDataframe(boxes, words, scores)
    return output_img, dataframe
# ---- Gradio UI wiring ----
title = "OCR outputs"
description = ""
# Pin the image panels to a fixed 600px height.
css = """.output_image, .input_image {height: 600px !important}"""
#examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
#            ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
#            ["744BJQ69.PDF"], ['tarros_2.jpg'],
examples = [['3.jpg']]
# NOTE(review): gr.inputs / gr.outputs and the enable_queue kwarg are the
# legacy pre-4.x Gradio namespaces — confirm the pinned gradio version in
# this Space still supports them before upgrading.
iface = gr.Interface(fn=completepreprocess,
                     #inputs=gr.inputs.Image(type="pil",optional=True,label="upload file"),
                     # Inputs: the uploaded PDF and a dropdown choosing the OCR backend
                     # (values must match the strings checked in completepreprocess).
                     inputs=[
                     gr.inputs.File(label="PDF"),
                     gr.inputs.Dropdown(label="Select the OCR", choices=["PaddleOCR", "Pytesseract"]),
                     ],
                     #inputs=gr.inputs.Image(type="pil")
                     # Outputs: the annotated page image and the word/bbox/score table.
                     outputs=[gr.outputs.Image(type="pil", label="annotated image"),"dataframe"] ,
                     title=title,
                     description=description,
                     examples=examples,
                     css=css,
                     analytics_enabled = True, enable_queue=True)
# debug=True keeps the process attached and prints errors to the console.
iface.launch(inline=False , debug=True)