File size: 4,040 Bytes
122792f
ec586c8
122792f
 
 
575a70c
122792f
 
 
 
 
 
 
 
 
 
358bc3a
 
122792f
 
 
 
 
 
 
 
 
 
 
 
a0df5e7
 
 
 
 
 
 
 
 
122792f
358bc3a
 
 
 
 
 
 
122792f
358bc3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1eefa79
358bc3a
 
 
 
 
0b0a013
358bc3a
 
122792f
4e720a7
a0df5e7
 
 
 
358bc3a
a0df5e7
358bc3a
3cc9394
 
122792f
358bc3a
ad6bbe5
122792f
 
a0df5e7
 
122792f
 
 
 
 
d8aa7b1
122792f
 
a0df5e7
 
4e720a7
 
122792f
 
 
 
d8aa7b1
122792f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
"""OCR check
"""

import os
#os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
import os, glob, fitz
import cv2
import os
import PIL
import pandas as pd
import numpy as np
import gradio as gr
from tqdm import tqdm
from scipy import ndimage
from PIL import Image, ImageDraw, ImageFont
import paddleocr
from paddleocr import draw_ocr



def unnormalize_box(bbox, width, height):
    """Rescale a box from a 0-1000 grid to the given width and height.

    Args:
        bbox: Indexable holding at least four numbers. Items 0 and 2 are
            scaled by *width*, items 1 and 3 by *height*; anything past
            index 3 is ignored.
        width: Extent applied to items 0 and 2.
        height: Extent applied to items 1 and 3.

    Returns:
        list: Four scaled values, in the same order as the input items.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    pairs = (
        (width, left),
        (height, top),
        (width, right),
        (height, bottom),
    )
    # Divide first, then multiply, so the float results match the
    # straightforward `extent * (coord / 1000)` formula exactly.
    return [extent * (coord / 1000) for extent, coord in pairs]

def imageconversion(pdffile):
    """Render the first page of a document to an RGB PIL image.

    As a side effect the rendered page is also written to ``page.jpg`` in
    the current working directory.

    Args:
        pdffile: Whatever ``fitz.open`` accepts as its first argument.

    Returns:
        PIL.Image.Image: The rendered first page, built in "RGB" mode from
        the pixmap's raw samples.
    """
    zoom = 2  # zoom factor
    mat = fitz.Matrix(zoom, zoom)
    # Open the document in a context manager so it is closed even when
    # rendering raises; previously the handle was never released.
    with fitz.open(pdffile) as doc:
        page = doc.load_page(0)
        # NOTE(review): PyMuPDF documents that `matrix` is ignored when
        # `dpi` is supplied, so the zoom above is presumably inert here.
        # The call is kept as-is to avoid changing the output resolution.
        pix = page.get_pixmap(matrix=mat, dpi=300)
        # Build the PIL image while the document is still open.
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        pix.save("page.jpg")
    return image

def process_image_pytesseract(image, width, height):
    """Placeholder OCR back-end that recognises nothing.

    No OCR engine is invoked here: the function only returns empty result
    lists together with the untouched input image.

    Args:
        image: Object exposing a two-item ``size`` attribute.
        width: Ignored; it is overwritten from ``image.size``.
        height: Ignored; it is overwritten from ``image.size``.

    Returns:
        tuple: ``(words, boxes, scores, image)`` where the first three are
        fresh empty lists and the last is the input image itself.
    """
    # The unpack is kept so an object without a proper `size` still fails
    # here, exactly as before.
    width, height = image.size
    return [], [], [], image

def process_image_PaddleOCR(image, width, height):
    """Run PaddleOCR on an image and draw the recognised text onto it.

    Args:
        image: Image that ``np.array`` can convert for OCR; the same object
            is handed to ``draw_ocr`` for annotation.
        width: Unused; kept so both OCR back-ends share one signature.
        height: Unused; kept for the same reason.

    Returns:
        tuple: ``(words, boxes, scores, output_image)``. The three lists are
        parallel (one entry per detection) and ``output_image`` is whatever
        ``draw_ocr`` returns. The lists are empty when nothing is detected.
    """
    # NOTE(review): the OCR engine is constructed on every call; consider
    # caching it if request latency matters.
    ocr = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)

    # Perform OCR on the image
    results = ocr.ocr(np.array(image))

    # Extract the words and bounding boxes from the OCR results
    words = []
    boxes = []
    scores = []
    for line in results or []:
        # A result entry can be None/empty when no text was found;
        # iterating it directly would raise TypeError.
        if not line:
            continue
        for detection in line:
            # Each detection is indexed as [box, (text, score)].
            boxes.append(detection[0])
            words.append(detection[1][0])
            scores.append(detection[1][1])

    output_image = draw_ocr(image, boxes, words, scores, font_path='coolvetica rg.otf')
    return words, boxes, scores, output_image

def createDataframe(boxes, words, scores):
    """Tabulate OCR detections as one row per (box, word, score) triple.

    Args:
        boxes: Iterable of bounding boxes.
        words: Iterable of recognised strings.
        scores: Iterable of confidence values.

    Returns:
        pandas.DataFrame: Columns ``bbox``, ``text`` and ``score``. The
        inputs are zipped, so the row count equals the shortest input.
    """
    column_names = ['bbox', 'text', 'score']
    rows = [(box, word, score) for box, word, score in zip(boxes, words, scores)]
    return pd.DataFrame(rows, columns=column_names)
    

def completepreprocess(pdffile, ocr_type):
    """Render the uploaded document, run the chosen OCR and tabulate results.

    Args:
        pdffile: Document handed to ``imageconversion``.
        ocr_type: Back-end name. ``"Pytesseract"`` selects the pytesseract
            back-end; every other value (including ``"PaddleOCR"``) selects
            PaddleOCR.

    Returns:
        tuple: ``(output_img, dataframe)``, the image returned by the OCR
        back-end and the table built by ``createDataframe``.
    """
    page_image = imageconversion(pdffile).convert("RGB")
    page_width, page_height = page_image.size

    # PaddleOCR is both the explicit choice and the fallback, so only the
    # pytesseract option needs to be singled out.
    if ocr_type == "Pytesseract":
        run_ocr = process_image_pytesseract
    else:
        run_ocr = process_image_PaddleOCR

    words, boxes, scores, output_img = run_ocr(page_image, page_width, page_height)
    return output_img, createDataframe(boxes, words, scores)


# --- Gradio UI wiring -------------------------------------------------------
title = "OCR outputs"
description = ""

# Force a fixed height on the input and output image panes.
css = ".output_image, .input_image {height: 600px !important}"

# Earlier sample sets, kept for reference:
#examples = [["461BHH69.PDF"],["AP-481-RF.PDF"],["DP-095-ML.PDF"],["DQ-231-LL.PDF"],["FK-941-ET.PDF"], ["FL-078-NH.PDF"]
#              ,["14ZZ69.PDF"],["74BCA69.PDF"],["254BEG69.PDF"],["761BJQ69.PDF"],["AB-486-EH.PDF"],["AZ-211-ZA.PDF"], ["CY-073-YV.PDF"]]
# ["744BJQ69.PDF"], ['tarros_2.jpg'],
examples = [['3.jpg']]

# The interface takes an uploaded file plus the OCR back-end choice and
# shows the annotated image next to the detections table.
iface = gr.Interface(
    fn=completepreprocess,
    inputs=[
        gr.inputs.File(label="PDF"),
        gr.inputs.Dropdown(
            label="Select the OCR",
            choices=["PaddleOCR", "Pytesseract"],
        ),
    ],
    outputs=[
        gr.outputs.Image(type="pil", label="annotated image"),
        "dataframe",
    ],
    title=title,
    description=description,
    examples=examples,
    css=css,
    analytics_enabled=True,
    enable_queue=True,
)

iface.launch(inline=False, debug=True)