atatavana committed on
Commit
358bc3a
·
1 Parent(s): fcc45ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -30
app.py CHANGED
@@ -18,6 +18,8 @@ import gradio as gr
18
  from tqdm import tqdm
19
  from scipy import ndimage
20
  from PIL import Image, ImageDraw, ImageFont
 
 
21
 
22
 
23
 
@@ -38,46 +40,54 @@ def imageconversion(pdffile):
38
  pix = page.get_pixmap(matrix = mat,dpi = 300)
39
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
40
  t=pix.save("page.jpg")
41
- # img = removeBorders(image)
42
- # noise_img = add_noise(np.array(image))
43
- # image = Image.fromarray(noise_img)
44
  return image
45
 
 
 
 
 
 
 
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  def completepreprocess(pdffile):
49
  t=imageconversion(pdffile)
50
  image = t.convert("RGB")
51
  width,height=image.size
52
  if ocr_type == "PaddleOCR":
53
- words, boxes = process_image_PaddleOCR(image, width, height)
54
  elif ocr_type == "Pytesseract":
55
- words, boxes = process_image_pytesseract(image, width, height)
56
- myDataFrame = pd.DataFrame()
57
- a=[]
58
- doc = fitz.open(pdffile)
59
- for i in range(0,len(doc)):
60
- page = doc.load_page(i)
61
- zoom = 2 # zoom factor
62
- mat = fitz.Matrix(zoom, zoom)
63
- pix = page.get_pixmap(matrix = mat,dpi = 200)
64
- t=pix.save("page"+str(i)+".jpg")
65
- images = Image.open("page"+str(i)+".jpg")
66
- image = images.convert("RGB")
67
- bbox, preds, words, image = process_image(image)
68
- im, df = visualize_image(bbox, preds, words, image)
69
- im1 = im.save("page"+str(i)+".jpg")
70
- a.append("page"+str(i)+".jpg")
71
- pred_list = []
72
- for number in preds:
73
- pred_list.append(iob_to_label(number))
74
- _bbox, _preds, _words = process_form(pred_list, words, bbox)
75
- print('page: ' + str(i) + ' ' + str(len(_preds))+ ' ' + str(len(_words)))
76
- df = createDataframe(_preds, _words)
77
- myDataFrame=myDataFrame.append(df)
78
-
79
- im2=mergeImageVertical(a)
80
- return im2,myDataFrame
81
 
82
 
83
  title = "OCR outputs"
 
18
  from tqdm import tqdm
19
  from scipy import ndimage
20
  from PIL import Image, ImageDraw, ImageFont
21
+ import paddleocr
22
+ from paddleocr import draw_ocr
23
 
24
 
25
 
 
40
  pix = page.get_pixmap(matrix = mat,dpi = 300)
41
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
42
  t=pix.save("page.jpg")
 
 
 
43
  return image
44
 
def process_image_pytesseract(image, width, height):
    """Placeholder for the Pytesseract OCR path.

    No OCR is performed yet: the function returns empty result lists and
    hands the input image back unchanged.

    Args:
        image: Page image; must expose a two-item ``size`` attribute.
        width: Overwritten from ``image.size``; the passed value is unused.
        height: Overwritten from ``image.size``; the passed value is unused.

    Returns:
        Tuple ``(words, boxes, scores, image)`` where the first three items
        are empty lists and ``image`` is the object that was passed in.
    """
    # The size is unpacked as in the PaddleOCR path, although nothing below
    # consumes it yet.
    width, height = image.size
    # TODO: run the actual OCR here (an earlier sketch used
    # LayoutLMv3FeatureExtractor with apply_ocr=True).
    words = []
    boxes = []
    scores = []
    return words, boxes, scores, image
52
 
def process_image_PaddleOCR(image, width, height):
    """Run PaddleOCR on a page image and collect words, boxes and scores.

    Args:
        image: Page image (converted with ``np.array`` before OCR); must
            expose a two-item ``size`` attribute.
        width: Overwritten from ``image.size``; the passed value is unused.
        height: Overwritten from ``image.size``; the passed value is unused.

    Returns:
        Tuple ``(words, boxes, scores, output_image)``: the recognised
        strings, the boxes produced by ``create_bounding_box1``, the
        recognition confidences, and the result of ``draw_ocr``.
    """
    ocr = paddleocr.PaddleOCR(lang='en', use_gpu=False, use_angle_cls=True)
    width, height = image.size
    # Factors that map pixel coordinates onto a 0-1000 range; they are
    # handed to create_bounding_box1 together with each detected polygon.
    width_scale = 1000 / width
    height_scale = 1000 / height

    # Perform OCR on the image
    results = ocr.ocr(np.array(image))

    # Extract the words and bounding boxes from the OCR results
    words = []
    boxes = []
    scores = []
    # PaddleOCR may report a page without any detected text as None (or as
    # an empty entry); skip those instead of failing on iteration.
    for line in results or []:
        if not line:
            continue
        for detection in line:
            # Each detection is indexed as [polygon, (text, confidence)].
            words.append(detection[1][0])
            scores.append(detection[1][1])
            boxes.append(create_bounding_box1(detection[0], width_scale, height_scale))

    # NOTE(review): draw_ocr receives the rescaled boxes, not the raw
    # polygons returned by the OCR engine - confirm that the overlay lines
    # up with the page image.
    output_image = draw_ocr(image, boxes, words, scores, font_path='coolvetica rg.otf')
    return words, boxes, scores, output_image
74
+
def createDataframe(boxes, words, scores):
    """Build a table with one row per recognised word.

    Args:
        boxes: Bounding boxes, one per word.
        words: Recognised text strings.
        scores: Recognition confidences, one per word.

    Returns:
        A ``pandas.DataFrame`` with the columns ``bbox``, ``text`` and
        ``score``; it is empty (but keeps the columns) when no words are
        given.

    Raises:
        ValueError: If the three sequences differ in length.
    """
    # Build column-wise. Passing [boxes, words, scores] as positional data
    # makes each list a *row*, which only matches the three column names
    # when there happen to be exactly three words.
    return pd.DataFrame({
        'bbox': list(boxes),
        'text': list(words),
        'score': list(scores),
    })
78
+
79
 
def completepreprocess(pdffile):
    """Convert a PDF to an image, run the selected OCR engine and tabulate it.

    The engine is chosen by the free variable ``ocr_type``, which is not
    defined inside this function.

    Args:
        pdffile: PDF input forwarded to ``imageconversion``.

    Returns:
        Tuple ``(output_img, dataframe)``: the image returned by the OCR
        step and the table built by ``createDataframe``.

    Raises:
        ValueError: If ``ocr_type`` is neither ``"PaddleOCR"`` nor
            ``"Pytesseract"``.
    """
    page = imageconversion(pdffile)
    image = page.convert("RGB")
    width, height = image.size
    if ocr_type == "PaddleOCR":
        words, boxes, scores, output_img = process_image_PaddleOCR(image, width, height)
    elif ocr_type == "Pytesseract":
        words, boxes, scores, output_img = process_image_pytesseract(image, width, height)
    else:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(f"Unsupported ocr_type: {ocr_type!r}")

    dataframe = createDataframe(boxes, words, scores)
    # Return the table that was just built; the earlier name
    # ``myDataFrame`` no longer exists in this function.
    return output_img, dataframe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
 
93
  title = "OCR outputs"