File size: 5,986 Bytes
a228fac 16d7e9b a228fac 16d7e9b a228fac 1be0846 a228fac 80d398c a228fac d0225fc a228fac 1be0846 75c54a9 1be0846 75c54a9 1be0846 75c54a9 1be0846 16d7e9b 1be0846 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import io
import json

import numpy as np
import pandas as pd
import requests
from google.cloud import vision
from google.oauth2 import service_account
from google.protobuf.json_format import MessageToJson
from PIL import Image
# Glob patterns matching JPEG and PNG file names.
# NOTE(review): not referenced in the visible part of this module --
# presumably consumed by importing code; confirm before removing.
image_ext = ("*.jpg", "*.jpeg", "*.png")
class VisionClient:
    """Run Google Cloud Vision document OCR and tabulate the detected boxes.

    The pipeline is ``ocr`` -> ``get_response`` -> ``post_process`` ->
    ``convert_to_df``; each stage can also be called on its own.
    """

    # Columns of the frame produced by ``convert_to_df``. Used to build an
    # empty, correctly-shaped result when there is nothing to tabulate.
    _OUTPUT_COLUMNS = (
        "id", "text", "left", "width", "top", "height",
        "left_scaled", "width_scaled", "top_scaled", "height_scaled",
        "right_scaled", "bottom_scaled",
    )

    def __init__(self, auth):
        """Create the Vision client from service-account credentials.

        Args:
            auth: Service-account info passed to
                ``Credentials.from_service_account_info`` (for example the
                parsed content of a client-secret JSON file).
        """
        credentials = service_account.Credentials.from_service_account_info(
            auth
        )
        self.client = vision.ImageAnnotatorClient(credentials=credentials)

    def send_request(self, image):
        """Send image content to ``document_text_detection``.

        Args:
            image: Raw image content handed to ``vision.Image(content=...)``.

        Returns:
            The Vision response, or ``None`` when the image could not be
            wrapped in a ``vision.Image``.
        """
        try:
            image = vision.Image(content=image)
        except ValueError:
            print("Image could not be read")
            # Do not forward content that Vision could not even wrap.
            return None
        return self.client.document_text_detection(image, timeout=60)

    def get_response(self, content):
        """Best-effort wrapper around ``send_request``.

        Any exception is reported on stdout instead of being propagated.

        Returns:
            The Vision response, or ``None`` when the request failed.
        """
        try:
            return self.send_request(content)
        except Exception as e:
            print("OCR request failed. Reason : {}".format(e))
            # Previously the function fell through to ``return resp_js`` with
            # the name unbound, masking the real failure with an
            # UnboundLocalError.
            return None

    def post_process(self, resp_js):
        """Convert a Vision response into a list of box dictionaries.

        Args:
            resp_js: Object exposing ``text_annotations``; ``None`` is
                treated as "no annotations".

        Returns:
            A list of dicts with keys ``id``, ``text``, ``left``, ``width``,
            ``top`` and ``height``. ``id`` counts from 0.
        """
        if resp_js is None:
            return []

        annotations = iter(resp_js.text_annotations)
        # The first annotation is skipped -- presumably because Vision
        # returns the whole-page text there; confirm against the API docs.
        next(annotations, None)

        boxObjects = []
        for box_id, annotation in enumerate(annotations):
            vertices = annotation.bounding_poly.vertices
            # Vision sometimes reverses the left and right coordinates, which
            # would give a negative width and break drawing of link buttons,
            # so always take the smaller x as the left edge.
            left_x = min(vertices[1].x, vertices[3].x)
            right_x = max(vertices[1].x, vertices[3].x)
            boxObjects.append({
                "id": box_id,
                "text": annotation.description,
                "left": left_x,
                "width": right_x - left_x,
                "top": vertices[1].y,
                "height": vertices[3].y - vertices[1].y,
            })
        return boxObjects

    def convert_to_df(self, boxObjects, image):
        """Tabulate boxes and add coordinates rescaled to a 0-1000 range.

        Args:
            boxObjects: List of dicts as produced by ``post_process``.
            image: Object whose ``size`` attribute is ``(width, height)``.

        Returns:
            A ``pandas.DataFrame`` with the original columns plus
            ``*_scaled`` columns rounded to integers. Rows with missing
            values or whitespace-only text are removed. An empty input
            yields an empty frame with the same columns.
        """
        ocr_df = pd.DataFrame(boxObjects)
        if ocr_df.empty:
            # Without rows there are no ``left``/``top`` columns to scale.
            return pd.DataFrame(columns=list(self._OUTPUT_COLUMNS))

        width, height = image.size
        w_scale = 1000 / width
        h_scale = 1000 / height

        ocr_df = ocr_df.dropna().assign(
            left_scaled=ocr_df["left"] * w_scale,
            width_scaled=ocr_df["width"] * w_scale,
            top_scaled=ocr_df["top"] * h_scale,
            height_scaled=ocr_df["height"] * h_scale,
            right_scaled=lambda x: x.left_scaled + x.width_scaled,
            bottom_scaled=lambda x: x.top_scaled + x.height_scaled,
        )

        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        # Blank or whitespace-only text becomes NaN so dropna removes it.
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        return ocr_df

    def ocr(self, content, image):
        """Run the full pipeline for one image.

        Args:
            content: Raw image content sent to Vision.
            image: Object whose ``size`` attribute is ``(width, height)``,
                used for rescaling the coordinates.

        Returns:
            The frame built by ``convert_to_df``; empty when the request
            failed or nothing was detected.
        """
        resp_js = self.get_response(content)
        boxObjects = self.post_process(resp_js)
        return self.convert_to_df(boxObjects, image)
class TrOCRClient():
    """Send cropped handwriting images to an OCR HTTP endpoint.

    Each crop is posted as a JPEG to ``api_url``; the recognised text is
    combined with the crop's position into a frame that also carries
    coordinates rescaled to a 0-1000 range.
    """

    # Columns of the frame produced by ``convert_to_df``. Used to build an
    # empty, correctly-shaped result when there is nothing to tabulate.
    _OUTPUT_COLUMNS = (
        "id", "text", "left", "width", "top", "height",
        "left_scaled", "width_scaled", "top_scaled", "height_scaled",
        "right_scaled", "bottom_scaled",
    )

    def __init__(self, api_url, timeout=None):
        """Store the endpoint configuration.

        Args:
            api_url: URL the JPEG crops are posted to.
            timeout: Optional timeout in seconds forwarded to
                ``requests.post``. ``None`` (the default) waits
                indefinitely, as before.
        """
        self.api_url = api_url
        self.timeout = timeout

    def convert_to_df(self, boxObjects, image):
        """Tabulate boxes and add coordinates rescaled to a 0-1000 range.

        Args:
            boxObjects: List of dicts with keys ``id``, ``text``, ``left``,
                ``width``, ``top`` and ``height``.
            image: Object whose ``size`` attribute is ``(width, height)``.

        Returns:
            A ``pandas.DataFrame`` with the original columns plus
            ``*_scaled`` columns rounded to integers. Rows with missing
            values or whitespace-only text are removed. An empty input
            yields an empty frame with the same columns.
        """
        ocr_df = pd.DataFrame(boxObjects)
        if ocr_df.empty:
            # Without rows there are no ``left``/``top`` columns to scale.
            return pd.DataFrame(columns=list(self._OUTPUT_COLUMNS))

        width, height = image.size
        w_scale = 1000 / width
        h_scale = 1000 / height

        ocr_df = ocr_df.dropna().assign(
            left_scaled=ocr_df["left"] * w_scale,
            width_scaled=ocr_df["width"] * w_scale,
            top_scaled=ocr_df["top"] * h_scale,
            height_scaled=ocr_df["height"] * h_scale,
            right_scaled=lambda x: x.left_scaled + x.width_scaled,
            bottom_scaled=lambda x: x.top_scaled + x.height_scaled,
        )

        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        # Blank or whitespace-only text becomes NaN so dropna removes it.
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        return ocr_df

    def send_request(self, handwritten_img):
        """Post one crop to the OCR endpoint and return the recognised text.

        Args:
            handwritten_img: Image object exposing
                ``save(buffer, format='JPEG')``.

        Returns:
            The ``"ocr_result"`` field of the JSON reply, or ``None`` when
            the endpoint answers with a status other than 200.
        """
        jpeg_bytes = io.BytesIO()
        handwritten_img.save(jpeg_bytes, format='JPEG')
        jpeg_content = jpeg_bytes.getvalue()

        # Send a POST request with the image file
        response = requests.post(
            self.api_url,
            files={"file": jpeg_content},
            timeout=self.timeout,
        )

        if response.status_code == 200:
            return response.json()["ocr_result"]

        print(f"Error: {response.text}")
        # Previously ``extracted_text`` was returned unbound on this path,
        # raising UnboundLocalError. ``None`` lets ``convert_to_df`` drop
        # the row instead.
        return None

    def ocr(self, handwritten_imgs, image):
        """Recognise every crop and tabulate the results.

        Args:
            handwritten_imgs: Sequence of indexable items where ``[0]`` is
                the crop image, ``[1]`` its left, ``[2]`` its top and
                ``[3]`` its width.
            image: Object whose ``size`` attribute is ``(width, height)``,
                used for rescaling the coordinates.

        Returns:
            The frame built by ``convert_to_df``.
        """
        boxObjects = []
        for i, handwritten_img in enumerate(handwritten_imgs):
            ocr_result = self.send_request(handwritten_img[0])
            boxObjects.append({
                # NOTE(review): ids start at -1 here because ``i`` starts at
                # 0; kept as-is in case consumers rely on it -- confirm.
                "id": i - 1,
                "text": ocr_result,
                "left": handwritten_img[1],
                "width": handwritten_img[3],
                "top": handwritten_img[2],
                # NOTE(review): assumes the height is stored at index 4,
                # following left/top/width at 1/2/3 -- confirm with caller.
                "height": handwritten_img[4],
            })
        ocr_df = self.convert_to_df(boxObjects, image)
        return ocr_df