kavg committed
Commit a228fac · 0 Parent(s)

Initial commit

Files changed (10)
  1. .gitignore +5 -0
  2. README.md +6 -0
  3. config.py +9 -0
  4. download_model.ipynb +144 -0
  5. main.py +66 -0
  6. models.py +236 -0
  7. ocr.py +89 -0
  8. preprocess.py +111 -0
  9. requirements.txt +0 -0
  10. token_classification.py +36 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ lilt-env/
+ .env
+ temp/
+ __pycache__/
+ models/
README.md ADDED
@@ -0,0 +1,6 @@
+ 1. Create a virtualenv
+ `virtualenv lilt-env`
+ 2. Install packages
+ `pip install -r requirements.txt`
+ 3. Run the app
+ `uvicorn main:app --reload`
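Once the server is up, the single endpoint (`/submit-doc`, defined in main.py) accepts a document image upload. A minimal client sketch — it assumes the `requests` package is installed, uvicorn's default address, and a hypothetical `sample_invoice.png` test image:

```python
import requests

# sample_invoice.png is a placeholder; any scanned form/document image should work
with open("sample_invoice.png", "rb") as f:
    resp = requests.post("http://127.0.0.1:8000/submit-doc", files={"file": f})

payload = resp.json()
# fields returned by main.py; each value is itself a JSON-encoded string
print(payload.keys())  # pred_relations, entities, input_ids, bboxes, token_labels
```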
config.py ADDED
@@ -0,0 +1,9 @@
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+ from pydantic import Field
+
+ class Settings(BaseSettings):
+     model_config = SettingsConfigDict(env_file='.env', env_file_encoding='utf-8')
+     GCV_AUTH: dict
+     SER_MODEL: str
+     TOKENIZER: str
+     RE_MODEL: str
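Settings are read from a `.env` file in the project root. A sketch of what it might contain — the variable names come from the `Settings` class above, the values are placeholders, the model paths match where `download_model.ipynb` saves the weights, and `GCV_AUTH` holds the Google Cloud service-account JSON on a single line:

```
GCV_AUTH={"type": "service_account", "project_id": "<your-project>", "private_key": "...", "client_email": "..."}
SER_MODEL=models/lilt-ser-iob
TOKENIZER=models/lilt-tokenizer
RE_MODEL=models/lilt-re
```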
download_model.ipynb ADDED
@@ -0,0 +1,144 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "d:\\FYP\\lilt-app-without-fd\\lilt-env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       " from .autonotebook import tqdm as notebook_tqdm\n"
+      ]
+     }
+    ],
+    "source": [
+     "from transformers import LiltModel, AutoTokenizer, LiltForTokenClassification"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Download tokenizer"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "('models/lilt-tokenizer\\\\tokenizer_config.json',\n",
+        " 'models/lilt-tokenizer\\\\special_tokens_map.json',\n",
+        " 'models/lilt-tokenizer\\\\tokenizer.json')"
+       ]
+      },
+      "execution_count": 2,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "TOKENIZER = 'nielsr/lilt-xlm-roberta-base'\n",
+     "tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)\n",
+     "save_dir = 'models/lilt-tokenizer'\n",
+     "tokenizer.save_pretrained(save_dir)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Download and save token classification model"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# download the model\n",
+     "MODEL = \"pierreguillou/lilt-xlm-roberta-base-finetuned-funsd-iob-original\"\n",
+     "model = LiltForTokenClassification.from_pretrained(MODEL)\n",
+     "\n",
+     "# save the model\n",
+     "save_dir = \"models/lilt-ser-iob\"\n",
+     "model.save_pretrained(save_dir)"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+     "## Download and save RE model"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Downloading config.json: 100%|██████████| 794/794 [00:00<00:00, 61.2kB/s]\n",
+       "d:\\FYP\\lilt-app-without-fd\\lilt-env\\lib\\site-packages\\huggingface_hub\\file_download.py:133: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\Gihantha Kavishka\\.cache\\huggingface\\hub. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
+       "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
+       " warnings.warn(message)\n",
+       "Downloading pytorch_model.bin: 100%|██████████| 1.15G/1.15G [08:10<00:00, 2.34MB/s]\n",
+       "Some weights of the model checkpoint at kavg/layoutxlm-finetuned-xfund-fr-re were not used when initializing LiltModel: ['extractor.rel_classifier.linear.weight', 'extractor.entity_emb.weight', 'extractor.ffnn_tail.0.weight', 'extractor.ffnn_tail.3.bias', 'extractor.ffnn_head.3.weight', 'extractor.ffnn_head.0.weight', 'extractor.ffnn_tail.0.bias', 'extractor.ffnn_head.3.bias', 'extractor.rel_classifier.bilinear.weight', 'extractor.rel_classifier.linear.bias', 'extractor.ffnn_head.0.bias', 'extractor.ffnn_tail.3.weight']\n",
+       "- This IS expected if you are initializing LiltModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+       "- This IS NOT expected if you are initializing LiltModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+       "Some weights of LiltModel were not initialized from the model checkpoint at kavg/layoutxlm-finetuned-xfund-fr-re and are newly initialized: ['lilt.pooler.dense.bias', 'lilt.pooler.dense.weight']\n",
+       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+      ]
+     }
+    ],
+    "source": [
+     "# download the model\n",
+     "MODEL = 'kavg/layoutxlm-finetuned-xfund-fr-re'\n",
+     "model = LiltModel.from_pretrained(MODEL)\n",
+     "\n",
+     "# save the model\n",
+     "save_dir = \"models/lilt-re\"\n",
+     "model.save_pretrained(save_dir)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "lilt-env",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.7.8"
+   },
+   "orig_nbformat": 4
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
main.py ADDED
@@ -0,0 +1,66 @@
+ from config import Settings
+ from preprocess import Preprocessor
+ import ocr
+ from PIL import Image
+ from transformers import LiltForTokenClassification
+ import token_classification
+ import torch
+ from fastapi import FastAPI, UploadFile
+ from contextlib import asynccontextmanager
+ import json
+ import io
+ from models import LiLTRobertaLikeForRelationExtraction
+ config = {}
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # load settings, the OCR client and both LiLT models once at startup
+     settings = Settings()
+     config['settings'] = settings
+     config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     config['vision_client'] = ocr.VisionClient(settings.GCV_AUTH)
+     config['processor'] = Preprocessor(settings.TOKENIZER)
+     config['ser_model'] = LiltForTokenClassification.from_pretrained(settings.SER_MODEL)
+     config['re_model'] = LiLTRobertaLikeForRelationExtraction.from_pretrained(settings.RE_MODEL)
+     yield
+     # Clean up and release the resources
+     config.clear()
+
+ app = FastAPI(lifespan=lifespan)
+
+ @app.post("/submit-doc")
+ async def ProcessDocument(file: UploadFile):
+     tokenClassificationOutput = await LabelTokens(file)
+     reOutput = ExtractRelations(tokenClassificationOutput)
+     return reOutput
+
+ async def LabelTokens(file):
+     content = await file.read()
+     image = Image.open(io.BytesIO(content))
+     ocr_df = config['vision_client'].ocr(content, image)
+     input_ids, attention_mask, token_type_ids, bbox, token_actual_boxes, offset_mapping = config['processor'].process(ocr_df, image=image)
+     token_labels = token_classification.classifyTokens(config['ser_model'], input_ids, attention_mask, bbox, offset_mapping)
+     return {"token_labels": token_labels, "input_ids": input_ids, "bbox": bbox, "offset_mapping": offset_mapping, "attention_mask": attention_mask}
+
+ def ExtractRelations(tokenClassificationOutput):
+     token_labels = tokenClassificationOutput['token_labels']
+     input_ids = tokenClassificationOutput['input_ids']
+     offset_mapping = tokenClassificationOutput["offset_mapping"]
+     attention_mask = tokenClassificationOutput["attention_mask"]
+     bbox = tokenClassificationOutput["bbox"]
+
+     entities = token_classification.createEntities(config['ser_model'], token_labels, input_ids, offset_mapping)
+
+     # move the RE model and its inputs to the same device before inference
+     device = config['device']
+     config['re_model'].to(device)
+     input_ids = input_ids.to(device)
+     attention_mask = attention_mask.to(device)
+     bbox = bbox.to(device)
+
+     entity_dict = {'start': [entity[0] for entity in entities], 'end': [entity[1] for entity in entities], 'label': [entity[3] for entity in entities]}
+     relations = [{'start_index': [], 'end_index': [], 'head': [], 'tail': []}]
+     with torch.no_grad():
+         outputs = config['re_model'](input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, entities=[entity_dict], relations=relations)
+
+     return {"pred_relations": json.dumps(outputs.pred_relations[0]), "entities": json.dumps(entities), "input_ids": json.dumps(input_ids.tolist()), "bboxes": json.dumps(bbox.tolist()), "token_labels": json.dumps(token_labels)}
models.py ADDED
@@ -0,0 +1,236 @@
+ from transformers import LiltPreTrainedModel, LiltModel
+ import copy
+ import torch
+ from torch import nn
+ from torch.nn import CrossEntropyLoss
+ from dataclasses import dataclass
+ from typing import Dict, Optional, Tuple
+ from transformers.utils import ModelOutput
+
+ class BiaffineAttention(torch.nn.Module):
+     """Implements a biaffine attention operator for binary relation classification.
+
+     PyTorch implementation of the biaffine attention operator from "End-to-end neural relation
+     extraction using deep biaffine attention" (https://arxiv.org/abs/1812.11275) which can be used
+     as a classifier for binary relation classification.
+
+     Args:
+         in_features (int): The size of the feature dimension of the inputs.
+         out_features (int): The size of the feature dimension of the output.
+
+     Shape:
+         - x_1: `(N, *, in_features)` where `N` is the batch dimension and `*` means any number of
+           additional dimensions.
+         - x_2: `(N, *, in_features)`, where `N` is the batch dimension and `*` means any number of
+           additional dimensions.
+         - Output: `(N, *, out_features)`, where `N` is the batch dimension and `*` means any number
+           of additional dimensions.
+
+     Examples:
+         >>> batch_size, in_features, out_features = 32, 100, 4
+         >>> biaffine_attention = BiaffineAttention(in_features, out_features)
+         >>> x_1 = torch.randn(batch_size, in_features)
+         >>> x_2 = torch.randn(batch_size, in_features)
+         >>> output = biaffine_attention(x_1, x_2)
+         >>> print(output.size())
+         torch.Size([32, 4])
+     """
+
+     def __init__(self, in_features, out_features):
+         super(BiaffineAttention, self).__init__()
+
+         self.in_features = in_features
+         self.out_features = out_features
+
+         self.bilinear = torch.nn.Bilinear(in_features, in_features, out_features, bias=False)
+         self.linear = torch.nn.Linear(2 * in_features, out_features, bias=True)
+
+         self.reset_parameters()
+
+     def forward(self, x_1, x_2):
+         return self.bilinear(x_1, x_2) + self.linear(torch.cat((x_1, x_2), dim=-1))
+
+     def reset_parameters(self):
+         self.bilinear.reset_parameters()
+         self.linear.reset_parameters()
+
+
+ class REDecoder(nn.Module):
+     def __init__(self, config, input_size):
+         super().__init__()
+         self.entity_emb = nn.Embedding(3, input_size, scale_grad_by_freq=True)
+         projection = nn.Sequential(
+             nn.Linear(input_size * 2, config.hidden_size),
+             nn.ReLU(),
+             nn.Dropout(config.hidden_dropout_prob),
+             nn.Linear(config.hidden_size, config.hidden_size // 2),
+             nn.ReLU(),
+             nn.Dropout(config.hidden_dropout_prob),
+         )
+         self.ffnn_head = copy.deepcopy(projection)
+         self.ffnn_tail = copy.deepcopy(projection)
+         self.rel_classifier = BiaffineAttention(config.hidden_size // 2, 2)
+         self.loss_fct = CrossEntropyLoss()
+
+     def build_relation(self, relations, entities):
+         batch_size = len(relations)
+         new_relations = []
+         for b in range(batch_size):
+             if len(entities[b]["start"]) <= 2:
+                 entities[b] = {"end": [1, 1], "label": [0, 0], "start": [0, 0]}
+             all_possible_relations = set(
+                 [
+                     (i, j)
+                     for i in range(len(entities[b]["label"]))
+                     for j in range(len(entities[b]["label"]))
+                     if entities[b]["label"][i] == 1 and entities[b]["label"][j] == 2
+                 ]
+             )
+             if len(all_possible_relations) == 0:
+                 all_possible_relations = set([(0, 1)])
+             positive_relations = set(list(zip(relations[b]["head"], relations[b]["tail"])))
+             negative_relations = all_possible_relations - positive_relations
+             positive_relations = set([i for i in positive_relations if i in all_possible_relations])
+             reordered_relations = list(positive_relations) + list(negative_relations)
+             relation_per_doc = {"head": [], "tail": [], "label": []}
+             relation_per_doc["head"] = [i[0] for i in reordered_relations]
+             relation_per_doc["tail"] = [i[1] for i in reordered_relations]
+             relation_per_doc["label"] = [1] * len(positive_relations) + [0] * (
+                 len(reordered_relations) - len(positive_relations)
+             )
+             assert len(relation_per_doc["head"]) != 0
+             new_relations.append(relation_per_doc)
+         return new_relations, entities
+
+     def get_predicted_relations(self, logits, relations, entities):
+         pred_relations = []
+         for i, pred_label in enumerate(logits.argmax(-1)):
+             if pred_label != 1:
+                 continue
+             rel = {}
+             rel["head_id"] = relations["head"][i]
+             rel["head"] = (entities["start"][rel["head_id"]], entities["end"][rel["head_id"]])
+             rel["head_type"] = entities["label"][rel["head_id"]]
+
+             rel["tail_id"] = relations["tail"][i]
+             rel["tail"] = (entities["start"][rel["tail_id"]], entities["end"][rel["tail_id"]])
+             rel["tail_type"] = entities["label"][rel["tail_id"]]
+             rel["type"] = 1
+             pred_relations.append(rel)
+         return pred_relations
+
+     def forward(self, hidden_states, entities, relations):
+         batch_size, max_n_words, context_dim = hidden_states.size()
+         device = hidden_states.device
+         relations, entities = self.build_relation(relations, entities)
+         loss = 0
+         all_pred_relations = []
+         all_logits = []
+         all_labels = []
+
+         for b in range(batch_size):
+             head_entities = torch.tensor(relations[b]["head"], device=device)
+             tail_entities = torch.tensor(relations[b]["tail"], device=device)
+             relation_labels = torch.tensor(relations[b]["label"], device=device)
+             entities_start_index = torch.tensor(entities[b]["start"], device=device)
+             entities_labels = torch.tensor(entities[b]["label"], device=device)
+             head_index = entities_start_index[head_entities]
+             head_label = entities_labels[head_entities]
+             head_label_repr = self.entity_emb(head_label)
+
+             tail_index = entities_start_index[tail_entities]
+             tail_label = entities_labels[tail_entities]
+             tail_label_repr = self.entity_emb(tail_label)
+
+             head_repr = torch.cat(
+                 (hidden_states[b][head_index], head_label_repr),
+                 dim=-1,
+             )
+             tail_repr = torch.cat(
+                 (hidden_states[b][tail_index], tail_label_repr),
+                 dim=-1,
+             )
+             heads = self.ffnn_head(head_repr)
+             tails = self.ffnn_tail(tail_repr)
+             logits = self.rel_classifier(heads, tails)
+             pred_relations = self.get_predicted_relations(logits, relations[b], entities[b])
+             all_pred_relations.append(pred_relations)
+             all_logits.append(logits)
+             all_labels.append(relation_labels)
+         all_logits = torch.cat(all_logits, 0)
+         all_labels = torch.cat(all_labels, 0)
+         loss = self.loss_fct(all_logits, all_labels)
+         return loss, all_pred_relations
+
+
+ @dataclass
+ class ReOutput(ModelOutput):
+     loss: Optional[torch.FloatTensor] = None
+     logits: torch.FloatTensor = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+     entities: Optional[Dict] = None
+     relations: Optional[Dict] = None
+     pred_relations: Optional[Dict] = None
+
+ class REHead(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+         self.extractor = REDecoder(config, config.hidden_size)
+
+     def forward(self, sequence_output, entities, relations):
+         sequence_output = self.dropout(sequence_output)
+         loss, pred_relations = self.extractor(sequence_output, entities, relations)
+         return ReOutput(
+             loss=loss,
+             entities=entities,
+             relations=relations,
+             pred_relations=pred_relations,
+         )
+
+ class LiLTRobertaLikeForRelationExtraction(LiltPreTrainedModel):
+     _keys_to_ignore_on_load_unexpected = [r"pooler"]
+     _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.lilt = LiltModel(config, add_pooling_layer=False)
+         self.rehead = REHead(config)
+         self.init_weights()
+
+     def forward(
+         self,
+         input_ids=None,
+         bbox=None,
+         attention_mask=None,
+         token_type_ids=None,
+         position_ids=None,
+         head_mask=None,
+         inputs_embeds=None,
+         labels=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+         entities=None,
+         relations=None,
+     ):
+
+         outputs = self.lilt(
+             input_ids,
+             bbox=bbox,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = outputs[0]
+
+         re_output = self.rehead(sequence_output, entities, relations)
+         return re_output
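For intuition on what `REDecoder.build_relation` does: entity label 1 marks questions and 2 marks answers, every question index paired with every answer index becomes a candidate relation, and candidates present in the gold relations keep label 1 while the rest get label 0. A standalone sketch of that pairing logic with toy data (not the classes above):

```python
# toy document: entity 0 is a question (1), entities 1 and 2 are answers (2)
entities = {"start": [4, 9, 15], "end": [6, 12, 17], "label": [1, 2, 2]}
gold = {"head": [0], "tail": [1]}  # entity 0 is linked to entity 1

# same comprehension used in build_relation: all question -> answer pairs
candidates = {
    (i, j)
    for i in range(len(entities["label"]))
    for j in range(len(entities["label"]))
    if entities["label"][i] == 1 and entities["label"][j] == 2
}
positive = set(zip(gold["head"], gold["tail"])) & candidates
negative = candidates - positive
print(sorted(candidates))              # [(0, 1), (0, 2)]
print(sorted(positive), sorted(negative))  # [(0, 1)] [(0, 2)]
```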
ocr.py ADDED
@@ -0,0 +1,89 @@
+ from google.cloud import vision
+ from google.oauth2 import service_account
+ from google.protobuf.json_format import MessageToJson
+ import pandas as pd
+ import json
+ import numpy as np
+ from PIL import Image
+ import io
+
+ image_ext = ("*.jpg", "*.jpeg", "*.png")
+
+ class VisionClient:
+     def __init__(self, auth):
+         credentials = service_account.Credentials.from_service_account_info(
+             auth
+         )
+         self.client = vision.ImageAnnotatorClient(credentials=credentials)
+
+     def send_request(self, image):
+         try:
+             image = vision.Image(content=image)
+         except ValueError as e:
+             print("Image could not be read")
+             return
+         response = self.client.document_text_detection(image, timeout=10)
+         return response
+
+     def get_response(self, content):
+         resp_js = None
+         try:
+             resp_js = self.send_request(content)
+         except Exception as e:
+             print("OCR request failed. Reason : {}".format(e))
+
+         return resp_js
+
+     def post_process(self, resp_js):
+         boxObjects = []
+         for i in range(1, len(resp_js.text_annotations)):
+             # Vision sometimes reverses the left and right coordinates, which would give a
+             # negative width, so take the smaller x as the left edge and the larger as the right.
+             obj = resp_js
+             if obj.text_annotations[i].bounding_poly.vertices[1].x > obj.text_annotations[i].bounding_poly.vertices[3].x:
+                 leftX = obj.text_annotations[i].bounding_poly.vertices[3].x
+             else:
+                 leftX = obj.text_annotations[i].bounding_poly.vertices[1].x
+
+             if obj.text_annotations[i].bounding_poly.vertices[1].x > obj.text_annotations[i].bounding_poly.vertices[3].x:
+                 rightX = obj.text_annotations[i].bounding_poly.vertices[1].x
+             else:
+                 rightX = obj.text_annotations[i].bounding_poly.vertices[3].x
+
+             boxObjects.append({
+                 "id": i-1,
+                 "text": obj.text_annotations[i].description,
+                 "left": leftX,
+                 "width": rightX - leftX,
+                 "top": obj.text_annotations[i].bounding_poly.vertices[1].y,
+                 "height": obj.text_annotations[i].bounding_poly.vertices[3].y - obj.text_annotations[i].bounding_poly.vertices[1].y
+             })
+
+         return boxObjects
+
+     def convert_to_df(self, boxObjects, image):
+         ocr_df = pd.DataFrame(boxObjects)
+
+         # scale pixel coordinates to the 0-1000 range expected downstream
+         width, height = image.size
+         w_scale = 1000/width
+         h_scale = 1000/height
+
+         ocr_df = ocr_df.dropna() \
+             .assign(left_scaled=ocr_df.left*w_scale,
+                     width_scaled=ocr_df.width*w_scale,
+                     top_scaled=ocr_df.top*h_scale,
+                     height_scaled=ocr_df.height*h_scale,
+                     right_scaled=lambda x: x.left_scaled + x.width_scaled,
+                     bottom_scaled=lambda x: x.top_scaled + x.height_scaled)
+
+         float_cols = ocr_df.select_dtypes('float').columns
+         ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
+         ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
+         ocr_df = ocr_df.dropna().reset_index(drop=True)
+         return ocr_df
+
+     def ocr(self, content, image):
+         resp_js = self.get_response(content)
+         boxObjects = self.post_process(resp_js)
+         ocr_df = self.convert_to_df(boxObjects, image)
+         return ocr_df
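A minimal sketch of how `VisionClient` is driven on its own, outside the FastAPI app — it assumes the service-account JSON is available in a `GCV_AUTH` environment variable (as in `config.Settings`) and uses a hypothetical local test image:

```python
import json
import os
from PIL import Image
from ocr import VisionClient

gcv_auth = json.loads(os.environ["GCV_AUTH"])  # service-account info dict
client = VisionClient(gcv_auth)

with open("sample_doc.png", "rb") as f:  # sample_doc.png is a placeholder file name
    content = f.read()
image = Image.open("sample_doc.png")

ocr_df = client.ocr(content, image)
print(ocr_df.head())  # one row per word: text, left, top, width, height plus *_scaled columns
```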
preprocess.py ADDED
@@ -0,0 +1,111 @@
+ import torch
+ from transformers import AutoTokenizer
+
+ # class to turn the keys of a dict into attributes (thanks Stackoverflow)
+ class AttrDict(dict):
+     def __init__(self, *args, **kwargs):
+         super(AttrDict, self).__init__(*args, **kwargs)
+         self.__dict__ = self
+
+ class Preprocessor():
+     def __init__(self, tokenizer):
+         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+         self.argsdict = {'max_seq_length': 512}
+         self.args = AttrDict(self.argsdict)
+
+     def get_boxes(self, ocr_df, image):
+         words = list(ocr_df.text)
+         coordinates = ocr_df[['left', 'top', 'width', 'height']]
+         actual_boxes = []
+         width, height = image.size
+         for idx, row in coordinates.iterrows():
+             x, y, w, h = tuple(row)  # the row comes in (left, top, width, height) format
+             actual_box = [x, y, x+w, y+h]  # we turn it into (left, top, left+width, top+height) to get the actual box
+             actual_boxes.append(actual_box)
+
+         def normalize_box(box, width, height):
+             return [
+                 int(1000 * (box[0] / width)),
+                 int(1000 * (box[1] / height)),
+                 int(1000 * (box[2] / width)),
+                 int(1000 * (box[3] / height)),
+             ]
+
+         boxes = []
+         for box in actual_boxes:
+             boxes.append(normalize_box(box, width, height))
+
+         return words, boxes, actual_boxes
+
+     def convert_example_to_features(self, image, words, boxes, actual_boxes, cls_token_box=[0, 0, 0, 0],
+                                     sep_token_box=[1000, 1000, 1000, 1000],
+                                     pad_token_box=[0, 0, 0, 0]):
+         width, height = image.size
+
+         tokens = []
+         token_boxes = []
+         actual_bboxes = []  # we use an extra b because actual_boxes is already used
+         token_actual_boxes = []
+         offset_mapping = []
+         for word, box, actual_bbox in zip(words, boxes, actual_boxes):
+             word_tokens = self.tokenizer.tokenize(word)
+             mapping = self.tokenizer(word, return_offsets_mapping=True).offset_mapping
+             offset_mapping.extend(mapping)
+             tokens.extend(word_tokens)
+             token_boxes.extend([box] * len(word_tokens))
+             actual_bboxes.extend([actual_bbox] * len(word_tokens))
+             token_actual_boxes.extend([actual_bbox] * len(word_tokens))
+
+         # Truncation: account for [CLS] and [SEP] with "- 2".
+         special_tokens_count = 2
+         if len(tokens) > self.args.max_seq_length - special_tokens_count:
+             tokens = tokens[: (self.args.max_seq_length - special_tokens_count)]
+             token_boxes = token_boxes[: (self.args.max_seq_length - special_tokens_count)]
+             actual_bboxes = actual_bboxes[: (self.args.max_seq_length - special_tokens_count)]
+             token_actual_boxes = token_actual_boxes[: (self.args.max_seq_length - special_tokens_count)]
+
+         # add [SEP] token, with corresponding token boxes and actual boxes
+         tokens += [self.tokenizer.sep_token]
+         token_boxes += [sep_token_box]
+         actual_bboxes += [[0, 0, width, height]]
+         token_actual_boxes += [[0, 0, width, height]]
+
+         segment_ids = [0] * len(tokens)
+
+         # next: [CLS] token
+         tokens = [self.tokenizer.cls_token] + tokens
+         token_boxes = [cls_token_box] + token_boxes
+         actual_bboxes = [[0, 0, width, height]] + actual_bboxes
+         token_actual_boxes = [[0, 0, width, height]] + token_actual_boxes
+         segment_ids = [1] + segment_ids
+
+         input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
+
+         # The mask has 1 for real tokens and 0 for padding tokens. Only real
+         # tokens are attended to.
+         input_mask = [1] * len(input_ids)
+
+         # Zero-pad up to the sequence length.
+         padding_length = self.args.max_seq_length - len(input_ids)
+         input_ids += [self.tokenizer.pad_token_id] * padding_length
+         input_mask += [0] * padding_length
+         segment_ids += [self.tokenizer.pad_token_id] * padding_length
+         token_boxes += [pad_token_box] * padding_length
+         token_actual_boxes += [pad_token_box] * padding_length
+
+         assert len(input_ids) == self.args.max_seq_length
+         assert len(input_mask) == self.args.max_seq_length
+         assert len(segment_ids) == self.args.max_seq_length
+         assert len(token_boxes) == self.args.max_seq_length
+         assert len(token_actual_boxes) == self.args.max_seq_length
+
+         return input_ids, input_mask, segment_ids, token_boxes, token_actual_boxes, offset_mapping
+
+     def process(self, ocr_df, image):
+         words, boxes, actual_boxes = self.get_boxes(ocr_df, image)
+         input_ids, input_mask, segment_ids, token_boxes, token_actual_boxes, offset_mapping = self.convert_example_to_features(image=image, words=words, boxes=boxes, actual_boxes=actual_boxes)
+         input_ids = torch.tensor(input_ids).unsqueeze(0)
+         attention_mask = torch.tensor(input_mask).unsqueeze(0)
+         token_type_ids = torch.tensor(segment_ids).unsqueeze(0)
+         bbox = torch.tensor(token_boxes).unsqueeze(0)
+         return input_ids, attention_mask, token_type_ids, bbox, token_actual_boxes, offset_mapping
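The `normalize_box` helper rescales absolute pixel boxes to the 0-1000 coordinate space that LiLT expects. A quick worked example on a hypothetical 800×600 page:

```python
# box is (left, top, right, bottom) in pixels on an 800x600 page
box, width, height = [80, 150, 240, 180], 800, 600
normalized = [
    int(1000 * (box[0] / width)),   # 80 / 800 * 1000 = 100
    int(1000 * (box[1] / height)),  # 150 / 600 * 1000 = 250
    int(1000 * (box[2] / width)),   # 240 / 800 * 1000 = 300
    int(1000 * (box[3] / height)),  # 180 / 600 * 1000 = 300
]
print(normalized)  # [100, 250, 300, 300]
```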
requirements.txt ADDED
Binary file (3.27 kB)
 
token_classification.py ADDED
@@ -0,0 +1,36 @@
+ import numpy as np
+
+ def classifyTokens(model, input_ids, attention_mask, bbox, offset_mapping):
+     outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
+     # take argmax on the last dimension to get the predicted class ID per token
+     predictions = outputs.logits.argmax(-1).squeeze().tolist()
+     return predictions
+
+ def createEntities(model, predictions, input_ids, offset_mapping):
+     # we're only interested in tokens which aren't subwords
+     # we'll use the offset mapping for that
+     offset_mapping = np.array(offset_mapping)
+     is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0
+
+     label2id = {"HEADER": 0, "QUESTION": 1, "ANSWER": 2}
+
+     # finally, store recognized "question" and "answer" entities in a list
+     entities = []
+     current_entity = None
+     start = None
+     end = None
+
+     for idx, (id, pred) in enumerate(zip(input_ids[0].tolist(), predictions)):
+         if not is_subword[idx]:
+             predicted_label = model.config.id2label[pred]
+             if predicted_label.startswith("B") and current_entity is None:
+                 # means we're at the start of a new entity
+                 current_entity = predicted_label.replace("B-", "")
+                 start = idx
+             if current_entity is not None and current_entity not in predicted_label:
+                 # means we're at the end of the current entity
+                 end = idx
+                 entities.append((start, end, current_entity, label2id[current_entity]))
+                 current_entity = None
+
+     return entities
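`createEntities` returns one tuple per detected span in the form (start_token_index, end_token_index, label_name, label_id), which is exactly what main.py unpacks into the `entity_dict` passed to the RE model. A hypothetical output for a document with one question/answer pair might look like:

```python
entities = [
    (12, 15, "QUESTION", 1),  # tokens 12-15 form a question span
    (16, 20, "ANSWER", 2),    # tokens 16-20 form the answer span
]
```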