Commit fe811a3
Parent: 62697e4

Update files/functions.py

files/functions.py CHANGED (+74 -57)
@@ -44,39 +44,14 @@ import pathlib
 from pathlib import Path
 import shutil
 
+from functools import partial
+
 # Tesseract
 print(os.popen(f'cat /etc/debian_version').read())
 print(os.popen(f'cat /etc/issue').read())
 print(os.popen(f'apt search tesseract').read())
 import pytesseract
 
-## model / feature extractor / tokenizer
-
-import torch
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# model 1
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
-tokenizer1 = AutoTokenizer.from_pretrained(model_id)
-model1 = AutoModelForTokenClassification.from_pretrained(model_id);
-model1.to(device);
-
-from transformers import LayoutLMv2ForTokenClassification
-# model 2
-model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
-model2 = LayoutLMv2ForTokenClassification.from_pretrained(model_id);
-model2.to(device);
-
-# feature extractor
-from transformers import LayoutLMv2FeatureExtractor
-feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
-
-# tokenizer
-from transformers import AutoTokenizer
-tokenizer_id = "xlm-roberta-base"
-tokenizer2 = AutoTokenizer.from_pretrained(tokenizer_id)
-
 ## Key parameters
 
 # categories colors
@@ -96,27 +71,36 @@ label2color = {
 
 # bounding boxes start and end of a sequence
 cls_box = [0, 0, 0, 0]
-sep_box = cls_box
+sep_box_lilt = cls_box
+sep_box_layoutxlm = [1000, 1000, 1000, 1000]
 
-# model
-model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+# models
+model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
 
-# tokenizer
-tokenizer_id = "xlm-roberta-base"
+# tokenizer for LayoutXLM
+tokenizer_id_layoutxlm = "xlm-roberta-base"
 
 # (tokenization) The maximum length of a feature (sequence)
-if str(384) in model_id:
-    max_length = 384
-elif str(512) in model_id:
-    max_length = 512
+if str(384) in model_id_lilt:
+    max_length_lilt = 384
+elif str(512) in model_id_lilt:
+    max_length_lilt = 512
 else:
-    print("Error with max_length of chunks!")
+    print("Error with max_length_lilt of chunks!")
+
+if str(384) in model_id_layoutxlm:
+    max_length_layoutxlm = 384
+elif str(512) in model_id_layoutxlm:
+    max_length_layoutxlm = 512
+else:
+    print("Error with max_length_layoutxlm of chunks!")
 
 # (tokenization) overlap
 doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
 
 # max PDF page images that will be displayed
-max_imgboxes =
+max_imgboxes = 1
 
 # get files
 examples_dir = 'files/'
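The two sep boxes encode the LayoutLM-family convention that token bounding boxes live in a 0-1000 normalized space: LiLT reuses the CLS box [0, 0, 0, 0], while LayoutXLM marks SEP with [1000, 1000, 1000, 1000], the bottom-right corner of that space. A sketch of the convention (denormalize_box is used later in this file; the bodies below are the standard recipe and are assumed, not taken from this commit):

def normalize_box(box, width, height):
    # pixel coordinates -> the 0-1000 space expected by the models
    return [int(1000 * box[0] / width),
            int(1000 * box[1] / height),
            int(1000 * box[2] / width),
            int(1000 * box[3] / height)]

def denormalize_box(box, width, height):
    # 0-1000 space -> pixel coordinates
    return [int(width * box[0] / 1000),
            int(height * box[1] / 1000),
            int(width * box[2] / 1000),
            int(height * box[3] / 1000)]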
@@ -125,7 +109,7 @@ from huggingface_hub import hf_hub_download
 files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
 for file_name in files:
     path_to_file = hf_hub_download(
-        repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-linelevel-
+        repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v3",
         filename = "files/" + file_name,
         repo_type = "space"
     )
@@ -162,6 +146,32 @@ for lang_t, langcode_t in zip(langs_t,langscode_t):
 
 langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
 
+## model / feature extractor / tokenizer
+
+# get device
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+## model LiLT
+import transformers
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
+model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt);
+model_lilt.to(device);
+
+## model LayoutXLM
+from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
+model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm);
+model_layoutxlm.to(device);
+
+# feature extractor
+from transformers import LayoutLMv2FeatureExtractor
+feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+
+# tokenizer
+from transformers import AutoTokenizer
+tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
+
 ## General
 
 # get text and bounding boxes from an image
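Both checkpoints are fine-tuned on DocLayNet at line level, so the id2label argument that the refactored functions below now expect can come from either model's config. A short sketch of that wiring (assumed usage, not part of the diff):

# id2label comes from the checkpoint config (standard transformers attribute)
id2label = model_lilt.config.id2label
assert id2label == model_layoutxlm.config.id2label, "label maps should match"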
@@ -477,7 +487,7 @@ def extraction_data_from_image(images):
 
 ## Inference
 
-def prepare_inference_features(example):
+def prepare_inference_features(example, tokenizer, max_length, cls_box, sep_box):
 
     images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, images_pixels_list = list(), list(), list(), list(), list(), list()
 
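This parameterization is presumably why from functools import partial is added at the top of the file: the per-model tokenizer, max_length, cls_box and sep_box defined in the key parameters can be bound once per model. A minimal sketch of that usage (assumed, not shown in this commit):

from functools import partial

prepare_features_lilt = partial(prepare_inference_features,
                                tokenizer=tokenizer_lilt,
                                max_length=max_length_lilt,
                                cls_box=cls_box,
                                sep_box=sep_box_lilt)

prepare_features_layoutxlm = partial(prepare_inference_features,
                                     tokenizer=tokenizer_layoutxlm,
                                     max_length=max_length_layoutxlm,
                                     cls_box=cls_box,
                                     sep_box=sep_box_layoutxlm)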
@@ -600,7 +610,7 @@ class CustomDataset(Dataset):
 import torch.nn.functional as F
 
 # get predictions at token level
-def predictions_token_level(images, custom_encoded_dataset):
+def predictions_token_level(images, custom_encoded_dataset, model_id, model):
 
     num_imgs = len(images)
     if num_imgs > 0:
@@ -635,12 +645,20 @@ def predictions_token_level(images, custom_encoded_dataset):
 
         # get prediction with forward pass
         with torch.no_grad():
-            output = model(
-                input_ids=input_id.to(device),
-                attention_mask=attention_mask.to(device),
-                bbox=bbox.to(device),
-                image=pixel_values.to(device)
-            )
+
+            if model_id == model_id_lilt:
+                output = model(
+                    input_ids=input_id.to(device),
+                    attention_mask=attention_mask.to(device),
+                    bbox=bbox.to(device),
+                )
+            elif model_id == model_id_layoutxlm:
+                output = model(
+                    input_ids=input_id.to(device),
+                    attention_mask=attention_mask.to(device),
+                    bbox=bbox.to(device),
+                    image=pixel_values.to(device)
+                )
 
         # save probabilities of predictions in dictionnary
         if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1))
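The new branch reflects what each architecture consumes: LiLT is a text-plus-layout model and takes only input_ids, attention_mask and bbox, while LayoutXLM builds on LayoutLMv2 and also takes the page image produced by the feature extractor, hence the extra image=pixel_values argument. Hypothetical call sites for the refactored function (the encoded_dataset_* names are assumed):

outputs_lilt = predictions_token_level(images, encoded_dataset_lilt,
                                       model_id_lilt, model_lilt)
outputs_layoutxlm = predictions_token_level(images, encoded_dataset_layoutxlm,
                                            model_id_layoutxlm, model_layoutxlm)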
@@ -654,7 +672,7 @@ def predictions_token_level(images, custom_encoded_dataset):
 from functools import reduce
 
 # Get predictions (line level)
-def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+def predictions_line_level(max_length, tokenizer, id2label, dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes, cls_box, sep_box):
 
     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
@@ -711,14 +729,13 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_i
             bbox_prev = [-100, -100, -100, -100]
             for probs, input_id, bbox in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list):
                 bbox = denormalize_box(bbox, width, height)
-                if bbox != bbox_prev and bbox != cls_box:
+                if bbox != bbox_prev and bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
                     bboxes_list.append(bbox)
                     input_ids_dict[str(bbox)] = [input_id]
                     probs_dict[str(bbox)] = [probs]
-                else:
-                    if bbox != cls_box:
-                        input_ids_dict[str(bbox)].append(input_id)
-                        probs_dict[str(bbox)].append(probs)
+                elif bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
+                    input_ids_dict[str(bbox)].append(input_id)
+                    probs_dict[str(bbox)].append(probs)
                 bbox_prev = bbox
 
             probs_bbox = dict()
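The strengthened condition now also skips the model-specific sep box and any degenerate box whose width (bbox[0] == bbox[2]) or height (bbox[1] == bbox[3]) is zero, so tokens are only aggregated into real text lines. The same test written as a standalone helper for readability (a sketch, not part of the commit):

def is_line_box(bbox, cls_box, sep_box):
    # keep only boxes that are neither special-token boxes nor degenerate
    return (bbox != cls_box and bbox != sep_box
            and bbox[0] != bbox[2]   # non-zero width
            and bbox[1] != bbox[3])  # non-zero height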
@@ -749,7 +766,7 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_i
         print("An error occurred while getting predictions!")
 
 # Get labeled images with lines bounding boxes
-def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
+def get_labeled_images(id2label, dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
 
     labeled_images = list()
 
@@ -781,7 +798,7 @@ def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_di
     return labeled_images
 
 # get data of encoded chunk
-def get_encoded_chunk_inference(index_chunk=None):
+def get_encoded_chunk_inference(tokenizer, dataset, encoded_dataset, index_chunk=None):
 
     # get datasets
     example = dataset
@@ -833,10 +850,10 @@ def get_encoded_chunk_inference(index_chunk=None):
     return image, df, num_tokens, page_no, num_pages
 
 # display chunk of PDF image and its data
-def display_chunk_lines_inference(index_chunk=None):
+def display_chunk_lines_inference(dataset, encoded_dataset, index_chunk=None):
 
     # get image and image data
-    image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
+    image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(dataset, encoded_dataset, index_chunk=index_chunk)
 
     # get data from dataframe
     input_ids = df["input_ids"]
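One detail worth noting: get_encoded_chunk_inference now takes a tokenizer as its first parameter, but the call inside display_chunk_lines_inference passes only dataset and encoded_dataset, which leaves the tokenizer argument unbound; direct callers of get_encoded_chunk_inference would supply it explicitly. A sketch of that calling convention (all argument values assumed):

image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(
    tokenizer_lilt, dataset, encoded_dataset_lilt, index_chunk=0
)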