Commit fe811a3
Parent: 62697e4

Update files/functions.py

files/functions.py CHANGED (+74 -57)
@@ -44,39 +44,14 @@ import pathlib
 from pathlib import Path
 import shutil
 
+from functools import partial
+
 # Tesseract
 print(os.popen(f'cat /etc/debian_version').read())
 print(os.popen(f'cat /etc/issue').read())
 print(os.popen(f'apt search tesseract').read())
 import pytesseract
 
-## model / feature extractor / tokenizer
-
-import torch
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# model 1
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
-tokenizer1 = AutoTokenizer.from_pretrained(model_id)
-model1 = AutoModelForTokenClassification.from_pretrained(model_id);
-model1.to(device);
-
-from transformers import LayoutLMv2ForTokenClassification
-# model 2
-model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
-model2 = LayoutLMv2ForTokenClassification.from_pretrained(model_id);
-model2.to(device);
-
-# feature extractor
-from transformers import LayoutLMv2FeatureExtractor
-feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
-
-# tokenizer
-from transformers import AutoTokenizer
-tokenizer_id = "xlm-roberta-base"
-tokenizer2 = AutoTokenizer.from_pretrained(tokenizer_id)
-
 ## Key parameters
 
 # categories colors
@@ -96,27 +71,36 @@ label2color = {
 
 # bounding boxes start and end of a sequence
 cls_box = [0, 0, 0, 0]
-sep_box = cls_box
+sep_box_lilt = cls_box
+sep_box_layoutxlm = [1000, 1000, 1000, 1000]
 
-# model
-model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+# models
+model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
+model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-linelevel-ml384"
 
-# tokenizer
-tokenizer_id = "xlm-roberta-base"
+# tokenizer for LayoutXLM
+tokenizer_id_layoutxlm = "xlm-roberta-base"
 
 # (tokenization) The maximum length of a feature (sequence)
-if str(384) in model_id:
-    max_length = 384
-elif str(512) in model_id:
-    max_length = 512
+if str(384) in model_id_lilt:
+    max_length_lilt = 384
+elif str(512) in model_id_lilt:
+    max_length_lilt = 512
 else:
-    print("Error with max_length of chunks!")
+    print("Error with max_length_lilt of chunks!")
+
+if str(384) in model_id_layoutxlm:
+    max_length_layoutxlm = 384
+elif str(512) in model_id_layoutxlm:
+    max_length_layoutxlm = 512
+else:
+    print("Error with max_length_layoutxlm of chunks!")
 
 # (tokenization) overlap
 doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
 
 # max PDF page images that will be displayed
-max_imgboxes =
+max_imgboxes = 1
 
 # get files
 examples_dir = 'files/'
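The two sep boxes encode the LayoutLM-family convention that token bounding boxes live in a 0-1000 normalized space: LiLT reuses the CLS box [0, 0, 0, 0], while LayoutXLM marks SEP with [1000, 1000, 1000, 1000], the bottom-right corner of that space. A sketch of the convention (denormalize_box is used later in this file; the bodies below are the standard recipe and are assumed, not taken from this commit):

def normalize_box(box, width, height):
    # pixel coordinates -> the 0-1000 space expected by the models
    return [int(1000 * box[0] / width),
            int(1000 * box[1] / height),
            int(1000 * box[2] / width),
            int(1000 * box[3] / height)]

def denormalize_box(box, width, height):
    # 0-1000 space -> pixel coordinates
    return [int(width * box[0] / 1000),
            int(height * box[1] / 1000),
            int(width * box[2] / 1000),
            int(height * box[3] / 1000)]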
@@ -125,7 +109,7 @@ from huggingface_hub import hf_hub_download
 files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
 for file_name in files:
     path_to_file = hf_hub_download(
-        repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-linelevel-
+        repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v3",
         filename = "files/" + file_name,
         repo_type = "space"
     )
@@ -162,6 +146,32 @@ for lang_t, langcode_t in zip(langs_t,langscode_t):
 
 langdetect2Tesseract = {v:k for k,v in Tesseract2langdetect.items()}
 
+## model / feature extractor / tokenizer
+
+# get device
+import torch
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+## model LiLT
+import transformers
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
+model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt);
+model_lilt.to(device);
+
+## model LayoutXLM
+from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast,
+model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm);
+model_layoutxlm.to(device);
+
+# feature extractor
+from transformers import LayoutLMv2FeatureExtractor
+feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+
+# tokenizer
+from transformers import AutoTokenizer
+tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
+
 ## General
 
 # get text and bounding boxes from an image
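Both checkpoints are fine-tuned on DocLayNet at line level, so the id2label argument that the refactored functions below now expect can come from either model's config. A short sketch of that wiring (assumed usage, not part of the diff):

# id2label comes from the checkpoint config (standard transformers attribute)
id2label = model_lilt.config.id2label
assert id2label == model_layoutxlm.config.id2label, "label maps should match"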
@@ -477,7 +487,7 @@ def extraction_data_from_image(images):
 
 ## Inference
 
-def prepare_inference_features(example):
+def prepare_inference_features(example, tokenizer, max_length, cls_box, sep_box):
 
     images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, images_pixels_list = list(), list(), list(), list(), list(), list()
 
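This parameterization is presumably why from functools import partial is added at the top of the file: the per-model tokenizer, max_length, cls_box and sep_box defined in the key parameters can be bound once per model. A minimal sketch of that usage (assumed, not shown in this commit):

from functools import partial

prepare_features_lilt = partial(prepare_inference_features,
                                tokenizer=tokenizer_lilt,
                                max_length=max_length_lilt,
                                cls_box=cls_box,
                                sep_box=sep_box_lilt)

prepare_features_layoutxlm = partial(prepare_inference_features,
                                     tokenizer=tokenizer_layoutxlm,
                                     max_length=max_length_layoutxlm,
                                     cls_box=cls_box,
                                     sep_box=sep_box_layoutxlm)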
@@ -600,7 +610,7 @@ class CustomDataset(Dataset):
 import torch.nn.functional as F
 
 # get predictions at token level
-def predictions_token_level(images, custom_encoded_dataset):
+def predictions_token_level(images, custom_encoded_dataset, model_id, model):
 
     num_imgs = len(images)
     if num_imgs > 0:
@@ -635,12 +645,20 @@ def predictions_token_level(images, custom_encoded_dataset):
 
         # get prediction with forward pass
         with torch.no_grad():
-            output = model(
-                input_ids=input_id.to(device),
-                attention_mask=attention_mask.to(device),
-                bbox=bbox.to(device),
-                image=pixel_values.to(device)
-            )
+
+            if model_id == model_id_lilt:
+                output = model(
+                    input_ids=input_id.to(device),
+                    attention_mask=attention_mask.to(device),
+                    bbox=bbox.to(device),
+                )
+            elif model_id == model_id_layoutxlm:
+                output = model(
+                    input_ids=input_id.to(device),
+                    attention_mask=attention_mask.to(device),
+                    bbox=bbox.to(device),
+                    image=pixel_values.to(device)
+                )
 
         # save probabilities of predictions in dictionnary
         if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1))
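The new branch reflects what each architecture consumes: LiLT is a text-plus-layout model and takes only input_ids, attention_mask and bbox, while LayoutXLM builds on LayoutLMv2 and also takes the page image produced by the feature extractor, hence the extra image=pixel_values argument. Hypothetical call sites for the refactored function (the encoded_dataset_* names are assumed):

outputs_lilt = predictions_token_level(images, encoded_dataset_lilt,
                                       model_id_lilt, model_lilt)
outputs_layoutxlm = predictions_token_level(images, encoded_dataset_layoutxlm,
                                            model_id_layoutxlm, model_layoutxlm)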
@@ -654,7 +672,7 @@ def predictions_token_level(images, custom_encoded_dataset):
 from functools import reduce
 
 # Get predictions (line level)
-def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+def predictions_line_level(max_length, tokenizer, id2label, dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes, cls_box, sep_box):
 
     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
@@ -711,14 +729,13 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_i
             bbox_prev = [-100, -100, -100, -100]
             for probs, input_id, bbox in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list):
                 bbox = denormalize_box(bbox, width, height)
-                if bbox != bbox_prev and bbox != cls_box:
+                if bbox != bbox_prev and bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
                     bboxes_list.append(bbox)
                     input_ids_dict[str(bbox)] = [input_id]
                     probs_dict[str(bbox)] = [probs]
-                else:
-                    if bbox != cls_box:
-                        input_ids_dict[str(bbox)].append(input_id)
-                        probs_dict[str(bbox)].append(probs)
+                elif bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
+                    input_ids_dict[str(bbox)].append(input_id)
+                    probs_dict[str(bbox)].append(probs)
                 bbox_prev = bbox
 
             probs_bbox = dict()
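The strengthened condition now also skips the model-specific sep box and any degenerate box whose width (bbox[0] == bbox[2]) or height (bbox[1] == bbox[3]) is zero, so tokens are only aggregated into real text lines. The same test written as a standalone helper for readability (a sketch, not part of the commit):

def is_line_box(bbox, cls_box, sep_box):
    # keep only boxes that are neither special-token boxes nor degenerate
    return (bbox != cls_box and bbox != sep_box
            and bbox[0] != bbox[2]   # non-zero width
            and bbox[1] != bbox[3])  # non-zero height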
@@ -749,7 +766,7 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_i
         print("An error occurred while getting predictions!")
 
 # Get labeled images with lines bounding boxes
-def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
+def get_labeled_images(id2label, dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
 
     labeled_images = list()
 
@@ -781,7 +798,7 @@ def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_di
     return labeled_images
 
 # get data of encoded chunk
-def get_encoded_chunk_inference(index_chunk=None):
+def get_encoded_chunk_inference(tokenizer, dataset, encoded_dataset, index_chunk=None):
 
     # get datasets
     example = dataset
@@ -833,10 +850,10 @@ def get_encoded_chunk_inference(index_chunk=None):
     return image, df, num_tokens, page_no, num_pages
 
 # display chunk of PDF image and its data
-def display_chunk_lines_inference(index_chunk=None):
+def display_chunk_lines_inference(dataset, encoded_dataset, index_chunk=None):
 
     # get image and image data
-    image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
+    image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(dataset, encoded_dataset, index_chunk=index_chunk)
 
     # get data from dataframe
     input_ids = df["input_ids"]
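One detail worth noting: get_encoded_chunk_inference now takes a tokenizer as its first parameter, but the call inside display_chunk_lines_inference passes only dataset and encoded_dataset, which leaves the tokenizer argument unbound; direct callers of get_encoded_chunk_inference would supply it explicitly. A sketch of that calling convention (all argument values assumed):

image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(
    tokenizer_lilt, dataset, encoded_dataset_lilt, index_chunk=0
)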