pierreguillou committed on
Commit
d2924de
·
1 Parent(s): 2618251

Update files/functions.py

Browse files
Files changed (1) hide show
  1. files/functions.py +21 -12
files/functions.py CHANGED
@@ -68,27 +68,36 @@ label2color = {
68
 
69
  # bounding boxes start and end of a sequence
70
  cls_box = [0, 0, 0, 0]
71
- sep_box = [1000, 1000, 1000, 1000]
 
72
 
73
- # model
74
- model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
 
75
 
76
- # tokenizer
77
- tokenizer_id = "xlm-roberta-base"
78
 
79
  # (tokenization) The maximum length of a feature (sequence)
80
- if str(384) in model_id:
81
- max_length = 384
82
- elif str(512) in model_id:
83
- max_length = 512
 
 
 
 
 
 
 
84
  else:
85
- print("Error with max_length of chunks!")
86
 
87
  # (tokenization) overlap
88
  doc_stride = 128 # The allowed overlap between two parts of the context when it needs to be split.
89
 
90
  # max PDF page images that will be displayed
91
- max_imgboxes = 2
92
 
93
  # get files
94
  examples_dir = 'files/'
@@ -97,7 +106,7 @@ from huggingface_hub import hf_hub_download
97
  files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
98
  for file_name in files:
99
  path_to_file = hf_hub_download(
100
- repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
101
  filename = "files/" + file_name,
102
  repo_type = "space"
103
  )
 
68
 
69
  # bounding boxes start and end of a sequence
70
  cls_box = [0, 0, 0, 0]
71
+ sep_box_lilt = cls_box
72
+ sep_box_layoutxlm = [1000, 1000, 1000, 1000]
73
 
74
+ # models
75
+ model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
76
+ model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
77
 
78
+ # tokenizer for LayoutXLM
79
+ tokenizer_id_layoutxlm = "xlm-roberta-base"
80
 
81
  # (tokenization) The maximum length of a feature (sequence)
82
+ if str(384) in model_id_lilt:
83
+ max_length_lilt = 384
84
+ elif str(512) in model_id_lilt:
85
+ max_length_lilt = 512
86
+ else:
87
+ print("Error with max_length_lilt of chunks!")
88
+
89
+ if str(384) in model_id_layoutxlm:
90
+ max_length_layoutxlm = 384
91
+ elif str(512) in model_id_layoutxlm:
92
+ max_length_layoutxlm = 512
93
  else:
94
+ print("Error with max_length_layoutxlm of chunks!")
95
 
96
  # (tokenization) overlap
97
  doc_stride = 128 # The allowed overlap between two parts of the context when it needs to be split.
98
 
99
  # max PDF page images that will be displayed
100
+ max_imgboxes = 1
101
 
102
  # get files
103
  examples_dir = 'files/'
 
106
  files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
107
  for file_name in files:
108
  path_to_file = hf_hub_download(
109
+ repo_id = "pierreguillou/Inference-comparison-APP-Document-Understanding-at-paragraphlevel-v1",
110
  filename = "files/" + file_name,
111
  repo_type = "space"
112
  )