Spaces:

DimaKoshman
/

MakingGraphsAccessible

Runtime error

App Files Files Community

DimaKoshman committed on Apr 27, 2023

Commit

028951c

1 Parent(s): 88f8b47

trained model for a bit

Browse files

Files changed (16) hide show

MakingGraphsAccessible.ipynb +0 -0
app.py +48 -36
checkpoint/added_tokens.json +11 -10
checkpoint/config.json +5 -5
checkpoint/generation_config.json +3 -4
checkpoint/pytorch_model.bin +2 -2
checkpoint/special_tokens_map.json +1 -1
checkpoint/tokenizer.json +0 -0
checkpoint/tokenizer_config.json +1 -1
config.py +21 -0
data.py +521 -0
metrics.py +54 -0
model.py +193 -0
requirements.txt +0 -3
train.py +114 -0
utils.py +23 -0

MakingGraphsAccessible.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

app.py CHANGED Viewed

@@ -1,44 +1,56 @@
 import gradio
-import transformers
-import types
-checkpoint_path = "checkpoint"
-examples_path = "examples"
-MODEL = types.SimpleNamespace()
-MODEL.donut_processor = transformers.DonutProcessor.from_pretrained(checkpoint_path)
-MODEL.encoder_decoder = transformers.VisionEncoderDecoderModel.from_pretrained(checkpoint_path)
-MODEL.tokenizer = MODEL.donut_processor.tokenizer
-def generate_token_strings(images, skip_special_tokens=True):
-    decoder_output = MODEL.encoder_decoder.generate(
-        images,
-        max_length=MODEL.encoder_decoder.config.decoder.max_length,
-        eos_token_id=MODEL.tokenizer.eos_token_id,
-        return_dict_in_generate=True,
-    )
-    return MODEL.tokenizer.batch_decode(
-        decoder_output.sequences, skip_special_tokens=skip_special_tokens
-    )
-def predict_string(image):
-    image = MODEL.donut_processor(
-        image, random_padding=False, return_tensors="pt"
-    ).pixel_values
-    string = generate_token_strings(image)[0]
-    return string
-interface = gradio.Interface(
-    title = "Making graphs accessible",
-    description = "Generate textual representation of a graph\n"
-    "https://www.kaggle.com/competitions/benetech-making-graphs-accessible",
-    fn=predict_string,
-    inputs="image",
-    outputs="text",
-    examples=examples_path,
-)
-interface.launch()

 import gradio
+import pandas as pd
+from matplotlib import pyplot as plt
+from config import CONFIG
+from machine_learning.transformers.MakingGraphsAccessible.data import (
+    get_extra_tokens,
+    BenetechOutput,
+    ChartType,
+)
+from model import predict_string, build_model
+def gradio_visualize_prediction(string):
+    string = string.removeprefix(get_extra_tokens().benetech_prompt)
+    if not BenetechOutput.does_string_match_expected_pattern(string):
+        return
+    benetech_output = BenetechOutput.from_string(string)
+    x = benetech_output.x_data[: len(benetech_output.y_data)]
+    y = benetech_output.y_data[: len(benetech_output.x_data)]
+    df = pd.DataFrame(dict(x=x, y=y))
+    plt_plot = {
+        ChartType.line: plt.plot,
+        ChartType.scatter: plt.scatter,
+        ChartType.horizontal_bar: plt.barh,
+        ChartType.vertical_bar: plt.bar,
+        ChartType.dot: plt.scatter,
+    }
+    plt_plot[benetech_output.chart_type](x, y)
+    plt.xticks(rotation=30)
+    plt.savefig("plot.png")
+    ...
+def main():
+    config = CONFIG
+    config.pretrained_model_name = "checkpoint"
+    model = build_model(config)
+    interface = gradio.Interface(
+        title="Making graphs accessible",
+        description="Generate textual representation of a graph\n"
+                    "https://www.kaggle.com/competitions/benetech-making-graphs-accessible",
+        fn=lambda image: predict_string(image, model),
+        inputs="image",
+        outputs="text",
+        examples="examples",
+    )
+    interface.launch()

checkpoint/added_tokens.json CHANGED Viewed

@@ -1,18 +1,19 @@
 {
   "1": 57537,
   "</benetech_prompt>": 57526,
-  "<;>": 57536,
   "<benetech_prompt>": 57525,
-  "<categorical>": 57532,
-  "<dot>": 57527,
-  "<horizontal_bar>": 57528,
-  "<line>": 57530,
-  "<numerical>": 57533,
   "<s_iitcdip>": 57523,
   "<s_synthdog>": 57524,
-  "<scatter>": 57531,
   "<sep/>": 57522,
-  "<vertical_bar>": 57529,
-  "<x_start>": 57534,
-  "<y_start>": 57535
 }

 {
   "1": 57537,
   "</benetech_prompt>": 57526,
+  "<;>": 57529,
   "<benetech_prompt>": 57525,
+  "<categorical>": 57535,
+  "<dot>": 57530,
+  "<horizontal_bar>": 57531,
+  "<line>": 57533,
+  "<numerical>": 57536,
   "<s_iitcdip>": 57523,
   "<s_synthdog>": 57524,
+  "<scatter>": 57534,
   "<sep/>": 57522,
+  "<vertical_bar>": 57532,
+  "<x_start>": 57527,
+  "<y_start>": 57528,
+  "ދ": 57538
 }

checkpoint/config.json CHANGED Viewed

@@ -4,7 +4,7 @@
   "architectures": [
     "VisionEncoderDecoderModel"
   ],
-  "bos_token_id": 57525,
   "decoder": {
     "_name_or_path": "",
     "activation_dropout": 0.0,
@@ -51,7 +51,7 @@
       "LABEL_1": 1
     },
     "length_penalty": 1.0,
-    "max_length": 512,
     "max_position_embeddings": 1536,
     "min_length": 0,
     "model_type": "mbart",
@@ -88,9 +88,9 @@
     "typical_p": 1.0,
     "use_bfloat16": false,
     "use_cache": true,
-    "vocab_size": 57538
   },
-  "decoder_start_token_id": 57525,
   "encoder": {
     "_name_or_path": "",
     "add_cross_attention": false,
@@ -187,7 +187,7 @@
     "use_bfloat16": false,
     "window_size": 10
   },
-  "eos_token_id": 57526,
   "is_encoder_decoder": true,
   "model_type": "vision-encoder-decoder",
   "pad_token_id": 1,

   "architectures": [
     "VisionEncoderDecoderModel"
   ],
+  "bos_token_id": 3,
   "decoder": {
     "_name_or_path": "",
     "activation_dropout": 0.0,
       "LABEL_1": 1
     },
     "length_penalty": 1.0,
+    "max_length": 20,
     "max_position_embeddings": 1536,
     "min_length": 0,
     "model_type": "mbart",
     "typical_p": 1.0,
     "use_bfloat16": false,
     "use_cache": true,
+    "vocab_size": 57539
   },
+  "decoder_start_token_id": 3,
   "encoder": {
     "_name_or_path": "",
     "add_cross_attention": false,
     "use_bfloat16": false,
     "window_size": 10
   },
+  "eos_token_id": 3,
   "is_encoder_decoder": true,
   "model_type": "vision-encoder-decoder",
   "pad_token_id": 1,

checkpoint/generation_config.json CHANGED Viewed

@@ -1,10 +1,9 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 57525,
-  "decoder_start_token_id": 57525,
-  "eos_token_id": 57526,
   "forced_eos_token_id": 2,
-  "max_length": 512,
   "pad_token_id": 1,
   "transformers_version": "4.26.1"
 }

 {
   "_from_model_config": true,
+  "bos_token_id": 3,
+  "decoder_start_token_id": 3,
+  "eos_token_id": 3,
   "forced_eos_token_id": 2,
   "pad_token_id": 1,
   "transformers_version": "4.26.1"
 }

checkpoint/pytorch_model.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:61dbb0fa6d53b3b8ee0bd4f168eacddc6001485b2e6ceb00781f05188cb57645
-size 809225433

 version https://git-lfs.github.com/spec/v1
+oid sha256:7c9a42d9810580ea7d19acdfe533e97b4be48693c18c82c2b5f337eb879921ff
+size 809236249

checkpoint/special_tokens_map.json CHANGED Viewed

@@ -5,7 +5,7 @@
   ],
   "bos_token": "<s>",
   "cls_token": "<s>",
-  "eos_token": "</benetech_prompt>",
   "mask_token": {
     "content": "<mask>",
     "lstrip": true,

   ],
   "bos_token": "<s>",
   "cls_token": "<s>",
+  "eos_token": "<unk>",
   "mask_token": {
     "content": "<mask>",
     "lstrip": true,

checkpoint/tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

checkpoint/tokenizer_config.json CHANGED Viewed

@@ -11,7 +11,7 @@
     "single_word": false
   },
   "model_max_length": 1000000000000000019884624838656,
-  "name_or_path": "tmp.cp",
   "pad_token": "<pad>",
   "processor_class": "DonutProcessor",
   "sep_token": "</s>",

     "single_word": false
   },
   "model_max_length": 1000000000000000019884624838656,
+  "name_or_path": "naver-clova-ix/donut-base",
   "pad_token": "<pad>",
   "processor_class": "DonutProcessor",
   "sep_token": "</s>",

config.py ADDED Viewed

	@@ -0,0 +1,21 @@

+class CONFIG:
+    """Project settings, read as class attributes (``CONFIG.<name>``)."""
+    # Debug mode selects the CPU accelerator and the small batch limits below.
+    debug = False
+    accelerator = "cpu" if debug else "gpu"
+    # "auto" on CPU, otherwise the explicit device list [1].
+    # NOTE(review): presumably forwarded to the trainer — the consumer is not visible here.
+    devices = "auto" if accelerator == "cpu" else [1]
+    batch_size = 2 if debug else 1
+    limit_train_batches = 2 if debug else None
+    limit_val_batches = 2 if debug else 100
+    learning_rate = 3e-5
+    # Fraction of the training images held out for validation, and the split seed.
+    val_fraction = 0.1
+    seed = 42
+    # Pickle file caching the train/validation indices.
+    train_val_indices_path = "data/train_val_indices.pickle"
+    # Digits after the decimal point when numbers are written in scientific notation.
+    float_scientific_notation_string_precision = 5
+    # Model name or local directory handed to ``from_pretrained``.
+    pretrained_model_name = "naver-clova-ix/donut-base"
+    image_width = 720
+    image_height = 512
+    # Pickle file caching dataset tokens the tokenizer maps to its unknown token.
+    unknown_tokens_for_tokenizer_path = "data/unknown_tokens_for_tokenizer.pickle"
+    # Target sequences are padded/truncated to this length; also the generation limit.
+    decoder_sequence_max_length = 512
+    num_workers = 4
+    training_directory = "training"
+    save_top_k_checkpoints = 3
+    wandb_project_name = "MakingGraphsAccessible"

data.py ADDED Viewed

	@@ -0,0 +1,521 @@

+import dataclasses
+import enum
+import functools
+import json
+import os
+import re
+import types
+from typing import Callable
+import einops
+import imageio
+import numpy as np
+import torch.utils.data
+import torchvision
+import tqdm
+from config import CONFIG
+from utils import load_pickle_or_build_object_and_save
+class Source(enum.Enum):
+    """Value of an annotation's ``source`` field."""
+    generated = "generated"
+    extracted = "extracted"
+class ChartType(enum.Enum):
+    """Chart kinds; each member name also becomes a ``<name>`` tokenizer token."""
+    dot = "dot"
+    horizontal_bar = "horizontal_bar"
+    vertical_bar = "vertical_bar"
+    line = "line"
+    scatter = "scatter"
+@dataclasses.dataclass
+class PlotBoundingBox:
+    height: int
+    width: int
+    x0: int
+    y0: int
+    def get_bounds(self):
+        xs = [self.x0, self.x0 + self.width, self.x0 + self.width, self.x0, self.x0]
+        ys = [self.y0, self.y0, self.y0 + self.height, self.y0 + self.height, self.y0]
+        return xs, ys
+@dataclasses.dataclass
+class DataPoint:
+    x: float or str
+    y: float or str
+class TextRole(enum.Enum):
+    """Value of the ``role`` field of an annotated text element."""
+    axis_title = "axis_title"
+    chart_title = "chart_title"
+    legend_label = "legend_label"
+    tick_grouping = "tick_grouping"
+    tick_label = "tick_label"
+    other = "other"
+@dataclasses.dataclass
+class Polygon:
+    x0: int
+    x1: int
+    x2: int
+    x3: int
+    y0: int
+    y1: int
+    y2: int
+    y3: int
+    def get_bounds(self):
+        xs = [
+            self.x0,
+            self.x1,
+            self.x2,
+            self.x3,
+            self.x0,
+        ]
+        ys = [
+            self.y0,
+            self.y1,
+            self.y2,
+            self.y3,
+            self.y0,
+        ]
+        return xs, ys
+@dataclasses.dataclass
+class Text:
+    """Annotated text element: id, outline polygon, role and the text itself."""
+    id: int
+    polygon: Polygon
+    role: TextRole
+    text: str
+    def __post_init__(self):
+        # The constructor receives raw values: a keyword mapping for the
+        # polygon and the role's value; convert both to their typed forms.
+        self.polygon = Polygon(**self.polygon)
+        self.role = TextRole(self.role)
+class ValuesType(enum.Enum):
+    """Whether an axis holds categorical (string) or numerical (float) values."""
+    categorical = "categorical"
+    numerical = "numerical"
+@dataclasses.dataclass
+class Tick:
+    """Axis tick: its id and the (x, y) taken from the annotation's ``tick_pt``."""
+    id: int
+    x: int
+    y: int
+class TickType(enum.Enum):
+    """Value of an axis' ``tick_type`` field."""
+    markers = "markers"
+    separators = "separators"
+@dataclasses.dataclass
+class Axis:
+    values_type: ValuesType
+    tick_type: TickType
+    ticks: list[Tick]
+    def __post_init__(self):
+        self.values_type = ValuesType(self.values_type)
+        self.tick_type = TickType(self.tick_type)
+        self.ticks = [
+            Tick(id=kw["id"], x=kw["tick_pt"]["x"], y=kw["tick_pt"]["y"])
+            for kw in self.ticks
+        ]
+    def get_bounds(self):
+        min_x = min(tick.x for tick in self.ticks)
+        max_x = max(tick.x for tick in self.ticks)
+        min_y = min(tick.y for tick in self.ticks)
+        max_y = max(tick.y for tick in self.ticks)
+        xs = [min_x, max_x, max_x, min_x, min_x]
+        ys = [min_y, min_y, max_y, max_y, min_y]
+        return xs, ys
+def convert_dashes_to_underscores_in_key_names(dictionary):
+    return {k.replace("-", "_"): v for k, v in dictionary.items()}
+@dataclasses.dataclass
+class Axes:
+    x_axis: Axis
+    y_axis: Axis
+    def __post_init__(self):
+        self.x_axis = Axis(**convert_dashes_to_underscores_in_key_names(self.x_axis))
+        self.y_axis = Axis(**convert_dashes_to_underscores_in_key_names(self.y_axis))
+def preprocess_numerical_value(value):
+    value = float(value)
+    value = 0 if np.isnan(value) else value
+    return value
+def preprocess_value(value, value_type: ValuesType):
+    if value_type == ValuesType.numerical:
+        return preprocess_numerical_value(value)
+    else:
+        return str(value)
+@dataclasses.dataclass
+class Annotation:
+    source: Source
+    chart_type: ChartType
+    plot_bb: PlotBoundingBox
+    text: list[Text]
+    axes: Axes
+    data_series: list[DataPoint]
+    def __post_init__(self):
+        self.source = Source(self.source)
+        self.chart_type = ChartType(self.chart_type)
+        self.plot_bb = PlotBoundingBox(**self.plot_bb)
+        self.text = [Text(**kw) for kw in self.text]
+        self.axes = Axes(**convert_dashes_to_underscores_in_key_names(self.axes))
+        self.data_series = [DataPoint(**kw) for kw in self.data_series]
+        for i in range(len(self.data_series)):
+            self.data_series[i].x = preprocess_value(
+                self.data_series[i].x, self.axes.x_axis.values_type
+            )
+            self.data_series[i].y = preprocess_value(
+                self.data_series[i].y, self.axes.y_axis.values_type
+            )
+    @staticmethod
+    def from_dict_with_dashes(kwargs):
+        return Annotation(**convert_dashes_to_underscores_in_key_names(kwargs))
+    @staticmethod
+    def from_image_index(image_index: int):
+        image_id = load_train_image_ids()[image_index]
+        return Annotation.from_dict_with_dashes(load_image_annotation(image_id))
+    def get_text_by_role(self, text_role: TextRole) -> list[Text]:
+        return [t for t in self.text if t.role == text_role]
+@dataclasses.dataclass
+class AnnotatedImage:
+    id: str
+    image: np.ndarray
+    annotation: Annotation
+    @staticmethod
+    def from_image_id(image_id: str):
+        return AnnotatedImage(
+            id=image_id,
+            image=load_image(image_id),
+            annotation=Annotation.from_dict_with_dashes(
+                load_image_annotation(image_id)
+            ),
+        )
+    @staticmethod
+    def from_image_index(image_index: int):
+        return AnnotatedImage.from_image_id(load_train_image_ids()[image_index])
+def generate_annotated_images():
+    for image_id in tqdm.autonotebook.tqdm(
+            load_train_image_ids(), "Iterating over annotated images"
+    ):
+        yield AnnotatedImage.from_image_id(image_id)
+@functools.cache
+def load_train_image_ids() -> list[str]:
+    train_image_ids = [i.replace(".jpg", "") for i in os.listdir("data/train/images")]
+    return train_image_ids[: 1000 if CONFIG.debug else None]
+@functools.cache
+def load_test_image_ids() -> list[str]:
+    return [i.replace(".jpg", "") for i in os.listdir("data/test/images")]
+@functools.cache
+def load_image_annotation(image_id: str) -> dict:
+    return json.load(open(f"data/train/annotations/{image_id}.json"))
+def load_image(image_id: str) -> np.ndarray:
+    return imageio.v3.imread(open(f"data/train/images/{image_id}.jpg", "rb"))
+@dataclasses.dataclass
+class DataItem:
+    """One dataset sample: image tensor, target string and source image index."""
+    image: torch.FloatTensor
+    target_string: str
+    data_index: int
+    def __post_init__(self):
+        # parse_shape checks the tensor is 3-dimensional and names its axes
+        # "channel height width"; the first axis must have size 3.
+        shape = einops.parse_shape(self.image, "channel height width")
+        assert shape["channel"] == 3, "Image is expected to have 3 channels."
+def split_train_indices_by_source():
+    extracted_image_indices = []
+    generated_image_indices = []
+    for i, annotated_image in enumerate(generate_annotated_images()):
+        if annotated_image.annotation.source == Source.extracted:
+            extracted_image_indices.append(i)
+        else:
+            generated_image_indices.append(i)
+    return extracted_image_indices, generated_image_indices
+def get_train_val_split_indices(val_fraction=0.1, seed=42):
+    np.random.seed(seed)
+    val_size = int(len(load_train_image_ids()) * val_fraction)
+    extracted_image_indices, generated_image_indices = split_train_indices_by_source()
+    extracted_image_indices = np.random.permutation(extracted_image_indices)
+    generated_image_indices = np.random.permutation(generated_image_indices)
+    val_indices = extracted_image_indices[:val_size]
+    n_generated_images_in_val = val_size - len(val_indices)
+    val_indices = np.concatenate(
+        [val_indices, generated_image_indices[:n_generated_images_in_val]]
+    )
+    train_indices = generated_image_indices[n_generated_images_in_val:]
+    assert len(set(train_indices) | set(val_indices)) == len(load_train_image_ids())
+    assert len(val_indices) == val_size
+    assert len(set(train_indices) & set(val_indices)) == 0
+    return train_indices, val_indices
+def to_token_str(value: str or enum.Enum):
+    string = value.name if isinstance(value, enum.Enum) else value
+    if re.fullmatch("<.*>", string):
+        return string
+    else:
+        return f"<{string}>"
+@functools.cache
+def get_extra_tokens() -> types.SimpleNamespace:
+    token_ns = types.SimpleNamespace()
+    token_ns.benetech_prompt = to_token_str("benetech_prompt")
+    token_ns.benetech_prompt_end = to_token_str("/benetech_prompt")
+    token_ns.x_start = to_token_str("x_start")
+    token_ns.y_start = to_token_str("y_start")
+    token_ns.value_separator = to_token_str(";")
+    for chart_type in ChartType:
+        setattr(token_ns, chart_type.name, to_token_str(chart_type))
+    for values_type in ValuesType:
+        setattr(token_ns, values_type.name, to_token_str(values_type))
+    return token_ns
+def convert_number_to_scientific_string(value: int or float) -> str:
+    return f"{value:.{CONFIG.float_scientific_notation_string_precision}e}"
+def convert_axis_data_to_string(
+        axis_data: list[str or float], values_type: ValuesType
+) -> str:
+    formatted_axis_data = []
+    for value in axis_data:
+        if values_type == ValuesType.numerical:
+            value = convert_number_to_scientific_string(value)
+        formatted_axis_data.append(value)
+    return get_extra_tokens().value_separator.join(formatted_axis_data)
+def convert_string_to_axis_data(string, values_type: ValuesType):
+    data = string.split(get_extra_tokens().value_separator)
+    if values_type == ValuesType.numerical:
+        data = [float(i.replace(" ", "")) for i in data]
+    return data
+@dataclasses.dataclass
+class BenetechOutput:
+    chart_type: ChartType
+    x_values_type: ValuesType
+    y_values_type: ValuesType
+    x_data: list[str or float]
+    y_data: list[str or float]
+    def __post_init__(self):
+        self.chart_type = ChartType(self.chart_type)
+        self.x_values_type = ValuesType(self.x_values_type)
+        self.y_values_type = ValuesType(self.y_values_type)
+        assert isinstance(self.x_data, list)
+        assert isinstance(self.y_data, list)
+    def get_main_characteristics(self):
+        return (
+            self.chart_type,
+            self.x_values_type,
+            self.y_values_type,
+            len(self.x_data),
+            len(self.y_data),
+        )
+    @staticmethod
+    def from_annotation(annotation: Annotation):
+        return BenetechOutput(
+            chart_type=annotation.chart_type,
+            x_values_type=annotation.axes.x_axis.values_type,
+            y_values_type=annotation.axes.y_axis.values_type,
+            x_data=[dp.x for dp in annotation.data_series],
+            y_data=[dp.y for dp in annotation.data_series],
+        )
+    def to_string(self):
+        return self.format_strings(
+            chart_type=self.chart_type,
+            x_values_type=self.x_values_type,
+            y_values_type=self.y_values_type,
+            x_data=convert_axis_data_to_string(self.x_data, self.x_values_type),
+            y_data=convert_axis_data_to_string(self.y_data, self.y_values_type),
+        )
+    @staticmethod
+    def format_strings(*, chart_type, x_values_type, y_values_type, x_data, y_data):
+        chart_type = to_token_str(chart_type)
+        x_values_type = to_token_str(x_values_type)
+        y_values_type = to_token_str(y_values_type)
+        token_ns = get_extra_tokens()
+        return (
+            f"{token_ns.benetech_prompt}{chart_type}"
+            f"{token_ns.x_start}{x_values_type}{x_data}"
+            f"{token_ns.y_start}{y_values_type}{y_data}"
+            f"{token_ns.benetech_prompt_end}"
+        )
+    @staticmethod
+    def get_string_pattern():
+        field_names = [field.name for field in dataclasses.fields(BenetechOutput)]
+        pattern = BenetechOutput.format_strings(
+            **{field_name: f"(?P<{field_name}>.*?)" for field_name in field_names}
+        )
+        return pattern
+    @staticmethod
+    def does_string_match_expected_pattern(string):
+        try:
+            BenetechOutput.from_string(string)
+            return True
+        except:
+            return False
+    @staticmethod
+    def from_string(string):
+        fullmatch = re.fullmatch(BenetechOutput.get_string_pattern(), string)
+        benetech_kwargs = fullmatch.groupdict()
+        benetech_kwargs["chart_type"] = ChartType(benetech_kwargs["chart_type"])
+        benetech_kwargs["x_values_type"] = ValuesType(benetech_kwargs["x_values_type"])
+        benetech_kwargs["y_values_type"] = ValuesType(benetech_kwargs["y_values_type"])
+        benetech_kwargs["x_data"] = convert_string_to_axis_data(
+            benetech_kwargs["x_data"], benetech_kwargs["x_values_type"]
+        )
+        benetech_kwargs["y_data"] = convert_string_to_axis_data(
+            benetech_kwargs["y_data"], benetech_kwargs["y_values_type"]
+        )
+        return BenetechOutput(**benetech_kwargs)
+def get_annotation_ground_truth_str(annotation: Annotation):
+    benetech_output = BenetechOutput(
+        chart_type=annotation.chart_type,
+        x_values_type=annotation.axes.x_axis.values_type,
+        x_data=[dp.x for dp in annotation.data_series],
+        y_values_type=annotation.axes.y_axis.values_type,
+        y_data=[dp.y for dp in annotation.data_series],
+    )
+    return benetech_output.to_string()
+def get_annotation_ground_truth_str_from_image_index(image_index: int) -> str:
+    return get_annotation_ground_truth_str(Annotation.from_image_index(image_index))
+class Dataset(torch.utils.data.Dataset):
+    def __init__(self, indices: list[int]):
+        super().__init__()
+        self.indices = indices
+        self.to_tensor = torchvision.transforms.ToTensor()
+    def __len__(self):
+        return len(self.indices)
+    def __getitem__(self, idx: int) -> DataItem:
+        data_index = self.indices[idx]
+        annotated_image = AnnotatedImage.from_image_index(data_index)
+        image = annotated_image.image
+        image = self.to_tensor(image)
+        target_string = get_annotation_ground_truth_str(annotated_image.annotation)
+        return DataItem(image=image, target_string=target_string, data_index=data_index)
+def get_train_val_datasets():
+    train_indices, val_indices = load_pickle_or_build_object_and_save(
+        CONFIG.train_val_indices_path,
+        lambda: get_train_val_split_indices(CONFIG.val_fraction, CONFIG.seed),
+    )
+    return Dataset(train_indices), Dataset(val_indices)
+def get_train_dataset():
+    return get_train_val_datasets()[0]
+def get_val_dataset():
+    return get_train_val_datasets()[1]
+@dataclasses.dataclass
+class Batch:
+    images: torch.FloatTensor
+    labels: torch.IntTensor
+    data_indices: list[int]
+    def __post_init__(self):
+        if CONFIG.debug:
+            images_shape = einops.parse_shape(self.images, "batch channel height width")
+            labels_shape = einops.parse_shape(self.labels, "batch label")
+            assert images_shape["batch"] == labels_shape["batch"]
+            assert len(self.data_indices) == images_shape["batch"]
+class Split(enum.Enum):
+    """Dataset split selector: training or validation."""
+    train = "train"
+    val = "val"
+# Signature of a collate callable: (list of DataItem, Split) -> Batch.
+BatchCollateFunction = Callable[[list[DataItem], Split], Batch]
+def build_dataloader(split: Split, batch_collate_function: BatchCollateFunction):
+    return torch.utils.data.DataLoader(
+        get_train_dataset() if split == Split.train else get_val_dataset(),
+        batch_size=CONFIG.batch_size,
+        shuffle=split == Split.train,
+        num_workers=CONFIG.num_workers,
+        collate_fn=functools.partial(batch_collate_function, split=split),
+    )

metrics.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import numpy as np
+import rapidfuzz
+import sklearn
+from data import ValuesType, BenetechOutput, Annotation
+def normalized_rmse(expected: list[float], predicted: list[float]) -> float:
+    return (1 - sklearn.metrics.r2_score(expected, predicted)) ** 0.5
+def normalized_levenshtein_distance(expected: list[str], predicted: list[str]) -> float:
+    total_distance = 0
+    for e, p in zip(expected, predicted):
+        total_distance += rapidfuzz.distance.Levenshtein.distance(e, p)
+    total_length = np.sum([len(e) for e in expected])
+    return total_distance / total_length
+def sigmoid(x):
+    return 1 / (1 + np.exp(-x))
+def positive_loss_to_score(x):
+    return 2 * sigmoid(-x)
+def score_axis_values(values_type, expected, predicted):
+    if values_type == ValuesType.numerical:
+        loss = normalized_rmse(expected, predicted)
+    else:
+        loss = normalized_levenshtein_distance(expected, predicted)
+    return positive_loss_to_score(loss)
+def benetech_score(expected: BenetechOutput, predicted: BenetechOutput) -> float:
+    if expected.get_main_characteristics() != predicted.get_main_characteristics():
+        return 0
+    x_score = score_axis_values(
+        expected.x_values_type, expected.x_data, predicted.x_data
+    )
+    y_score = score_axis_values(
+        expected.y_values_type, expected.y_data, predicted.y_data
+    )
+    return (x_score + y_score) / 2
+def benetech_score_string_prediction(expected_data_index: int, predicted_string: str):
+    if not BenetechOutput.does_string_match_expected_pattern(predicted_string):
+        return 0
+    expected_annotation = Annotation.from_image_index(expected_data_index)
+    expected_output = BenetechOutput.from_annotation(expected_annotation)
+    predicted_output = BenetechOutput.from_string(predicted_string)
+    return benetech_score(expected_output, predicted_output)

model.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import collections
+import dataclasses
+import types
+import pytorch_lightning as pl
+import torch.utils.data
+import transformers
+from data import (
+    generate_annotated_images,
+    get_annotation_ground_truth_str,
+    DataItem,
+    get_extra_tokens,
+    Batch,
+    Split,
+    BatchCollateFunction,
+)
+from utils import load_pickle_or_build_object_and_save
+@dataclasses.dataclass
+class Model:
+    """Bundle of the objects produced by ``build_model``."""
+    # Donut processor used to turn images into pixel values.
+    processor: transformers.models.donut.processing_donut.DonutProcessor
+    # The processor's tokenizer, extended with the extra tokens.
+    tokenizer: transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast
+    encoder_decoder: transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder.VisionEncoderDecoderModel
+    # Callable that collates DataItems into a Batch.
+    batch_collate_function: BatchCollateFunction
+    # The configuration object ``build_model`` was called with.
+    config: types.SimpleNamespace
+def add_unknown_tokens_to_tokenizer(
+    tokenizer, encoder_decoder, unknown_tokens: list[str]
+):
+    tokenizer.add_tokens(unknown_tokens)
+    encoder_decoder.decoder.resize_token_embeddings(len(tokenizer))
+def find_unknown_tokens_for_tokenizer(tokenizer) -> collections.Counter:
+    unknown_tokens_counter = collections.Counter()
+    for annotated_image in generate_annotated_images():
+        ground_truth = get_annotation_ground_truth_str(annotated_image.annotation)
+        input_ids = tokenizer(ground_truth).input_ids
+        tokens = tokenizer.tokenize(ground_truth, add_special_tokens=True)
+        for token_id, token in zip(input_ids, tokens, strict=True):
+            if token_id == tokenizer.unk_token_id:
+                unknown_tokens_counter.update([token])
+    return unknown_tokens_counter
+def replace_pad_token_id_with_negative_hundred_for_hf_transformers_automatic_batch_transformation(
+    tokenizer, token_ids
+):
+    token_ids[token_ids == tokenizer.pad_token_id] = -100
+    return token_ids
+@dataclasses.dataclass
+class BatchCollateFunction:
+    processor: transformers.models.donut.processing_donut.DonutProcessor
+    tokenizer: transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast
+    decoder_sequence_max_length: int
+    def __call__(self, batch: list[DataItem], split: Split) -> Batch:
+        images = [di.image for di in batch]
+        images = self.processor(
+            images, random_padding=split == Split.train, return_tensors="pt"
+        ).pixel_values
+        target_token_ids = self.tokenizer(
+            [di.target_string for di in batch],
+            add_special_tokens=False,
+            max_length=self.decoder_sequence_max_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+        ).input_ids
+        labels = replace_pad_token_id_with_negative_hundred_for_hf_transformers_automatic_batch_transformation(
+            self.tokenizer, target_token_ids
+        )
+        data_indices = [di.data_index for di in batch]
+        return Batch(images=images, labels=labels, data_indices=data_indices)
+def build_model(config: types.SimpleNamespace or object) -> Model:
+    donut_processor = transformers.DonutProcessor.from_pretrained(
+        config.pretrained_model_name
+    )
+    donut_processor.image_processor.size = dict(
+        width=config.image_width, height=config.image_height
+    )
+    donut_processor.image_processor.do_align_long_axis = False
+    tokenizer = donut_processor.tokenizer
+    encoder_decoder_config = transformers.VisionEncoderDecoderConfig.from_pretrained(
+        config.pretrained_model_name
+    )
+    encoder_decoder_config.encoder.image_size = (
+        config.image_width,
+        config.image_height,
+    )
+    encoder_decoder = transformers.VisionEncoderDecoderModel.from_pretrained(
+        config.pretrained_model_name, config=encoder_decoder_config
+    )
+    encoder_decoder_config.pad_token_id = tokenizer.pad_token_id
+    encoder_decoder_config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(
+        get_extra_tokens().benetech_prompt
+    )
+    encoder_decoder_config.bos_token_id = encoder_decoder_config.decoder_start_token_id
+    encoder_decoder_config.eos_token_id = tokenizer.convert_tokens_to_ids(
+        get_extra_tokens().benetech_prompt_end
+    )
+    extra_tokens = list(get_extra_tokens().__dict__.values())
+    add_unknown_tokens_to_tokenizer(tokenizer, encoder_decoder, extra_tokens)
+    unknown_dataset_tokens = load_pickle_or_build_object_and_save(
+        config.unknown_tokens_for_tokenizer_path,
+        lambda: list(find_unknown_tokens_for_tokenizer(tokenizer).keys()),
+    )
+    add_unknown_tokens_to_tokenizer(tokenizer, encoder_decoder, unknown_dataset_tokens)
+    tokenizer.eos_token_id = encoder_decoder_config.eos_token_id
+    batch_collate_function = BatchCollateFunction(
+        processor=donut_processor,
+        tokenizer=tokenizer,
+        decoder_sequence_max_length=config.decoder_sequence_max_length,
+    )
+    return Model(
+        processor=donut_processor,
+        tokenizer=tokenizer,
+        encoder_decoder=encoder_decoder,
+        batch_collate_function=batch_collate_function,
+        config=config,
+    )
+def generate_token_strings(
+    model: Model, images: torch.Tensor, skip_special_tokens=True
+) -> list[str]:
+    decoder_output = model.encoder_decoder.generate(
+        images,
+        max_length=10
+        if model.config.debug
+        else model.config.decoder_sequence_max_length,
+        eos_token_id=model.tokenizer.eos_token_id,
+        return_dict_in_generate=True,
+    )
+    return model.tokenizer.batch_decode(
+        decoder_output.sequences, skip_special_tokens=skip_special_tokens
+    )
+def predict_string(image, model: Model):
+    image = model.processor(
+        image, random_padding=False, return_tensors="pt"
+    ).pixel_values
+    string = generate_token_strings(model, image)[0]
+    return string
+class LightningModule(pl.LightningModule):
+    def __init__(self, config):
+        super().__init__()
+        self.save_hyperparameters()
+        self.model = build_model(config)
+        self.encoder_decoder = self.model.encoder_decoder
+    def training_step(self, batch: Batch, batch_idx: int) -> torch.Tensor:
+        loss = self.compute_loss(batch)
+        self.log("train_loss", loss)
+        return loss
+    def validation_step(self, batch: Batch, batch_idx: int, dataset_idx: int = 0):
+        loss = self.compute_loss(batch)
+        self.log("val_loss", loss)
+    def compute_loss(self, batch: Batch) -> torch.Tensor:
+        outputs = self.encoder_decoder(pixel_values=batch.images, labels=batch.labels)
+        loss = outputs.loss
+        return loss
+    def configure_optimizers(self) -> torch.optim.Optimizer:
+        optimizer = torch.optim.Adam(
+            self.parameters(), lr=self.hparams["config"].learning_rate
+        )
+        return optimizer

requirements.txt DELETED Viewed

@@ -1,3 +0,0 @@
-gradio==3.27.0
-torch==2.0.0
-transformers==4.26.1

train.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import os
+import pandas as pd
+import pytorch_lightning as pl
+import transformers
+import wandb
+from config import CONFIG
+from data import (
+    get_annotation_ground_truth_str_from_image_index,
+    load_train_image_ids,
+    build_dataloader,
+    Split,
+    Batch,
+)
+from metrics import benetech_score_string_prediction
+from model import generate_token_strings, LightningModule
+from utils import set_tokenizers_parallelism, set_torch_device_order_pci_bus
+class MetricsCallback(pl.callbacks.Callback):
+    """Callback that logs generation metrics and a prediction table to wandb."""
+    def on_validation_batch_start(
+        self, trainer, pl_module, batch: Batch, batch_idx, dataloader_idx=0
+    ):
+        """Generate predictions for ``batch`` and log them to wandb.
+        For every sample the Benetech score of the generated string is logged
+        under ``benetech_score``; afterwards one table holding the image ids,
+        ground-truth strings and predicted strings is logged under ``strings``.
+        """
+        predicted_strings = generate_token_strings(pl_module.model, images=batch.images)
+        # strict=True: a mismatch between batch size and number of generated
+        # strings raises instead of being silently truncated.
+        for expected_data_index, predicted_string in zip(
+            batch.data_indices, predicted_strings, strict=True
+        ):
+            benetech_score = benetech_score_string_prediction(
+                expected_data_index=expected_data_index,
+                predicted_string=predicted_string,
+            )
+            wandb.log(dict(benetech_score=benetech_score))
+        ground_truth_strings = [
+            get_annotation_ground_truth_str_from_image_index(i)
+            for i in batch.data_indices
+        ]
+        # Fetch the id list once per batch rather than once per sample.
+        train_image_ids = load_train_image_ids()
+        string_ids = [train_image_ids[i] for i in batch.data_indices]
+        strings_dataframe = pd.DataFrame(
+            dict(
+                string_ids=string_ids,
+                ground_truth=ground_truth_strings,
+                predicted=predicted_strings,
+            )
+        )
+        wandb.log(dict(strings=wandb.Table(dataframe=strings_dataframe)))
+class TransformersPreTrainedModelsCheckpointIO(pl.plugins.CheckpointIO):
+    """Checkpoint plugin that stores checkpoints via ``save_pretrained``.
+    Every object in ``pretrained_models`` is written to the checkpoint path with
+    ``save_pretrained`` and restored with ``from_pretrained``; the ``checkpoint``
+    dict handed over by Lightning is not written.
+    """
+    def __init__(
+        self, pretrained_models: list[transformers.modeling_utils.PreTrainedModel]
+    ):
+        super().__init__()
+        self.pretrained_models = pretrained_models
+    def save_checkpoint(self, checkpoint, path, storage_options=None):
+        """Save every tracked object into ``path`` (``checkpoint`` itself is ignored)."""
+        for pretrained_model in self.pretrained_models:
+            pretrained_model.save_pretrained(path)
+    def load_checkpoint(self, path, storage_options=None):
+        """Replace every tracked object with the one loaded from ``path``."""
+        # NOTE(review): this returns None, while Lightning's CheckpointIO
+        # interface documents a returned checkpoint dict - confirm that resuming
+        # through the Trainer is not relied on.
+        self.pretrained_models = [
+            pm.from_pretrained(path) for pm in self.pretrained_models
+        ]
+    def remove_checkpoint(self, path):
+        """Delete the checkpoint at ``path``, whether it is a directory or a file."""
+        import shutil
+        # ``save_pretrained`` writes a directory, which ``os.remove`` cannot
+        # delete; remove the whole tree in that case.
+        if os.path.isdir(path):
+            shutil.rmtree(path)
+        else:
+            os.remove(path)
+def train():
+    """Fit the Lightning module built from ``CONFIG`` on the train and val splits.
+    Sets the tokenizer-parallelism and CUDA device-order environment variables,
+    then runs ``Trainer.fit`` with a ``val_loss``-monitoring ModelCheckpoint, the
+    metrics callback, a wandb logger and the ``save_pretrained``-based
+    checkpoint plugin.
+    """
+    set_tokenizers_parallelism(False)
+    set_torch_device_order_pci_bus()
+    pl_module = LightningModule(CONFIG)
+    checkpoint_callback = pl.callbacks.ModelCheckpoint(
+        dirpath=CONFIG.training_directory,
+        monitor="val_loss",
+        save_top_k=CONFIG.save_top_k_checkpoints,
+    )
+    metrics_callback = MetricsCallback()
+    wandb_logger = pl.loggers.WandbLogger(
+        project=CONFIG.wandb_project_name, save_dir=CONFIG.training_directory
+    )
+    checkpoint_io = TransformersPreTrainedModelsCheckpointIO(
+        [pl_module.model.processor, pl_module.model.encoder_decoder]
+    )
+    trainer = pl.Trainer(
+        accelerator=CONFIG.accelerator,
+        devices=CONFIG.devices,
+        plugins=[checkpoint_io],
+        callbacks=[checkpoint_callback, metrics_callback],
+        logger=wandb_logger,
+        limit_train_batches=CONFIG.limit_train_batches,
+        limit_val_batches=CONFIG.limit_val_batches,
+    )
+    # Both splits are collated with the function bundled with the model.
+    train_dataloader = build_dataloader(
+        Split.train, pl_module.model.batch_collate_function
+    )
+    val_dataloader = build_dataloader(
+        Split.val, pl_module.model.batch_collate_function
+    )
+    trainer.fit(
+        model=pl_module,
+        train_dataloaders=train_dataloader,
+        val_dataloaders=val_dataloader,
+    )
+if __name__ == "__main__":
+    train()

utils.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import os
+import pickle
+from typing import Callable, TypeVar
+T = TypeVar("T")
+def set_tokenizers_parallelism(enable: bool):
+    """Set the ``TOKENIZERS_PARALLELISM`` environment variable to "true" or "false"."""
+    if enable:
+        flag = "true"
+    else:
+        flag = "false"
+    os.environ["TOKENIZERS_PARALLELISM"] = flag
+def set_torch_device_order_pci_bus():
+    """Set the ``CUDA_DEVICE_ORDER`` environment variable to ``PCI_BUS_ID``."""
+    os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID"})
+def load_pickle_or_build_object_and_save(
+    pickle_path: str, build_object: Callable[[], T], overwrite=False
+) -> T:
+    """Return the object cached at ``pickle_path``, building and saving it if needed.
+    Args:
+        pickle_path: Location of the pickle cache file.
+        build_object: Zero-argument factory called when the cache is missing
+            or ``overwrite`` is set.
+        overwrite: Rebuild and rewrite the cache even if the file exists.
+    Returns:
+        The object read back from ``pickle_path`` (after a fresh build this is
+        the unpickled copy, not the instance returned by ``build_object``).
+    """
+    # NOTE: pickle can execute arbitrary code on load; only point this at
+    # cache files written by this project.
+    if overwrite or not os.path.exists(pickle_path):
+        built_object = build_object()
+        # Close the file before reading it back so the data is fully flushed.
+        with open(pickle_path, "wb") as pickle_file:
+            pickle.dump(built_object, pickle_file)
+    else:
+        print(f"Reusing object {pickle_path}.")
+    with open(pickle_path, "rb") as pickle_file:
+        return pickle.load(pickle_file)