Spaces:

vk888
/

Paligemma2_VLM_DocVQA

Sleeping

App Files Files Community

vk committed on Jun 19

Commit

5919b75

1 Parent(s): c0118f4

first commit

Browse files

Files changed (5) hide show

.gitattributes +1 -0
.idea/.gitignore +3 -0
app.py +64 -0
requirements.txt +5 -0
utils.py +41 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example/invoice1.png filter=lfs diff=lfs merge=lfs -text

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

app.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import gradio as gr
+from peft import PeftModel, PeftConfig
+from transformers import PaliGemmaForConditionalGeneration
+import torch
+from transformers import PaliGemmaProcessor
+import PIL
+from utils import parse_bbox_and_labels,display_boxes
+def get_response(
+        image: PIL.Image.Image,
+        prompt: str,
+        max_new_tokens: str
+) -> tuple:
+    """Run the PEFT-wrapped PaliGemma model on one image/prompt pair.
+
+    Uses the module-level ``processor``, ``peft_model`` and ``device`` that
+    the ``__main__`` section of this file creates before launching the UI.
+
+    Args:
+        image: Input image; it is converted to RGB before processing.
+        prompt: Prompt text handed to the processor unchanged.
+        max_new_tokens: Generation budget, normally the text typed into the
+            UI textbox (an int is accepted too). A blank or missing value
+            falls back to 80, the number shown as the textbox placeholder.
+
+    Returns:
+        A ``(text, image)`` tuple: the decoded completion without the prompt
+        tokens, and the RGB image, with red boxes drawn on it when the
+        completion contains ``<loc...>`` tokens.
+
+    Raises:
+        ValueError: If ``max_new_tokens`` is non-blank but not an integer.
+    """
+    # An untouched Textbox delivers "", and int("") would raise ValueError.
+    is_blank = max_new_tokens is None or (
+        isinstance(max_new_tokens, str) and not max_new_tokens.strip()
+    )
+    token_budget = 80 if is_blank else int(max_new_tokens)
+    raw_image = image.convert("RGB")
+    width, height = raw_image.size
+    inputs = processor(raw_image, prompt, return_tensors="pt").to(device)
+    with torch.inference_mode():
+        generated = peft_model.generate(**inputs, max_new_tokens=token_budget)
+    # Skip the first input_len tokens so only the newly generated part is decoded.
+    input_len = inputs["input_ids"].shape[-1]
+    output = processor.decode(generated[0][input_len:], skip_special_tokens=True)
+    print(output)
+    # Only text carrying "<loc" tokens can match the box parser's pattern.
+    if "<loc" in output:
+        boxes, labels = parse_bbox_and_labels(output)
+        raw_image = display_boxes(raw_image, boxes, labels, target_size=(width, height))
+    return output, raw_image
+if __name__ == "__main__":
+    # Checkpoint identifiers: the PEFT adapter repo and the processor repo.
+    peft_model_id = "vk888/paligemma_vqav2"
+    model_id = "google/paligemma2-3b-pt-448"
+    # Everything runs on CPU. A 4-bit BitsAndBytesConfig passed as
+    # quantization_config was the (disabled) GPU alternative.
+    device = torch.device("cpu")
+    # The adapter config names the base checkpoint the adapter sits on.
+    config = PeftConfig.from_pretrained(peft_model_id)
+    base_model = PaliGemmaForConditionalGeneration.from_pretrained(
+        config.base_model_name_or_path,
+        device_map=device,
+    )
+    peft_model = PeftModel.from_pretrained(base_model, peft_model_id)
+    processor = PaliGemmaProcessor.from_pretrained(model_id)
+    # Example rows follow the input order: image path, prompt, token budget.
+    example_image = "example/invoice1.png"
+    example_prompts = (
+        "<image>answer en what is the balance due ?\n",
+        "<image>detect signature\n",
+        "<image>answer en what is the rate cada of design ?\n",
+    )
+    examples = [[example_image, example_prompt, 80] for example_prompt in example_prompts]
+    # Input widgets, in the same order as get_response's parameters.
+    image_input = gr.Image(type="pil")
+    prompt_input = gr.Textbox(placeholder="<image>answer en what is the balance due ?\n")
+    tokens_input = gr.Textbox(placeholder="80")
+    iface = gr.Interface(
+        fn=get_response,
+        inputs=[image_input, prompt_input, tokens_input],
+        outputs=[gr.Textbox(), gr.Image(type="pil")],
+        examples=examples,
+        cache_examples=False,
+        title="DocVQA with Paligemma2 VLM",
+        description="DocVQA with Paligemma2 VLM",
+    )
+    iface.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+--index-url https://download.pytorch.org/whl/cpu
+torch
+transformers==4.53.0.dev0
+peft

utils.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import re
+import numpy as np
+from PIL import ImageDraw
+def parse_bbox_and_labels(detokenized_output: str):
+  """Extract bounding boxes and labels from detection-style model text.
+
+  A detection is four ``<locNNNN>`` tokens (four digits each) in the order
+  y0, x0, y1, x1, then a space and a label. The label ends at `` ;`` or at
+  the end of the text. Each coordinate is divided by 1024.
+
+  Args:
+    detokenized_output: Decoded model output to scan.
+
+  Returns:
+    A ``(boxes, labels)`` tuple of NumPy arrays. ``boxes`` is a float array
+    of shape ``(n, 4)`` with rows ``[y0, x0, y1, x1]``; ``labels`` is a
+    string array of length ``n``. With no detections the shapes are
+    ``(0, 4)`` and ``(0,)``.
+  """
+  # Raw strings keep "\d" a regex escape rather than an invalid str escape.
+  pattern = re.compile(
+      r'<loc(?P<y0>\d\d\d\d)><loc(?P<x0>\d\d\d\d)><loc(?P<y1>\d\d\d\d)><loc(?P<x1>\d\d\d\d)>'
+      r' (?P<label>.+?)( ;|$)'
+  )
+  boxes, labels = [], []
+  for match in pattern.finditer(detokenized_output):
+    fields = match.groupdict()
+    boxes.append([float(fields[key]) / 1024.0 for key in ('y0', 'x0', 'y1', 'x1')])
+    labels.append(fields['label'])
+  # reshape keeps four columns even when nothing matched.
+  return np.array(boxes, dtype=float).reshape(-1, 4), np.array(labels, dtype=str)
+def display_boxes(image, boxes, labels, target_size):
+  """Draw a red rectangle on ``image`` for every normalized box.
+
+  Args:
+    image: PIL image to draw on. It is modified in place and also returned.
+    boxes: Array whose rows are ``[y0, x0, y1, x1]`` given as fractions of
+      the image size.
+    labels: Accepted so callers can pass the parsed labels; not drawn.
+    target_size: ``(width, height)`` in pixels used to scale the boxes.
+
+  Returns:
+    The same ``image`` object with the rectangles drawn on it.
+  """
+  img_w, img_h = target_size
+  draw = ImageDraw.Draw(image)
+  for box in boxes:
+    # y fractions scale with the height, x fractions with the width.
+    y0, x0, y1, x1 = box[0] * img_h, box[1] * img_w, box[2] * img_h, box[3] * img_w
+    # Order the corners first: Pillow can reject x1 < x0 or y1 < y0.
+    left, right = sorted((x0, x1))
+    top, bottom = sorted((y0, y1))
+    draw.rectangle((left, top, right, bottom), outline="red", width=3)
+  return image