Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -98,7 +98,6 @@
|
|
| 98 |
# demo.launch()
|
| 99 |
|
| 100 |
import re
|
| 101 |
-
import json
|
| 102 |
import gradio as gr
|
| 103 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 104 |
from PIL import Image
|
|
@@ -107,28 +106,7 @@ from PIL import Image
|
|
| 107 |
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
| 108 |
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
| 109 |
|
| 110 |
-
def
|
| 111 |
-
# Remove unwanted tags like <otsl>, </otsl>, <loc_...>
|
| 112 |
-
cleaned = re.sub(r"</?otsl>|<loc_[^>]+>", "", docling_text)
|
| 113 |
-
|
| 114 |
-
# Split by line break <nl>
|
| 115 |
-
lines = cleaned.split("<nl>")
|
| 116 |
-
table = []
|
| 117 |
-
for line in lines:
|
| 118 |
-
if not line.strip():
|
| 119 |
-
continue
|
| 120 |
-
# Extract all <fcel> values
|
| 121 |
-
cells = re.findall(r"<fcel>([^<]+)", line)
|
| 122 |
-
# Convert to floats if possible
|
| 123 |
-
try:
|
| 124 |
-
row = [float(cell) for cell in cells]
|
| 125 |
-
except ValueError:
|
| 126 |
-
# If conversion fails, keep as string
|
| 127 |
-
row = cells
|
| 128 |
-
table.append(row)
|
| 129 |
-
return json.dumps(table, indent=2)
|
| 130 |
-
|
| 131 |
-
def smoldocling_readimage(image, prompt_text):
|
| 132 |
messages = [
|
| 133 |
{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
|
| 134 |
]
|
|
@@ -138,21 +116,49 @@ def smoldocling_readimage(image, prompt_text):
|
|
| 138 |
prompt_length = inputs.input_ids.shape[1]
|
| 139 |
generated = outputs[:, prompt_length:]
|
| 140 |
result = processor.batch_decode(generated, skip_special_tokens=False)[0]
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
demo = gr.Interface(
|
| 148 |
-
fn=
|
| 149 |
inputs=[
|
| 150 |
-
gr.Image(type="pil", label="Upload Image"),
|
| 151 |
-
gr.
|
| 152 |
],
|
| 153 |
-
outputs="
|
| 154 |
-
title="SmolDocling
|
| 155 |
-
description="Upload
|
| 156 |
)
|
| 157 |
|
| 158 |
demo.launch()
|
|
|
|
| 98 |
# demo.launch()
|
| 99 |
|
| 100 |
import re
|
|
|
|
| 101 |
import gradio as gr
|
| 102 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
| 103 |
from PIL import Image
|
|
|
|
| 106 |
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
| 107 |
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
| 108 |
|
| 109 |
+
def smoldocling_readimage(image, prompt_text="Convert to docling"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
messages = [
|
| 111 |
{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
|
| 112 |
]
|
|
|
|
| 116 |
prompt_length = inputs.input_ids.shape[1]
|
| 117 |
generated = outputs[:, prompt_length:]
|
| 118 |
result = processor.batch_decode(generated, skip_special_tokens=False)[0]
|
| 119 |
+
return result.replace("<end_of_utterance>", "").strip()
|
| 120 |
+
|
| 121 |
+
# Docling markup tags such as <loc_12>, <fcel>, <nl> or </otsl>. Their names
# may contain digits that are not document data.
_DOCLING_TAG_RE = re.compile(r"</?[A-Za-z_]\w*>")

# An optionally signed integer or decimal. The sign is only accepted when it
# is not glued to a preceding word character, "." or ")", so "10-20" reads as
# 10 and 20 rather than 10 and -20.
_NUMBER_RE = re.compile(r"(?:(?<![\w.)])[-+])?(?:\d*\.\d+|\d+)")


def extract_numbers(docling_text):
    """Return every numeric value found in *docling_text* as a list of floats.

    Markup tags are blanked out before scanning so that digits embedded in
    tag names (for example ``<loc_12>``) are not mistaken for data. Numbers
    are returned in order of appearance; an empty or number-free string
    yields an empty list.

    Args:
        docling_text: Decoded model output, possibly containing markup tags.

    Returns:
        list[float]: The numbers in the text, in document order.
    """
    # Replace tags with a space (not "") so values on either side of a tag
    # cannot fuse into one number, e.g. "12<fcel>34".
    text_only = _DOCLING_TAG_RE.sub(" ", docling_text)
    return [float(token) for token in _NUMBER_RE.findall(text_only)]
|
| 125 |
+
|
| 126 |
+
def compare_outputs(img1, img2):
    """Run both images through the model and report how closely their numbers agree.

    Each image is converted to text with ``smoldocling_readimage`` and the
    numeric values are pulled out with ``extract_numbers``. Values are
    compared position by position; two values match when they differ by
    less than 1e-3. The accuracy is the number of matches divided by the
    length of the longer number list, so surplus values on either side count
    as mismatches.

    Args:
        img1: First image, as delivered by the UI (``None`` if not provided).
        img2: Second image, as delivered by the UI (``None`` if not provided).

    Returns:
        str: A report containing both raw model outputs, the similarity
        percentage and the match count, or a short prompt asking for both
        images when either one is missing.
    """
    # An image slot left empty arrives as None; bail out early instead of
    # handing None to the model and surfacing an opaque traceback.
    if img1 is None or img2 is None:
        return "Please upload both images before comparing."

    output1 = smoldocling_readimage(img1)
    output2 = smoldocling_readimage(img2)

    nums1 = extract_numbers(output1)
    nums2 = extract_numbers(output2)

    # zip stops at the shorter list, which is exactly the overlapping range
    # in which a positional comparison is meaningful.
    matches = sum(1 for a, b in zip(nums1, nums2) if abs(a - b) < 1e-3)

    total = max(len(nums1), len(nums2))
    # With no numbers on either side there is nothing to compare; report 0.
    accuracy = (matches / total) * 100 if total > 0 else 0

    return (
        f"Output for Image 1:\n{output1}\n\n"
        f"Output for Image 2:\n{output2}\n\n"
        f"Similarity Accuracy: {accuracy:.2f}%\n"
        f"Matching Values: {matches} out of {total}"
    )
|
| 151 |
+
|
| 152 |
+
# Gradio UI: take 2 images, output similarity report
|
| 153 |
# Two PIL image inputs, one per document being compared.
_image_inputs = [
    gr.Image(type="pil", label="Upload Image 1"),
    gr.Image(type="pil", label="Upload Image 2"),
]

# Wire the comparison function to a simple two-image, text-output interface.
demo = gr.Interface(
    fn=compare_outputs,
    inputs=_image_inputs,
    outputs="text",
    title="SmolDocling Image Comparison",
    description="Upload two document images. This app extracts data from both and compares similarity.",
)

# Start the web app as soon as the module is executed.
demo.launch()
|