prithivMLmods committed on
Commit 4718f93 · verified · 1 Parent(s): 521380a
Files changed (1)
  1. app.py +512 -0
app.py ADDED
@@ -0,0 +1,512 @@
import spaces
import json
import math
import os
import traceback
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple
import re
import time
from threading import Thread
import uuid
import tempfile

import gradio as gr
import requests
import torch
from PIL import Image
import fitz
import numpy as np
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode


from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoModelForImageTextToText,
    AutoModel,
    AutoProcessor,
    TextIteratorStreamer,
    AutoTokenizer,
    LlavaOnevisionForConditionalGeneration,
    LlavaOnevisionProcessor,
)

from transformers.image_utils import load_image as hf_load_image

from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
from reportlab.lib.units import inch

# --- Constants and Model Setup ---
MAX_INPUT_TOKEN_LENGTH = 4096
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("current device:", torch.cuda.current_device())
    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

print("Using device:", device)

# --- InternVL3_5-2B-MPO Preprocessing Functions ---
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

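# The helpers below implement InternVL-style dynamic tiling: the input image is
# resized to the closest supported aspect ratio, split into up to `max_num`
# 448x448 crops normalized with ImageNet statistics, and optionally extended
# with a full-image thumbnail tile.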
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image_internvl(image, input_size=448, max_num=12):
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(img) for img in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

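# Note: every checkpoint in the following section is loaded eagerly at startup
# in half precision (fp16/bf16), moved to the selected device, and kept in eval mode.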
# --- Model Loading ---
MODEL_ID_M = "LiquidAI/LFM2-VL-450M"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_M, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

MODEL_ID_T = "LiquidAI/LFM2-VL-1.6B"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_T, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

MODEL_ID_C = "HuggingFaceTB/SmolVLM-Instruct-250M"
processor_c = AutoProcessor.from_pretrained(MODEL_ID_C, trust_remote_code=True)
model_c = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID_C, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
).to(device).eval()

MODEL_ID_G = "echo840/MonkeyOCR-pro-1.2B"
SUBFOLDER = "Recognition"
processor_g = AutoProcessor.from_pretrained(
    MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER
)
model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
).to(device).eval()

MODEL_ID_I = "UCSC-VLAA/VLAA-Thinker-Qwen2VL-2B"
processor_i = AutoProcessor.from_pretrained(MODEL_ID_I, trust_remote_code=True)
model_i = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

MODEL_ID_A = "nanonets/Nanonets-OCR-s"
processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
model_a = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_A, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

MODEL_ID_X = "prithivMLmods/Megalodon-OCR-Sync-0713"
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_X, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

# --- Moondream2 Model Loading ---
MODEL_ID_MD = "vikhyatk/moondream2"
REVISION_MD = "2025-06-21"
moondream = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_MD,
    revision=REVISION_MD,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda"},
)
tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD, revision=REVISION_MD)

# --- Qwen2.5-VL-3B-Abliterated-Caption-it ---
MODEL_ID_N = "prithivMLmods/Qwen2.5-VL-3B-Abliterated-Caption-it"
processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_N, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

# --- LMM-R1-MGT-PerceReason ---
MODEL_ID_F = "VLM-Reasoner/LMM-R1-MGT-PerceReason"
processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_F, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

# TencentBAC/TBAC-VLR1-3B
# Loaded under its own suffix (_j) so it does not overwrite the MonkeyOCR handles above.
MODEL_ID_J = "TencentBAC/TBAC-VLR1-3B"
processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True)
model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_J, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

# OCRFlux-3B
MODEL_ID_V = "ChatDOC/OCRFlux-3B"
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_V, trust_remote_code=True, torch_dtype=torch.float16
).to(device).eval()

MODEL_ID_O = "HuggingFaceTB/SmolVLM-500M-Instruct"
processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
model_o = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
).to(device).eval()

# --- New Model: llava-onevision ---
MODEL_ID_LO = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
processor_lo = LlavaOnevisionProcessor.from_pretrained(MODEL_ID_LO)
model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
    MODEL_ID_LO,
    torch_dtype=torch.float16
).to(device).eval()

# --- OpenGVLab/InternVL3_5-2B-MPO ---
MODEL_ID_IV = 'OpenGVLab/InternVL3_5-2B-MPO'
model_iv = AutoModel.from_pretrained(
    MODEL_ID_IV,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto").eval()
tokenizer_iv = AutoTokenizer.from_pretrained(MODEL_ID_IV, trust_remote_code=True, use_fast=False)


# --- PDF Generation and Preview Utility Function ---
def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
    """
    Generates a PDF, saves it, and then creates image previews of its pages.
    Returns the path to the PDF and a list of paths to the preview images.
    """
    if image is None or not text_content or not text_content.strip():
        raise gr.Error("Cannot generate PDF. Image or text content is missing.")

    # --- 1. Generate the PDF ---
    temp_dir = tempfile.gettempdir()
    pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
    doc = SimpleDocTemplate(
        pdf_filename,
        pagesize=A4,
        rightMargin=inch, leftMargin=inch,
        topMargin=inch, bottomMargin=inch
    )
    styles = getSampleStyleSheet()
    style_normal = styles["Normal"]
    style_normal.fontSize = int(font_size)
    style_normal.leading = int(font_size) * line_spacing
    style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]

    story = []

    img_buffer = BytesIO()
    image.save(img_buffer, format='PNG')
    img_buffer.seek(0)

    page_width, _ = A4
    available_width = page_width - 2 * inch
    image_widths = {
        "Small": available_width * 0.3,
        "Medium": available_width * 0.6,
        "Large": available_width * 0.9,
    }
    img_width = image_widths[image_size]
    img = RLImage(img_buffer, width=img_width, height=image.height * (img_width / image.width))
    story.append(img)
    story.append(Spacer(1, 12))

    cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")
    text_paragraphs = cleaned_text.split('\n')

    for para in text_paragraphs:
        if para.strip():
            story.append(Paragraph(para, style_normal))

    doc.build(story)

    # --- 2. Render PDF pages as images for preview ---
    preview_images = []
    try:
        pdf_doc = fitz.open(pdf_filename)
        for page_num in range(len(pdf_doc)):
            page = pdf_doc.load_page(page_num)
            pix = page.get_pixmap(dpi=150)
            preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
            pix.save(preview_img_path)
            preview_images.append(preview_img_path)
        pdf_doc.close()
    except Exception as e:
        print(f"Error generating PDF preview: {e}")

    return pdf_filename, preview_images


# --- Core Application Logic ---
@spaces.GPU
def process_document_stream(
    model_name: str,
    image: Image.Image,
    prompt_input: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float
):
    """
    Main generator function that handles model inference tasks with advanced generation parameters.
    """
    if image is None:
        yield "Please upload an image.", ""
        return
    if not prompt_input or not prompt_input.strip():
        yield "Please enter a prompt.", ""
        return

    # --- Special Handling for Moondream2 ---
    if model_name == "Moondream2(vision)":
        image_embeds = moondream.encode_image(image)
        answer = moondream.answer_question(
            image_embeds=image_embeds,
            question=prompt_input,
            tokenizer=tokenizer_md
        )
        yield answer, answer
        return

    # --- Special Handling for InternVL ---
    if model_name == "OpenGVLab/InternVL3_5-2B-MPO":
        pixel_values = load_image_internvl(image, max_num=12).to(torch.bfloat16).to(device)
        generation_config = dict(
            max_new_tokens=max_new_tokens,
            do_sample=True if temperature > 0 else False,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
        )
        question = f"<image>\n{prompt_input}"
        response = model_iv.chat(tokenizer_iv, pixel_values, question, generation_config)
        yield response, response
        return

    processor = None
    model = None

    # --- Special Handling for Llava-OneVision ---
    if model_name == "llava-onevision-qwen2-0.5b-ov-hf(mini)":
        processor, model = processor_lo, model_lo
        prompt = f"<|im_start|>user <image>\n{prompt_input}<|im_end|><|im_start|>assistant"
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch.float16)
    # --- Generic Handling for all other models ---
    else:
        if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
        elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
        elif model_name == "SmolVLM-Instruct-250M(smol)": processor, model = processor_c, model_c
        elif model_name == "MonkeyOCR-pro-1.2B(ocr)": processor, model = processor_g, model_g
        elif model_name == "VLAA-Thinker-Qwen2VL-2B(reason)": processor, model = processor_i, model_i
        elif model_name == "Nanonets-OCR-s(ocr)": processor, model = processor_a, model_a
        elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
        elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
        elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
        elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_j, model_j
        elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
        elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
        else:
            yield "Invalid model selected.", ""
            return

        messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
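
    # Generation runs on a background thread; TextIteratorStreamer yields partial
    # decoded text so the UI can update the raw-output and Markdown views incrementally.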
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "do_sample": True
    }

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
        yield buffer, buffer

    yield buffer, buffer


# --- Gradio UI Definition ---
def create_gradio_interface():
    """Builds and returns the Gradio web interface."""
    css = """
    .main-container { max-width: 1400px; margin: 0 auto; }
    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
    .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
    #gallery { min-height: 400px; }
    """
    with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
        gr.HTML("""
        <div class="title" style="text-align: center">
            <h1>Tiny VLMs Lab🧪</h1>
            <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
                Tiny VLMs for Image Content Extraction and Understanding
            </p>
        </div>
        """)

        with gr.Row():
            # Left Column (Inputs)
            with gr.Column(scale=1):
                model_choice = gr.Dropdown(
                    choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)",
                             "OpenGVLab/InternVL3_5-2B-MPO", "Megalodon-OCR-Sync-0713(ocr)",
                             "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)",
                             "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
                             "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)",
                             "SmolVLM-500M-Instruct(smol)", "llava-onevision-qwen2-0.5b-ov-hf(mini)"],
                    label="Select Model", value="LFM2-VL-450M(fast)"
                )

                prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter the prompt")
                image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])

                with gr.Accordion("Advanced Settings (PDF)", open=False):
                    max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                    top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                    top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                    repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

                    gr.Markdown("### PDF Export Settings")
                    font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
                    line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
                    alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Text Alignment")
                    image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")

                process_btn = gr.Button("🚀 Process Image", variant="primary", elem_classes=["process-button"], size="lg")
                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

            # Right Column (Outputs)
            with gr.Column(scale=2):
                with gr.Tabs() as tabs:
                    with gr.Tab("📝 Extracted Content"):
                        raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
                        with gr.Row():
                            examples = gr.Examples(
                                examples=["examples/1.png", "examples/2.png", "examples/3.png",
                                          "examples/4.png", "examples/5.png", "examples/6.png"],
                                inputs=image_input, label="Examples"
                            )
                        gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Tiny-VLMs-Lab/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")

                    with gr.Tab("📰 README.md"):
                        with gr.Accordion("(Result.md)", open=True):
                            markdown_output = gr.Markdown()

                    with gr.Tab("📋 PDF Preview"):
                        generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
                        pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
                        pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")

        # Event Handlers
        def clear_all_outputs():
            return None, "", "Raw output will appear here.", "", None, None

        process_btn.click(
            fn=process_document_stream,
            inputs=[model_choice, image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
            outputs=[raw_output_stream, markdown_output]
        )

        generate_pdf_btn.click(
            fn=generate_and_preview_pdf,
            inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
            outputs=[pdf_output_file, pdf_preview_gallery]
        )

        clear_btn.click(
            clear_all_outputs,
            outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
        )
    return demo

if __name__ == "__main__":
    demo = create_gradio_interface()

    demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)