prithivMLmods committed
Commit 9c6a0b0 · verified · 1 Parent(s): f4471c3

Delete app.py

Files changed (1)
  1. app.py +0 -512
app.py DELETED
@@ -1,512 +0,0 @@
- import spaces
- import json
- import math
- import os
- import traceback
- from io import BytesIO
- from typing import Any, Dict, List, Optional, Tuple
- import re
- import time
- from threading import Thread
- from io import BytesIO
- import uuid
- import tempfile
-
- import gradio as gr
- import requests
- import torch
- from PIL import Image
- import fitz
- import numpy as np
- import torchvision.transforms as T
- from torchvision.transforms.functional import InterpolationMode
-
-
- from transformers import (
-     Qwen2_5_VLForConditionalGeneration,
-     Qwen2VLForConditionalGeneration,
-     AutoModelForCausalLM,
-     AutoModelForVision2Seq,
-     AutoModelForImageTextToText,
-     AutoModel,
-     AutoProcessor,
-     TextIteratorStreamer,
-     AutoTokenizer,
-     LlavaOnevisionForConditionalGeneration,
-     LlavaOnevisionProcessor,
- )
-
- from transformers.image_utils import load_image as hf_load_image
-
- from reportlab.lib.pagesizes import A4
- from reportlab.lib.styles import getSampleStyleSheet
- from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
- from reportlab.lib.units import inch
-
- # --- Constants and Model Setup ---
- MAX_INPUT_TOKEN_LENGTH = 4096
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
- print("torch.__version__ =", torch.__version__)
- print("torch.version.cuda =", torch.version.cuda)
- print("cuda available:", torch.cuda.is_available())
- print("cuda device count:", torch.cuda.device_count())
- if torch.cuda.is_available():
-     print("current device:", torch.cuda.current_device())
-     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
-
- print("Using device:", device)
-
- # --- InternVL3_5-2B-MPO Preprocessing Functions ---
- IMAGENET_MEAN = (0.485, 0.456, 0.406)
- IMAGENET_STD = (0.229, 0.224, 0.225)
-
- def build_transform(input_size):
-     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-     transform = T.Compose([
-         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
-         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-         T.ToTensor(),
-         T.Normalize(mean=MEAN, std=STD)
-     ])
-     return transform
-
- def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-     best_ratio_diff = float('inf')
-     best_ratio = (1, 1)
-     area = width * height
-     for ratio in target_ratios:
-         target_aspect_ratio = ratio[0] / ratio[1]
-         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-         if ratio_diff < best_ratio_diff:
-             best_ratio_diff = ratio_diff
-             best_ratio = ratio
-         elif ratio_diff == best_ratio_diff:
-             if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                 best_ratio = ratio
-     return best_ratio
-
- def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
-     orig_width, orig_height = image.size
-     aspect_ratio = orig_width / orig_height
-
-     target_ratios = set(
-         (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-         i * j <= max_num and i * j >= min_num)
-     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-     target_aspect_ratio = find_closest_aspect_ratio(
-         aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-     target_width = image_size * target_aspect_ratio[0]
-     target_height = image_size * target_aspect_ratio[1]
-     blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-     resized_img = image.resize((target_width, target_height))
-     processed_images = []
-     for i in range(blocks):
-         box = (
-             (i % (target_width // image_size)) * image_size,
-             (i // (target_width // image_size)) * image_size,
-             ((i % (target_width // image_size)) + 1) * image_size,
-             ((i // (target_width // image_size)) + 1) * image_size
-         )
-         split_img = resized_img.crop(box)
-         processed_images.append(split_img)
-     assert len(processed_images) == blocks
-     if use_thumbnail and len(processed_images) != 1:
-         thumbnail_img = image.resize((image_size, image_size))
-         processed_images.append(thumbnail_img)
-     return processed_images
-
- def load_image_internvl(image, input_size=448, max_num=12):
-     transform = build_transform(input_size=input_size)
-     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
-     pixel_values = [transform(img) for img in images]
-     pixel_values = torch.stack(pixel_values)
-     return pixel_values
-
- # --- Model Loading ---
- MODEL_ID_M = "LiquidAI/LFM2-VL-450M"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_M, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- MODEL_ID_T = "LiquidAI/LFM2-VL-1.6B"
- processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
- model_t = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_T, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- MODEL_ID_C = "HuggingFaceTB/SmolVLM-Instruct-250M"
- processor_c = AutoProcessor.from_pretrained(MODEL_ID_C, trust_remote_code=True)
- model_c = AutoModelForVision2Seq.from_pretrained(
-     MODEL_ID_C, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
- ).to(device).eval()
-
- MODEL_ID_G = "echo840/MonkeyOCR-pro-1.2B"
- SUBFOLDER = "Recognition"
- processor_g = AutoProcessor.from_pretrained(
-     MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER
- )
- model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_G, trust_remote_code=True, subfolder=SUBFOLDER, torch_dtype=torch.float16
- ).to(device).eval()
-
- MODEL_ID_I = "UCSC-VLAA/VLAA-Thinker-Qwen2VL-2B"
- processor_i = AutoProcessor.from_pretrained(MODEL_ID_I, trust_remote_code=True)
- model_i = Qwen2VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_I, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- MODEL_ID_A = "nanonets/Nanonets-OCR-s"
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
- model_a = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_A, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- MODEL_ID_X = "prithivMLmods/Megalodon-OCR-Sync-0713"
- processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
- model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_X, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- # --- Moondream2 Model Loading ---
- MODEL_ID_MD = "vikhyatk/moondream2"
- REVISION_MD = "2025-06-21"
- moondream = AutoModelForCausalLM.from_pretrained(
-     MODEL_ID_MD,
-     revision=REVISION_MD,
-     trust_remote_code=True,
-     torch_dtype=torch.float16,
-     device_map={"": "cuda"},
- )
- tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD, revision=REVISION_MD)
-
- # --- Qwen2.5-VL-3B-Abliterated-Caption-it ---
- MODEL_ID_N = "prithivMLmods/Qwen2.5-VL-3B-Abliterated-Caption-it"
- processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
- model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_N, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- # --- LMM-R1-MGT-PerceReason ---
- MODEL_ID_F = "VLM-Reasoner/LMM-R1-MGT-PerceReason"
- processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
- model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_F, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- # TencentBAC/TBAC-VLR1-3B
- MODEL_ID_J = "TencentBAC/TBAC-VLR1-3B"
- processor_j = AutoProcessor.from_pretrained(MODEL_ID_J, trust_remote_code=True)
- model_j = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_J, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- # OCRFlux-3B
- MODEL_ID_V = "ChatDOC/OCRFlux-3B"
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_V, trust_remote_code=True, torch_dtype=torch.float16
- ).to(device).eval()
-
- MODEL_ID_O = "HuggingFaceTB/SmolVLM-500M-Instruct"
- processor_o = AutoProcessor.from_pretrained(MODEL_ID_O, trust_remote_code=True)
- model_o = AutoModelForVision2Seq.from_pretrained(
-     MODEL_ID_O, trust_remote_code=True, torch_dtype=torch.float16, _attn_implementation="flash_attention_2"
- ).to(device).eval()
-
- # --- New Model: llava-onevision ---
- MODEL_ID_LO = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
- processor_lo = LlavaOnevisionProcessor.from_pretrained(MODEL_ID_LO)
- model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
-     MODEL_ID_LO,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # --- OpenGVLab/InternVL3_5-2B-MPO ---
- MODEL_ID_IV = 'OpenGVLab/InternVL3_5-2B-MPO'
- model_iv = AutoModel.from_pretrained(
-     MODEL_ID_IV,
-     torch_dtype=torch.bfloat16,
-     trust_remote_code=True,
-     device_map="auto").eval()
- tokenizer_iv = AutoTokenizer.from_pretrained(MODEL_ID_IV, trust_remote_code=True, use_fast=False)
-
-
- # --- PDF Generation and Preview Utility Function ---
- def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
-     """
-     Generates a PDF, saves it, and then creates image previews of its pages.
-     Returns the path to the PDF and a list of paths to the preview images.
-     """
-     if image is None or not text_content or not text_content.strip():
-         raise gr.Error("Cannot generate PDF. Image or text content is missing.")
-
-     # --- 1. Generate the PDF ---
-     temp_dir = tempfile.gettempdir()
-     pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
-     doc = SimpleDocTemplate(
-         pdf_filename,
-         pagesize=A4,
-         rightMargin=inch, leftMargin=inch,
-         topMargin=inch, bottomMargin=inch
-     )
-     styles = getSampleStyleSheet()
-     style_normal = styles["Normal"]
-     style_normal.fontSize = int(font_size)
-     style_normal.leading = int(font_size) * line_spacing
-     style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]
-
-     story = []
-
-     img_buffer = BytesIO()
-     image.save(img_buffer, format='PNG')
-     img_buffer.seek(0)
-
-     page_width, _ = A4
-     available_width = page_width - 2 * inch
-     image_widths = {
-         "Small": available_width * 0.3,
-         "Medium": available_width * 0.6,
-         "Large": available_width * 0.9,
-     }
-     img_width = image_widths[image_size]
-     img = RLImage(img_buffer, width=img_width, height=image.height * (img_width / image.width))
-     story.append(img)
-     story.append(Spacer(1, 12))
-
-     cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")
-     text_paragraphs = cleaned_text.split('\n')
-
-     for para in text_paragraphs:
-         if para.strip():
-             story.append(Paragraph(para, style_normal))
-
-     doc.build(story)
-
-     # --- 2. Render PDF pages as images for preview ---
-     preview_images = []
-     try:
-         pdf_doc = fitz.open(pdf_filename)
-         for page_num in range(len(pdf_doc)):
-             page = pdf_doc.load_page(page_num)
-             pix = page.get_pixmap(dpi=150)
-             preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
-             pix.save(preview_img_path)
-             preview_images.append(preview_img_path)
-         pdf_doc.close()
-     except Exception as e:
-         print(f"Error generating PDF preview: {e}")
-
-     return pdf_filename, preview_images
-
-
- # --- Core Application Logic ---
- @spaces.GPU
- def process_document_stream(
-     model_name: str,
-     image: Image.Image,
-     prompt_input: str,
-     max_new_tokens: int,
-     temperature: float,
-     top_p: float,
-     top_k: int,
-     repetition_penalty: float
- ):
-     """
-     Main generator function that handles model inference tasks with advanced generation parameters.
-     """
-     if image is None:
-         yield "Please upload an image.", ""
-         return
-     if not prompt_input or not prompt_input.strip():
-         yield "Please enter a prompt.", ""
-         return
-
-     # --- Special Handling for Moondream2 ---
-     if model_name == "Moondream2(vision)":
-         image_embeds = moondream.encode_image(image)
-         answer = moondream.answer_question(
-             image_embeds=image_embeds,
-             question=prompt_input,
-             tokenizer=tokenizer_md
-         )
-         yield answer, answer
-         return
-
-     # --- Special Handling for InternVL ---
-     if model_name == "OpenGVLab/InternVL3_5-2B-MPO":
-         pixel_values = load_image_internvl(image, max_num=12).to(torch.bfloat16).to(device)
-         generation_config = dict(
-             max_new_tokens=max_new_tokens,
-             do_sample=True if temperature > 0 else False,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-         )
-         question = f"<image>\n{prompt_input}"
-         response = model_iv.chat(tokenizer_iv, pixel_values, question, generation_config)
-         yield response, response
-         return
-
-
-     processor = None
-     model = None
-
-     # --- Special Handling for Llava-OneVision ---
-     if model_name == "llava-onevision-qwen2-0.5b-ov-hf(mini)":
-         processor, model = processor_lo, model_lo
-         prompt = f"<|im_start|>user <image>\n{prompt_input}<|im_end|><|im_start|>assistant"
-         inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch.float16)
-     # --- Generic Handling for all other models ---
-     else:
-         if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
-         elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
-         elif model_name == "SmolVLM-Instruct-250M(smol)": processor, model = processor_c, model_c
-         elif model_name == "MonkeyOCR-pro-1.2B(ocr)": processor, model = processor_g, model_g
-         elif model_name == "VLAA-Thinker-Qwen2VL-2B(reason)": processor, model = processor_i, model_i
-         elif model_name == "Nanonets-OCR-s(ocr)": processor, model = processor_a, model_a
-         elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
-         elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
-         elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
-         elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_j, model_j
-         elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
-         elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
-         else:
-             yield "Invalid model selected.", ""
-             return
-
-         messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
-         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
-
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
-     generation_kwargs = {
-         **inputs,
-         "streamer": streamer,
-         "max_new_tokens": max_new_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "top_k": top_k,
-         "repetition_penalty": repetition_penalty,
-         "do_sample": True
-     }
-
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
-         time.sleep(0.01)
-         yield buffer, buffer
-
-     yield buffer, buffer
-
-
- # --- Gradio UI Definition ---
- def create_gradio_interface():
-     """Builds and returns the Gradio web interface."""
-     css = """
-     .main-container { max-width: 1400px; margin: 0 auto; }
-     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
-     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
-     #gallery { min-height: 400px; }
-     """
-     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
-         gr.HTML("""
-         <div class="title" style="text-align: center">
-             <h1>Tiny VLMs Lab🧪</h1>
-             <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
-                 Tiny VLMs for Image Content Extraction and Understanding
-             </p>
-         </div>
-         """)
-
-         with gr.Row():
-             # Left Column (Inputs)
-             with gr.Column(scale=1):
-                 model_choice = gr.Dropdown(
-                     choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)",
-                              "OpenGVLab/InternVL3_5-2B-MPO", "Megalodon-OCR-Sync-0713(ocr)",
-                              "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)",
-                              "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
-                              "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)",
-                              "SmolVLM-500M-Instruct(smol)", "llava-onevision-qwen2-0.5b-ov-hf(mini)"],
-                     label="Select Model", value="LFM2-VL-450M(fast)"
-                 )
-
-                 prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter the prompt")
-                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
-
-                 with gr.Accordion("Advanced Settings (PDF)", open=False):
-                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
-                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                     top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                     repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-                     gr.Markdown("### PDF Export Settings")
-                     font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
-                     line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
-                     alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Text Alignment")
-                     image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
-
-                 process_btn = gr.Button("🚀 Process Image", variant="primary", elem_classes=["process-button"], size="lg")
-                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-
-             # Right Column (Outputs)
-             with gr.Column(scale=2):
-                 with gr.Tabs() as tabs:
-                     with gr.Tab("📝 Extracted Content"):
-                         raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
-                         with gr.Row():
-                             examples = gr.Examples(
-                                 examples=["examples/1.png", "examples/2.png", "examples/3.png",
-                                           "examples/4.png", "examples/5.png", "examples/6.png"],
-                                 inputs=image_input, label="Examples"
-                             )
-                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Tiny-VLMs-Lab/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
-
-                     with gr.Tab("📰 README.md"):
-                         with gr.Accordion("(Result.md)", open=True):
-                             markdown_output = gr.Markdown()
-
-                     with gr.Tab("📋 PDF Preview"):
-                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
-                         pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
-                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
-
-         # Event Handlers
-         def clear_all_outputs():
-             return None, "", "Raw output will appear here.", "", None, None
-
-         process_btn.click(
-             fn=process_document_stream,
-             inputs=[model_choice, image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-             outputs=[raw_output_stream, markdown_output]
-         )
-
-         generate_pdf_btn.click(
-             fn=generate_and_preview_pdf,
-             inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
-             outputs=[pdf_output_file, pdf_preview_gallery]
-         )
-
-         clear_btn.click(
-             clear_all_outputs,
-             outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
-         )
-     return demo
-
- if __name__ == "__main__":
-     demo = create_gradio_interface()
-
-     demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)