Spaces: Running on Zero
Update app.py

app.py CHANGED
@@ -58,7 +58,7 @@ if torch.cuda.is_available():
 
 print("Using device:", device)
 
-# ---
+# --- InternVL3_5-2B-MPO Preprocessing Functions ---
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
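Side note on the hunk above: IMAGENET_MEAN and IMAGENET_STD are the standard ImageNet normalization statistics, which InternVL's published preprocessing recipe plugs into a torchvision transform. A minimal sketch of that recipe, assuming the app follows the InternVL model card (build_transform is the model card's helper name, not necessarily this file's):

import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size=448):
    # Square-resize to the ViT input size, then normalize with the ImageNet stats.
    return T.Compose([
        T.Lambda(lambda img: img.convert("RGB")),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])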
@@ -200,10 +200,10 @@ model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 # TencentBAC/TBAC-VLR1-3B
-
-
-
-
+MODEL_ID_G = "TencentBAC/TBAC-VLR1-3B"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
+model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_G, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
 
 # OCRFlux-3B
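Because TBAC-VLR1-3B is loaded through the Qwen2.5-VL classes, inference follows the standard Qwen2.5-VL chat-template flow. A hedged sketch of how processor_g and model_g (defined in the hunk above) would be called for one image and prompt; the variable names and prompt text here are illustrative, not taken from app.py:

import torch
from PIL import Image

image = Image.open("page.png")
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this document."},
    ],
}]
# Render the chat template, then tokenize the text and image together.
prompt = processor_g.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor_g(text=[prompt], images=[image], return_tensors="pt").to(model_g.device)
with torch.inference_mode():
    output_ids = model_g.generate(**inputs, max_new_tokens=512)
print(processor_g.batch_decode(output_ids, skip_special_tokens=True)[0])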
@@ -227,26 +227,15 @@ model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#
+# OpenGVLab/InternVL3_5-2B-MPO ---
 MODEL_ID_IV = 'OpenGVLab/InternVL3_5-2B-MPO'
 model_iv = AutoModel.from_pretrained(
     MODEL_ID_IV,
     torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
     trust_remote_code=True,
     device_map="auto").eval()
 tokenizer_iv = AutoTokenizer.from_pretrained(MODEL_ID_IV, trust_remote_code=True, use_fast=False)
 
-# --- New Model: OpenGVLab/InternVL3_5-1B-MPO ---
-MODEL_ID_IV_1B = 'OpenGVLab/InternVL3_5-1B-MPO'
-model_iv_1b = AutoModel.from_pretrained(
-    MODEL_ID_IV_1B,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True,
-    device_map="auto").eval()
-tokenizer_iv_1b = AutoTokenizer.from_pretrained(MODEL_ID_IV_1B, trust_remote_code=True, use_fast=False)
-
 
 # --- PDF Generation and Preview Utility Function ---
 def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
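The InternVL branch in the next hunks calls load_image_internvl(image, max_num=12). In the InternVL model card this corresponds to dynamic-resolution tiling: the image is cut into up to max_num 448x448 tiles plus a global thumbnail, and the transformed tiles are stacked into pixel_values. A simplified sketch of that idea, reusing build_transform from the earlier sketch (the official recipe picks the tile grid by an aspect-ratio search, so treat this as an approximation):

import torch

def load_image_internvl(image, input_size=448, max_num=12):
    # Choose a tile grid no larger than max_num tiles, resize to fit it exactly,
    # then transform each tile and stack into a (num_tiles, 3, H, W) tensor.
    transform = build_transform(input_size)
    w, h = image.size
    cols = max(1, min(max_num, round(w / input_size)))
    rows = max(1, min(max_num // cols, round(h / input_size)))
    resized = image.resize((cols * input_size, rows * input_size))
    tiles = [resized.crop((c * input_size, r * input_size,
                           (c + 1) * input_size, (r + 1) * input_size))
             for r in range(rows) for c in range(cols)]
    if len(tiles) > 1:
        tiles.append(image.resize((input_size, input_size)))  # global thumbnail
    return torch.stack([transform(tile) for tile in tiles])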
@@ -349,10 +338,8 @@ def process_document_stream(
         yield answer, answer
         return
 
-    # --- Special Handling for InternVL
-    if model_name
-        model_to_use, tokenizer_to_use = (model_iv, tokenizer_iv) if model_name == "OpenGVLab/InternVL3_5-2B-MPO(reason)" else (model_iv_1b, tokenizer_iv_1b)
-
+    # --- Special Handling for InternVL ---
+    if model_name == "OpenGVLab/InternVL3_5-2B-MPO":
         pixel_values = load_image_internvl(image, max_num=12).to(torch.bfloat16).to(device)
         generation_config = dict(
             max_new_tokens=max_new_tokens,
@@ -363,10 +350,11 @@ def process_document_stream(
             repetition_penalty=repetition_penalty,
         )
         question = f"<image>\n{prompt_input}"
-        response =
+        response = model_iv.chat(tokenizer_iv, pixel_values, question, generation_config)
         yield response, response
         return
 
+
     processor = None
     model = None
 
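The .chat() call above is InternVL's own convenience method, shipped in the model's trust_remote_code modeling file; it runs one complete (non-streaming) generation, which is why this branch yields the finished response once and returns rather than streaming tokens. Per the InternVL model card the same method also supports multi-turn use (hedged: verify the exact signature in this InternVL3_5 release):

# First turn returns the reply plus a history object to thread through later turns.
response, history = model_iv.chat(tokenizer_iv, pixel_values, question,
                                  generation_config, history=None, return_history=True)
follow_up = "Summarize the extracted text in one sentence."  # illustrative prompt
response, history = model_iv.chat(tokenizer_iv, pixel_values, follow_up,
                                  generation_config, history=history, return_history=True)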
@@ -386,7 +374,7 @@ def process_document_stream(
     elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
     elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
     elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
-    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model =
+    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
     else:
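Design note on the hunk above: every branch of the elif chain pairs a dropdown label with a (processor, model) tuple, so the dispatch could equivalently be a dict lookup, which keeps label typos from silently falling through to the else branch. A sketch using only the pairs visible in this hunk (the real app registers more models):

MODEL_REGISTRY = {
    "Megalodon-OCR-Sync-0713(ocr)": (processor_x, model_x),
    "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": (processor_n, model_n),
    "LMM-R1-MGT-PerceReason(reason)": (processor_f, model_f),
    "TBAC-VLR1-3B(open-r1)": (processor_g, model_g),
    "OCRFlux-3B(ocr)": (processor_v, model_v),
    "SmolVLM-500M-Instruct(smol)": (processor_o, model_o),
}

def resolve_model(model_name):
    # A KeyError here plays the role of the chain's final else branch.
    return MODEL_REGISTRY[model_name]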
@@ -447,7 +435,7 @@ def create_gradio_interface():
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
                 choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)",
-                         "OpenGVLab/InternVL3_5-2B-MPO
+                         "OpenGVLab/InternVL3_5-2B-MPO", "Megalodon-OCR-Sync-0713(ocr)",
                          "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)",
                          "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
                          "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)",
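For orientation, the dropdown's value reaches process_document_stream as its model_name argument, and the handler's successive yields drive two outputs (it always yields a pair). A hypothetical wiring sketch; apart from model_choice and process_document_stream, every component name below is assumed rather than taken from app.py:

process_btn.click(
    fn=process_document_stream,
    inputs=[model_choice, image_input, prompt_input,
            max_new_tokens, temperature, top_p, top_k, repetition_penalty],
    outputs=[raw_output, formatted_output],  # the handler yields (raw, formatted) pairs
)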