prithivMLmods committed
Commit 6d68214 · verified · 1 Parent(s): c78eb67

Update app.py

Files changed (1): app.py (+12, -24)
app.py CHANGED
@@ -58,7 +58,7 @@ if torch.cuda.is_available():
 
 print("Using device:", device)
 
-# --- InternVL Preprocessing Functions ---
+# --- InternVL3_5-2B-MPO Preprocessing Functions ---
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
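These ImageNet constants feed the InternVL tiling pipeline that `load_image_internvl` (used further down in this diff) wraps. A minimal sketch of that preprocessing, following the recipe on the OpenGVLab model cards; the dynamic tiling is simplified here to a single resized tile, and `load_image_simple` is an illustrative name, not the app's helper:

```python
# Minimal sketch of InternVL-style preprocessing, assuming the model-card
# recipe; the real load_image_internvl also splits large images into up
# to max_num 448x448 tiles before stacking.
import torch
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from PIL import Image

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size: int = 448) -> T.Compose:
    # RGB conversion, bicubic resize to the tile size, ImageNet normalization.
    return T.Compose([
        T.Lambda(lambda img: img.convert("RGB")),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

def load_image_simple(image: Image.Image, input_size: int = 448) -> torch.Tensor:
    # Returns (num_tiles, 3, H, W); here always a single tile.
    return torch.stack([build_transform(input_size)(image)])
```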
@@ -200,10 +200,10 @@ model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 # TencentBAC/TBAC-VLR1-3B
-MODEL_ID_G_BAC = "TencentBAC/TBAC-VLR1-3B" # Renamed to avoid conflict
-processor_g_bac = AutoProcessor.from_pretrained(MODEL_ID_G_BAC, trust_remote_code=True)
-model_g_bac = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_G_BAC, trust_remote_code=True, torch_dtype=torch.float16
+MODEL_ID_G = "TencentBAC/TBAC-VLR1-3B"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
+model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_G, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
 
 # OCRFlux-3B
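For reference, the TBAC-VLR1-3B pair loaded above would be driven with the standard Qwen2.5-VL inference pattern. A hedged sketch; the helper name and generation settings are illustrative, not taken from app.py:

```python
# Illustrative inference sketch for the processor_g / model_g pair,
# following the usual Qwen2.5-VL chat-template flow.
from PIL import Image

def run_qwen25vl(image: Image.Image, prompt: str, max_new_tokens: int = 512) -> str:
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt},
        ],
    }]
    # Render the chat template, then tokenize text and image together.
    text = processor_g.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor_g(text=[text], images=[image], return_tensors="pt").to(model_g.device)
    output_ids = model_g.generate(**inputs, max_new_tokens=max_new_tokens)
    # Strip the prompt tokens before decoding.
    trimmed = output_ids[:, inputs["input_ids"].shape[1]:]
    return processor_g.batch_decode(trimmed, skip_special_tokens=True)[0]
```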
@@ -227,26 +227,15 @@ model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# --- New Model: OpenGVLab/InternVL3_5-2B-MPO ---
+# OpenGVLab/InternVL3_5-2B-MPO ---
 MODEL_ID_IV = 'OpenGVLab/InternVL3_5-2B-MPO'
 model_iv = AutoModel.from_pretrained(
     MODEL_ID_IV,
     torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
     trust_remote_code=True,
     device_map="auto").eval()
 tokenizer_iv = AutoTokenizer.from_pretrained(MODEL_ID_IV, trust_remote_code=True, use_fast=False)
 
-# --- New Model: OpenGVLab/InternVL3_5-1B-MPO ---
-MODEL_ID_IV_1B = 'OpenGVLab/InternVL3_5-1B-MPO'
-model_iv_1b = AutoModel.from_pretrained(
-    MODEL_ID_IV_1B,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True,
-    device_map="auto").eval()
-tokenizer_iv_1b = AutoTokenizer.from_pretrained(MODEL_ID_IV_1B, trust_remote_code=True, use_fast=False)
-
 
 # --- PDF Generation and Preview Utility Function ---
 def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
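A quick way to sanity-check the remaining InternVL load is a one-off `chat` call, mirroring the call this commit keeps in `process_document_stream`; the placeholder image and token budget below are illustrative:

```python
# Smoke test for model_iv, mirroring the model-card chat API used below.
from PIL import Image

img = Image.new("RGB", (448, 448), "white")  # placeholder input
pixel_values = load_image_internvl(img, max_num=12).to(torch.bfloat16).to(device)
question = "<image>\nDescribe this image."
print(model_iv.chat(tokenizer_iv, pixel_values, question, dict(max_new_tokens=64)))
```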
@@ -349,10 +338,8 @@ def process_document_stream(
         yield answer, answer
         return
 
-    # --- Special Handling for InternVL Models ---
-    if model_name in ["OpenGVLab/InternVL3_5-2B-MPO(reason)", "OpenGVLab/InternVL3_5-1B-MPO(reason)"]:
-        model_to_use, tokenizer_to_use = (model_iv, tokenizer_iv) if model_name == "OpenGVLab/InternVL3_5-2B-MPO(reason)" else (model_iv_1b, tokenizer_iv_1b)
-
+    # --- Special Handling for InternVL ---
+    if model_name == "OpenGVLab/InternVL3_5-2B-MPO":
         pixel_values = load_image_internvl(image, max_num=12).to(torch.bfloat16).to(device)
         generation_config = dict(
             max_new_tokens=max_new_tokens,
@@ -363,10 +350,11 @@ def process_document_stream(
             repetition_penalty=repetition_penalty,
         )
         question = f"<image>\n{prompt_input}"
-        response = model_to_use.chat(tokenizer_to_use, pixel_values, question, generation_config)
+        response = model_iv.chat(tokenizer_iv, pixel_values, question, generation_config)
         yield response, response
         return
 
+
     processor = None
     model = None
 
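Note that InternVL's `chat` returns the full string at once, hence the single `yield response, response` before returning; the processor-based branches below can stream token by token instead. An illustrative Transformers streaming pattern, assuming `TextIteratorStreamer` (not copied from app.py):

```python
# Illustrative token-streaming sketch for the processor/model branches.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, processor, inputs, max_new_tokens=512):
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs=dict(**inputs, streamer=streamer,
                       max_new_tokens=max_new_tokens)).start()
    buffer = ""
    for chunk in streamer:
        buffer += chunk
        yield buffer  # Gradio re-renders the textbox on each partial yield
```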
@@ -386,7 +374,7 @@ def process_document_stream(
     elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
     elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
     elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
-    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g_bac, model_g_bac
+    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
     else:
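A design note on this dispatch: the growing `elif` ladder keyed on dropdown labels could be collapsed into a lookup table, which would make renames like `processor_g_bac` back to `processor_g` a one-line change. A hypothetical sketch, not part of this commit:

```python
# Hypothetical registry refactor: one dict keyed by the dropdown label.
MODEL_REGISTRY = {
    "LMM-R1-MGT-PerceReason(reason)": (processor_f, model_f),
    "TBAC-VLR1-3B(open-r1)": (processor_g, model_g),
    "OCRFlux-3B(ocr)": (processor_v, model_v),
    # ...remaining processor/model pairs...
}
try:
    processor, model = MODEL_REGISTRY[model_name]
except KeyError:
    raise ValueError(f"Unknown model selected: {model_name}")
```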
@@ -447,7 +435,7 @@ def create_gradio_interface():
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
                 choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)",
-                         "OpenGVLab/InternVL3_5-2B-MPO(reason)", "OpenGVLab/InternVL3_5-1B-MPO(reason)", "Megalodon-OCR-Sync-0713(ocr)",
+                         "OpenGVLab/InternVL3_5-2B-MPO", "Megalodon-OCR-Sync-0713(ocr)",
                          "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)",
                          "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
                          "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)",
 