Spaces: Running on Zero
Update app.py

app.py CHANGED
@@ -58,7 +58,7 @@ if torch.cuda.is_available():
 
 print("Using device:", device)
 
-# ---
+# --- InternVL3_5-2B-MPO Preprocessing Functions ---
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
 
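Side note on the hunk above: IMAGENET_MEAN and IMAGENET_STD are the standard ImageNet normalization statistics, which InternVL's published preprocessing recipe plugs into a torchvision transform. A minimal sketch of that recipe, assuming the app follows the InternVL model card (build_transform is the model card's helper name, not necessarily this file's):

import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size=448):
    # Square-resize to the ViT input size, then normalize with the ImageNet stats.
    return T.Compose([
        T.Lambda(lambda img: img.convert("RGB")),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])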
@@ -200,10 +200,10 @@ model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 # TencentBAC/TBAC-VLR1-3B
-
-
-
-
+MODEL_ID_G = "TencentBAC/TBAC-VLR1-3B"
+processor_g = AutoProcessor.from_pretrained(MODEL_ID_G, trust_remote_code=True)
+model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_G, trust_remote_code=True, torch_dtype=torch.float16
 ).to(device).eval()
 
 # OCRFlux-3B
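Because TBAC-VLR1-3B is loaded through the Qwen2.5-VL classes, inference follows the standard Qwen2.5-VL chat-template flow. A hedged sketch of how processor_g and model_g (defined in the hunk above) would be called for one image and prompt; the variable names and prompt text here are illustrative, not taken from app.py:

import torch
from PIL import Image

image = Image.open("page.png")
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this document."},
    ],
}]
# Render the chat template, then tokenize the text and image together.
prompt = processor_g.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor_g(text=[prompt], images=[image], return_tensors="pt").to(model_g.device)
with torch.inference_mode():
    output_ids = model_g.generate(**inputs, max_new_tokens=512)
print(processor_g.batch_decode(output_ids, skip_special_tokens=True)[0])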
@@ -227,26 +227,15 @@ model_lo = LlavaOnevisionForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-#
+# OpenGVLab/InternVL3_5-2B-MPO ---
 MODEL_ID_IV = 'OpenGVLab/InternVL3_5-2B-MPO'
 model_iv = AutoModel.from_pretrained(
     MODEL_ID_IV,
     torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
     trust_remote_code=True,
     device_map="auto").eval()
 tokenizer_iv = AutoTokenizer.from_pretrained(MODEL_ID_IV, trust_remote_code=True, use_fast=False)
 
-# --- New Model: OpenGVLab/InternVL3_5-1B-MPO ---
-MODEL_ID_IV_1B = 'OpenGVLab/InternVL3_5-1B-MPO'
-model_iv_1b = AutoModel.from_pretrained(
-    MODEL_ID_IV_1B,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    trust_remote_code=True,
-    device_map="auto").eval()
-tokenizer_iv_1b = AutoTokenizer.from_pretrained(MODEL_ID_IV_1B, trust_remote_code=True, use_fast=False)
-
 
 # --- PDF Generation and Preview Utility Function ---
 def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
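The InternVL branch in the next hunks calls load_image_internvl(image, max_num=12). In the InternVL model card this corresponds to dynamic-resolution tiling: the image is cut into up to max_num 448x448 tiles plus a global thumbnail, and the transformed tiles are stacked into pixel_values. A simplified sketch of that idea, reusing build_transform from the earlier sketch (the official recipe picks the tile grid by an aspect-ratio search, so treat this as an approximation):

import torch

def load_image_internvl(image, input_size=448, max_num=12):
    # Choose a tile grid no larger than max_num tiles, resize to fit it exactly,
    # then transform each tile and stack into a (num_tiles, 3, H, W) tensor.
    transform = build_transform(input_size)
    w, h = image.size
    cols = max(1, min(max_num, round(w / input_size)))
    rows = max(1, min(max_num // cols, round(h / input_size)))
    resized = image.resize((cols * input_size, rows * input_size))
    tiles = [resized.crop((c * input_size, r * input_size,
                           (c + 1) * input_size, (r + 1) * input_size))
             for r in range(rows) for c in range(cols)]
    if len(tiles) > 1:
        tiles.append(image.resize((input_size, input_size)))  # global thumbnail
    return torch.stack([transform(tile) for tile in tiles])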
@@ -349,10 +338,8 @@ def process_document_stream(
         yield answer, answer
         return
 
-    # --- Special Handling for InternVL
-    if model_name
-        model_to_use, tokenizer_to_use = (model_iv, tokenizer_iv) if model_name == "OpenGVLab/InternVL3_5-2B-MPO(reason)" else (model_iv_1b, tokenizer_iv_1b)
-
+    # --- Special Handling for InternVL ---
+    if model_name == "OpenGVLab/InternVL3_5-2B-MPO":
         pixel_values = load_image_internvl(image, max_num=12).to(torch.bfloat16).to(device)
         generation_config = dict(
             max_new_tokens=max_new_tokens,
@@ -363,10 +350,11 @@ def process_document_stream(
             repetition_penalty=repetition_penalty,
         )
         question = f"<image>\n{prompt_input}"
-        response =
+        response = model_iv.chat(tokenizer_iv, pixel_values, question, generation_config)
         yield response, response
         return
 
+
     processor = None
     model = None
 
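The .chat() call above is InternVL's own convenience method, shipped in the model's trust_remote_code modeling file; it runs one complete (non-streaming) generation, which is why this branch yields the finished response once and returns rather than streaming tokens. Per the InternVL model card the same method also supports multi-turn use (hedged: verify the exact signature in this InternVL3_5 release):

# First turn returns the reply plus a history object to thread through later turns.
response, history = model_iv.chat(tokenizer_iv, pixel_values, question,
                                  generation_config, history=None, return_history=True)
follow_up = "Summarize the extracted text in one sentence."  # illustrative prompt
response, history = model_iv.chat(tokenizer_iv, pixel_values, follow_up,
                                  generation_config, history=history, return_history=True)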
@@ -386,7 +374,7 @@ def process_document_stream(
     elif model_name == "Megalodon-OCR-Sync-0713(ocr)": processor, model = processor_x, model_x
     elif model_name == "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": processor, model = processor_n, model_n
     elif model_name == "LMM-R1-MGT-PerceReason(reason)": processor, model = processor_f, model_f
-    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model =
+    elif model_name == "TBAC-VLR1-3B(open-r1)": processor, model = processor_g, model_g
     elif model_name == "OCRFlux-3B(ocr)": processor, model = processor_v, model_v
     elif model_name == "SmolVLM-500M-Instruct(smol)": processor, model = processor_o, model_o
     else:
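Design note on the hunk above: every branch of the elif chain pairs a dropdown label with a (processor, model) tuple, so the dispatch could equivalently be a dict lookup, which keeps label typos from silently falling through to the else branch. A sketch using only the pairs visible in this hunk (the real app registers more models):

MODEL_REGISTRY = {
    "Megalodon-OCR-Sync-0713(ocr)": (processor_x, model_x),
    "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)": (processor_n, model_n),
    "LMM-R1-MGT-PerceReason(reason)": (processor_f, model_f),
    "TBAC-VLR1-3B(open-r1)": (processor_g, model_g),
    "OCRFlux-3B(ocr)": (processor_v, model_v),
    "SmolVLM-500M-Instruct(smol)": (processor_o, model_o),
}

def resolve_model(model_name):
    # A KeyError here plays the role of the chain's final else branch.
    return MODEL_REGISTRY[model_name]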
@@ -447,7 +435,7 @@ def create_gradio_interface():
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
                 choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)",
-                         "OpenGVLab/InternVL3_5-2B-MPO
+                         "OpenGVLab/InternVL3_5-2B-MPO", "Megalodon-OCR-Sync-0713(ocr)",
                          "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)",
                          "Qwen2.5-VL-3B-Abliterated-Caption-it(caption)", "Nanonets-OCR-s(ocr)",
                          "LMM-R1-MGT-PerceReason(reason)", "OCRFlux-3B(ocr)", "TBAC-VLR1-3B(open-r1)",
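For orientation, the dropdown's value reaches process_document_stream as its model_name argument, and the handler's successive yields drive two outputs (it always yields a pair). A hypothetical wiring sketch; apart from model_choice and process_document_stream, every component name below is assumed rather than taken from app.py:

process_btn.click(
    fn=process_document_stream,
    inputs=[model_choice, image_input, prompt_input,
            max_new_tokens, temperature, top_p, top_k, repetition_penalty],
    outputs=[raw_output, formatted_output],  # the handler yields (raw, formatted) pairs
)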