ginipick committed
Commit 45b6f79 · verified · 1 Parent(s): 1fc115f

Update app.py

Files changed (1)
app.py +131 -52
app.py CHANGED
@@ -1,34 +1,35 @@
 import subprocess # 🥲
-
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
+
 import spaces
 import gradio as gr
 import re
-
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
 import torch
 import os
 import json
+import time
 from pydantic import BaseModel
 from typing import Tuple
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from PIL import Image
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
+# ----------------------- Load model and processor ----------------------- #
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2",
     device_map="auto",
 )
-processor = AutoProcessor.from_pretrained(
-    "Qwen/Qwen2.5-VL-7B-Instruct",
-)
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 
+# ----------------------- Pydantic model definition ----------------------- #
 class GeneralRetrievalQuery(BaseModel):
     broad_topical_query: str
     broad_topical_explanation: str
@@ -38,21 +39,15 @@ class GeneralRetrievalQuery(BaseModel):
     visual_element_explanation: str
 
 def extract_json_with_regex(text):
-    # Pattern to match content between code backticks
     pattern = r'```(?:json)?\s*(.+?)\s*```'
-
-    # Find all matches (should typically be one)
     matches = re.findall(pattern, text, re.DOTALL)
-
     if matches:
-        # Return the first match
         return matches[0]
     return None
 
 def get_retrieval_prompt(prompt_name: str) -> Tuple[str, GeneralRetrievalQuery]:
     if prompt_name != "general":
         raise ValueError("Only 'general' prompt is available in this version")
-
     prompt = """You are an AI assistant specialized in document retrieval tasks. Given an image of a document page, your task is to generate retrieval queries that someone might use to find this document in a large corpus.
 
 Please generate 3 different types of retrieval queries:
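
For a quick sanity check, here is a minimal, self-contained sketch of how the `extract_json_with_regex` helper above behaves; the sample model reply is illustrative:

```python
import json
import re

def extract_json_with_regex(text):
    # Capture the contents of the first ```-fenced block (optionally tagged "json")
    pattern = r'```(?:json)?\s*(.+?)\s*```'
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return matches[0]
    return None

reply = 'Sure!\n```json\n{"broad_topical_query": "IPCC land use report"}\n```'
print(json.loads(extract_json_with_regex(reply)))
# -> {'broad_topical_query': 'IPCC land use report'}
```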
@@ -85,32 +80,25 @@ Here is the document image to analyze:
 <image>
 
 Generate the queries based on this image and provide the response in the specified JSON format."""
-
     return prompt, GeneralRetrievalQuery
 
-# defined like this so we can later add more prompting options
 prompt, pydantic_model = get_retrieval_prompt("general")
 
+# ----------------------- Input preprocessing ----------------------- #
 def _prep_data_for_input(image):
     messages = [
         {
             "role": "user",
             "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
+                {"type": "image", "image": image},
                 {"type": "text", "text": prompt},
             ],
         }
     ]
-
     text = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
    )
-
     image_inputs, video_inputs = process_vision_info(messages)
-
     return processor(
         text=[text],
         images=image_inputs,
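
For orientation, the "specified JSON format" the prompt refers to mirrors the `GeneralRetrievalQuery` schema. A sketch of the expected shape with invented values; the `specific_detail_*` field names are inferred from the three query types and are not visible in this diff:

```python
# Hypothetical example of a well-formed model response (values invented).
expected_shape = {
    "broad_topical_query": "climate report land use",
    "broad_topical_explanation": "The page discusses land-related climate findings.",
    "specific_detail_query": "greenhouse gas emissions percentage",  # field name inferred
    "specific_detail_explanation": "The page quotes a specific emissions figure.",  # field name inferred
    "visual_element_query": "emissions bar chart",
    "visual_element_explanation": "The page contains a chart of emissions by sector.",
}
```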
@@ -119,17 +107,40 @@ def _prep_data_for_input(image):
         return_tensors="pt",
     )
 
+# ----------------------- Output formatting helper ----------------------- #
+def format_output(data: dict, output_format: str) -> str:
+    """
+    data: the parsed JSON dictionary
+    output_format: one of "JSON", "Markdown", or "Table"
+    """
+    if output_format == "JSON":
+        return json.dumps(data, indent=2, ensure_ascii=False)
+    elif output_format == "Markdown":
+        # Render each field as a Markdown paragraph
+        md_lines = []
+        for key, value in data.items():
+            md_lines.append(f"**{key.replace('_', ' ').title()}:** {value}")
+        return "\n\n".join(md_lines)
+    elif output_format == "Table":
+        # Render as a simple Markdown table
+        headers = ["Field", "Content"]
+        separator = "|".join(["---"] * len(headers))
+        rows = [f"| {' | '.join(headers)} |", f"|{separator}|"]
+        for key, value in data.items():
+            rows.append(f"| {key.replace('_', ' ').title()} | {value} |")
+        return "\n".join(rows)
+    else:
+        return json.dumps(data, indent=2, ensure_ascii=False)
+
+# ----------------------- Response generation ----------------------- #
 @spaces.GPU
-def generate_response(image):
+def generate_response(image, output_format: str = "JSON"):
     inputs = _prep_data_for_input(image)
     inputs = inputs.to("cuda")
-
     generated_ids = model.generate(**inputs, max_new_tokens=200)
     generated_ids_trimmed = [
-        out_ids[len(in_ids) :]
-        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
-
     output_text = processor.batch_decode(
         generated_ids_trimmed,
         skip_special_tokens=True,
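
To make the new output options concrete, a short usage sketch of `format_output` (assumes the function above is in scope; values are illustrative):

```python
sample = {
    "broad_topical_query": "solar panel efficiency",
    "specific_detail_query": "22% module efficiency",
}

print(format_output(sample, "Markdown"))
# **Broad Topical Query:** solar panel efficiency
#
# **Specific Detail Query:** 22% module efficiency

print(format_output(sample, "Table"))
# | Field | Content |
# |---|---|
# | Broad Topical Query | solar panel efficiency |
# | Specific Detail Query | 22% module efficiency |
```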
@@ -137,32 +148,27 @@ def generate_response(image):
     )[0]
 
     try:
-        # Try to extract JSON from code block first
         json_str = extract_json_with_regex(output_text)
         if json_str:
             parsed = json.loads(json_str)
-            return json.dumps(parsed, indent=2)
-        # If no code block found, try direct JSON parsing
+            return format_output(parsed, output_format)
         parsed = json.loads(output_text)
-        return json.dumps(parsed, indent=2)
+        return format_output(parsed, output_format)
     except Exception:
         gr.Warning("Failed to parse JSON from output")
         return output_text
 
-title = "ColPali Query Generator using Qwen2.5-VL"
-description = """[ColPali](https://huggingface.co/papers/2407.01449) is a very exciting new approach to multimodal document retrieval which aims to replace existing document retrievers, which often rely on an OCR step, with an end-to-end multimodal approach.
-
-To train or fine-tune a ColPali model, we need a dataset of image-text pairs which represent the document images and the relevant text queries which those documents should match.
-To make the ColPali models work even better we might want a dataset of query/image document pairs related to our domain or task.
-
-One way in which we might go about generating such a dataset is to use a VLM to generate synthetic queries for us.
-This space uses the [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) VLM model to generate queries for a document, based on an input document image.
+# ----------------------- Interface title and description ----------------------- #
+title = "Elegant ColPali Query Generator using Qwen2.5-VL"
+description = """**ColPali** is a multimodal approach optimized for document retrieval.
+This interface uses the [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model to generate relevant retrieval queries from a document image.
 
-**Note** there is a lot of scope for improving the prompts and the quality of the generated queries! If you have any suggestions for improvements please [open a Discussion](https://huggingface.co/spaces/davanstrien/ColPali-Query-Generator/discussions/new)!
+- **Broad Topical Query:** a query covering the document's main topic
+- **Specific Detail Query:** a query containing a specific fact or figure from the document
+- **Visual Element Query:** a query based on a visual element of the document (e.g. a chart or graph)
 
-This [blog post](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html) gives an overview of how you can use this kind of approach to generate a full dataset for fine-tuning ColPali models.
-
-If you want to convert a PDF(s) to a dataset of page images you can try out the [PDFs to Page Images Converter](https://huggingface.co/spaces/Dataset-Creation-Tools/pdf-to-page-images-dataset) Space.
+Try the examples below to generate queries suited to your document image.
+See this [blog post](https://danielvanstrien.xyz/posts/post-with-code/colpali/2024-09-23-generate_colpali_dataset.html) for more details.
 """
 
 examples = [
@@ -170,12 +176,85 @@ examples = [
     "examples/SRCCL_Technical-Summary.pdf_page_7.jpg",
 ]
 
-demo = gr.Interface(
-    fn=generate_response,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Text(),
-    title=title,
-    description=description,
-    examples=examples,
-)
-demo.launch()
+# ----------------------- Custom CSS ----------------------- #
+custom_css = """
+body {
+    background: #f7f9fb;
+    font-family: 'Segoe UI', sans-serif;
+    color: #333;
+}
+header {
+    text-align: center;
+    padding: 20px;
+    margin-bottom: 20px;
+}
+header h1 {
+    font-size: 3em;
+    color: #2c3e50;
+}
+.gradio-container {
+    padding: 20px;
+}
+.gr-button {
+    background-color: #3498db !important;
+    color: #fff !important;
+    border: none !important;
+    padding: 10px 20px !important;
+    border-radius: 5px !important;
+    font-size: 1em !important;
+}
+.gr-button:hover {
+    background-color: #2980b9 !important;
+}
+.gr-gallery-item {
+    border-radius: 10px;
+    overflow: hidden;
+    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+}
+footer {
+    text-align: center;
+    padding: 20px 0;
+    font-size: 0.9em;
+    color: #555;
+}
+"""
+
+# ----------------------- Gradio interface ----------------------- #
+with gr.Blocks(css=custom_css, title=title) as demo:
+    with gr.Column(variant="panel"):
+        gr.Markdown(f"<header><h1>{title}</h1></header>")
+        gr.Markdown(description)
+
+    with gr.Tabs():
+        with gr.TabItem("Query Generation"):
+            gr.Markdown("### Generate Retrieval Queries from a Document Image")
+            with gr.Row():
+                image_input = gr.Image(label="Upload Document Image", type="pil")
+            with gr.Row():
+                # Output format selection option
+                output_format = gr.Radio(
+                    choices=["JSON", "Markdown", "Table"],
+                    value="JSON",
+                    label="Output Format",
+                    info="Select the desired output format."
+                )
+                generate_button = gr.Button("Generate Query")
+            output_text = gr.Textbox(label="Generated Query", lines=10)
+            with gr.Accordion("Examples", open=False):
+                gr.Examples(
+                    label="Query Examples",
+                    examples=[
+                        "examples/Approche_no_13_1977.pdf_page_22.jpg",
+                        "examples/SRCCL_Technical-Summary.pdf_page_7.jpg",
+                    ],
+                    inputs=image_input,
+                )
+            generate_button.click(
+                fn=generate_response,
+                inputs=[image_input, output_format],
+                outputs=output_text
+            )
+
+    gr.Markdown("<footer>Join our community on <a href='https://discord.gg/openfreeai' target='_blank'>Discord</a></footer>")
+
+demo.launch()
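
As an end-to-end usage sketch, the updated `generate_response` can be smoke-tested without the UI (assumes a CUDA GPU, the model loaded above, and the example images bundled with the Space):

```python
from PIL import Image

page = Image.open("examples/SRCCL_Technical-Summary.pdf_page_7.jpg")
for fmt in ["JSON", "Markdown", "Table"]:
    # output_format is the new parameter wired to the gr.Radio control
    print(generate_response(page, output_format=fmt))
```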