Robo-Beam

Running

App Files Files Community

seawolf2357 committed on Mar 16

Commit

d538cf7

1 Parent(s): d48cdf4

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -11

app.py CHANGED Viewed

@@ -14,6 +14,9 @@ from loguru import logger
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
@@ -48,10 +51,20 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
-    new_image_count, new_video_count = count_files_in_new_message(message["files"])
     history_image_count, history_video_count = count_files_in_history(history)
     image_count = history_image_count + new_image_count
     video_count = history_video_count + new_video_count
     if video_count > 1:
         gr.Warning("Only one video is supported.")
         return False
@@ -63,12 +76,21 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool:
             gr.Warning("Using <image> tags with video files is not supported.")
             return False
         # TODO: Add frame count validation for videos similar to image count limits  # noqa: FIX002, TD002, TD003
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
-    if "<image>" in message["text"] and message["text"].count("<image>") != new_image_count:
-        gr.Warning("The number of <image> tags in the text does not match the number of images.")
-        return False
     return True
@@ -127,20 +149,65 @@ def process_interleaved_images(message: dict) -> list[dict]:
     return content
 def process_new_user_message(message: dict) -> list[dict]:
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
-    if message["files"][0].endswith(".mp4"):
-        return [{"type": "text", "text": message["text"]}, *process_video(message["files"][0])]
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
-    return [
-        {"type": "text", "text": message["text"]},
-        *[{"type": "image", "url": path} for path in message["files"]],
-    ]
 def process_history(history: list[dict]) -> list[dict]:
@@ -323,13 +390,20 @@ DESCRIPTION = """\
 This is a demo of Gemma 3 27B it, a vision language model with outstanding performance on a wide range of tasks.
 You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
 """
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
-    textbox=gr.MultimodalTextbox(file_types=["image", ".mp4"], file_count="multiple", autofocus=True),
     multimodal=True,
     additional_inputs=[
         gr.Textbox(label="System Prompt", value="You are a helpful assistant."),

 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
+# [PDF] PyPDF2 추가
+import PyPDF2
 model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
 processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
 model = Gemma3ForConditionalGeneration.from_pretrained(
 def validate_media_constraints(message: dict, history: list[dict]) -> bool:
+    """
+    이미지/비디오 개수와 혼합 여부 등을 검사하는 함수.
+    PDF는 검사 로직에서 제외하여 업로드만 허용.
+    """
+    # [PDF] PDF 파일 제외 처리
+    pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
+    non_pdf_files = [f for f in message["files"] if not f.endswith(".pdf")]
+    # 기존 로직은 non_pdf_files(= 이미지/비디오)에 대해서만 체크
+    new_image_count, new_video_count = count_files_in_new_message(non_pdf_files)
     history_image_count, history_video_count = count_files_in_history(history)
     image_count = history_image_count + new_image_count
     video_count = history_video_count + new_video_count
     if video_count > 1:
         gr.Warning("Only one video is supported.")
         return False
             gr.Warning("Using <image> tags with video files is not supported.")
             return False
         # TODO: Add frame count validation for videos similar to image count limits  # noqa: FIX002, TD002, TD003
     if video_count == 0 and image_count > MAX_NUM_IMAGES:
         gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         return False
+    # [PDF] PDF 갯수 제한(필요하다면)도 추가 가능
+    # 일단 제한은 두지 않고 바로 True 반환
+    # <image> 태그가 있을 경우, 이미지 개수와 매칭 검사
+    if "<image>" in message["text"]:
+        # new_image_count는 pdf 제외된 이미지 수
+        if message["text"].count("<image>") != new_image_count:
+            gr.Warning("The number of <image> tags in the text does not match the number of images.")
+            return False
     return True
     return content
# [PDF] Helper that converts a PDF file into Markdown text
def pdf_to_markdown(pdf_path: str) -> str:
    """Extract the text of a PDF and return it as simple Markdown.

    Every page whose extracted text is non-empty after stripping becomes a
    ``## Page N`` section (N is the 1-based page number) followed by the page
    text. Pages that yield no text are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page sections joined by newlines, or an empty string when no page
        yields any text.
    """
    with open(pdf_path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # extract_text() may return None or ""; normalise both to "" before stripping.
        numbered_text = (
            (number, (page.extract_text() or "").strip())
            for number, page in enumerate(pages, start=1)
        )
        # Consume the generator while the file is still open.
        sections = [
            f"## Page {number}\n\n{body}\n"
            for number, body in numbered_text
            if body
        ]
    return "\n".join(sections)
 def process_new_user_message(message: dict) -> list[dict]:
+    """
+    새 user message에서 text, 파일(이미지/비디오/PDF)을 처리.
+    """
     if not message["files"]:
         return [{"type": "text", "text": message["text"]}]
+    # [PDF] PDF 파일 목록
+    pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
+    # 이미지·비디오 목록
+    other_files = [f for f in message["files"] if not f.endswith(".pdf")]
+    # 일단 사용자의 text를 가장 먼저 넣는다
+    content_list = [{"type": "text", "text": message["text"]}]
+    # PDF 변환 후 추가
+    for pdf_path in pdf_files:
+        pdf_markdown = pdf_to_markdown(pdf_path)
+        if pdf_markdown.strip():
+            content_list.append({"type": "text", "text": pdf_markdown})
+        else:
+            content_list.append({"type": "text", "text": "(PDF에서 텍스트 추출 실패)"})
+    # 영상이 있는지 확인
+    video_files = [f for f in other_files if f.endswith(".mp4")]
+    if video_files:
+        # 비디오는 한 개만 처리한다는 전제 (validate_media_constraints에서 이미 검사)
+        # 여러 개일 경우 첫 번째 것만 처리하거나, 경고 처리
+        content_list += process_video(video_files[0])
+        return content_list
+    # interleaved 이미지
     if "<image>" in message["text"]:
         return process_interleaved_images(message)
+    # 일반 이미지(여러 장)
+    image_files = [f for f in other_files if not f.endswith(".mp4")]
+    if image_files:
+        content_list += [{"type": "image", "url": path} for path in image_files]
+    return content_list
 def process_history(history: list[dict]) -> list[dict]:
 This is a demo of Gemma 3 27B it, a vision language model with outstanding performance on a wide range of tasks.
 You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
+Also, PDF files are now supported: any uploaded PDF will be converted to Markdown text and passed into the conversation.
 """
+# [PDF] .pdf 허용
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
+    textbox=gr.MultimodalTextbox(
+        file_types=["image", ".mp4", ".pdf"],  # [PDF] 허용
+        file_count="multiple",
+        autofocus=True
+    ),
     multimodal=True,
     additional_inputs=[
         gr.Textbox(label="System Prompt", value="You are a helpful assistant."),