Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Commit 
							
							·
						
						d538cf7
	
1
								Parent(s):
							
							d48cdf4
								
Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -14,6 +14,9 @@ from loguru import logger | |
| 14 | 
             
            from PIL import Image
         | 
| 15 | 
             
            from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
         | 
| 16 |  | 
|  | |
|  | |
|  | |
| 17 | 
             
            model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
         | 
| 18 | 
             
            processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
         | 
| 19 | 
             
            model = Gemma3ForConditionalGeneration.from_pretrained(
         | 
| @@ -48,10 +51,20 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]: | |
| 48 |  | 
| 49 |  | 
| 50 | 
             
            def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         | 
| 51 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 52 | 
             
                history_image_count, history_video_count = count_files_in_history(history)
         | 
| 53 | 
             
                image_count = history_image_count + new_image_count
         | 
| 54 | 
             
                video_count = history_video_count + new_video_count
         | 
|  | |
| 55 | 
             
                if video_count > 1:
         | 
| 56 | 
             
                    gr.Warning("Only one video is supported.")
         | 
| 57 | 
             
                    return False
         | 
| @@ -63,12 +76,21 @@ def validate_media_constraints(message: dict, history: list[dict]) -> bool: | |
| 63 | 
             
                        gr.Warning("Using <image> tags with video files is not supported.")
         | 
| 64 | 
             
                        return False
         | 
| 65 | 
             
                    # TODO: Add frame count validation for videos similar to image count limits  # noqa: FIX002, TD002, TD003
         | 
|  | |
| 66 | 
             
                if video_count == 0 and image_count > MAX_NUM_IMAGES:
         | 
| 67 | 
             
                    gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         | 
| 68 | 
             
                    return False
         | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 72 | 
             
                return True
         | 
| 73 |  | 
| 74 |  | 
| @@ -127,20 +149,65 @@ def process_interleaved_images(message: dict) -> list[dict]: | |
| 127 | 
             
                return content
         | 
| 128 |  | 
| 129 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 130 | 
             
            def process_new_user_message(message: dict) -> list[dict]:
         | 
|  | |
|  | |
|  | |
| 131 | 
             
                if not message["files"]:
         | 
| 132 | 
             
                    return [{"type": "text", "text": message["text"]}]
         | 
| 133 |  | 
| 134 | 
            -
                 | 
| 135 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 136 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 137 | 
             
                if "<image>" in message["text"]:
         | 
| 138 | 
             
                    return process_interleaved_images(message)
         | 
| 139 |  | 
| 140 | 
            -
                 | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
|  | |
|  | |
| 144 |  | 
| 145 |  | 
| 146 | 
             
            def process_history(history: list[dict]) -> list[dict]:
         | 
| @@ -323,13 +390,20 @@ DESCRIPTION = """\ | |
| 323 |  | 
| 324 | 
             
            This is a demo of Gemma 3 27B it, a vision language model with outstanding performance on a wide range of tasks.
         | 
| 325 | 
             
            You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
         | 
|  | |
|  | |
| 326 | 
             
            """
         | 
| 327 |  | 
|  | |
| 328 | 
             
            demo = gr.ChatInterface(
         | 
| 329 | 
             
                fn=run,
         | 
| 330 | 
             
                type="messages",
         | 
| 331 | 
             
                chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
         | 
| 332 | 
            -
                textbox=gr.MultimodalTextbox( | 
|  | |
|  | |
|  | |
|  | |
| 333 | 
             
                multimodal=True,
         | 
| 334 | 
             
                additional_inputs=[
         | 
| 335 | 
             
                    gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
         | 
|  | |
| 14 | 
             
            from PIL import Image
         | 
| 15 | 
             
            from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
         | 
| 16 |  | 
| 17 | 
            +
            # [PDF] Add PyPDF2
         | 
| 18 | 
            +
            import PyPDF2
         | 
| 19 | 
            +
             | 
| 20 | 
             
            model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
         | 
| 21 | 
             
            processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
         | 
| 22 | 
             
            model = Gemma3ForConditionalGeneration.from_pretrained(
         | 
|  | |
| 51 |  | 
| 52 |  | 
| 53 | 
             
            def validate_media_constraints(message: dict, history: list[dict]) -> bool:
         | 
| 54 | 
            +
                """
         | 
| 55 | 
            +
                ์ด๋ฏธ์ง/๋น๋์ค ๊ฐ์์ ํผํฉ ์ฌ๋ถ ๋ฑ์ ๊ฒ์ฌํ๋ ํจ์.
         | 
| 56 | 
            +
                PDF๋ ๊ฒ์ฌ ๋ก์ง์์ ์ ์ธํ์ฌ ์
๋ก๋๋ง ํ์ฉ.
         | 
| 57 | 
            +
                """
         | 
| 58 | 
            +
                # [PDF] PDF ํ์ผ ์ ์ธ ์ฒ๋ฆฌ
         | 
| 59 | 
            +
                pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
         | 
| 60 | 
            +
                non_pdf_files = [f for f in message["files"] if not f.endswith(".pdf")]
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                # ๊ธฐ์กด ๋ก์ง์ non_pdf_files(= ์ด๋ฏธ์ง/๋น๋์ค)์ ๋ํด์๋ง ์ฒดํฌ
         | 
| 63 | 
            +
                new_image_count, new_video_count = count_files_in_new_message(non_pdf_files)
         | 
| 64 | 
             
                history_image_count, history_video_count = count_files_in_history(history)
         | 
| 65 | 
             
                image_count = history_image_count + new_image_count
         | 
| 66 | 
             
                video_count = history_video_count + new_video_count
         | 
| 67 | 
            +
             | 
| 68 | 
             
                if video_count > 1:
         | 
| 69 | 
             
                    gr.Warning("Only one video is supported.")
         | 
| 70 | 
             
                    return False
         | 
|  | |
| 76 | 
             
                        gr.Warning("Using <image> tags with video files is not supported.")
         | 
| 77 | 
             
                        return False
         | 
| 78 | 
             
                    # TODO: Add frame count validation for videos similar to image count limits  # noqa: FIX002, TD002, TD003
         | 
| 79 | 
            +
             | 
| 80 | 
             
                if video_count == 0 and image_count > MAX_NUM_IMAGES:
         | 
| 81 | 
             
                    gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
         | 
| 82 | 
             
                    return False
         | 
| 83 | 
            +
             | 
| 84 | 
            +
                # [PDF] PDF ๊ฐฏ์ ์ ํ(ํ์ํ๋ค๋ฉด)๋ ์ถ๊ฐ ๊ฐ๋ฅ
         | 
| 85 | 
            +
                # ์ผ๋จ ์ ํ์ ๋์ง ์๊ณ  ๋ฐ๋ก True ๋ฐํ
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                # <image> ํ๊ทธ๊ฐ ์์ ๊ฒฝ์ฐ, ์ด๋ฏธ์ง ๊ฐ์์ ๋งค์นญ ๊ฒ์ฌ
         | 
| 88 | 
            +
                if "<image>" in message["text"]:
         | 
| 89 | 
            +
                    # new_image_count๋ pdf ์ ์ธ๋ ์ด๋ฏธ์ง ์
         | 
| 90 | 
            +
                    if message["text"].count("<image>") != new_image_count:
         | 
| 91 | 
            +
                        gr.Warning("The number of <image> tags in the text does not match the number of images.")
         | 
| 92 | 
            +
                        return False
         | 
| 93 | 
            +
             | 
| 94 | 
             
                return True
         | 
| 95 |  | 
| 96 |  | 
|  | |
| 149 | 
             
                return content
         | 
| 150 |  | 
| 151 |  | 
| 152 | 
            +
            # [PDF] Add PDF -> Markdown conversion function
         | 
| 153 | 
            +
            def pdf_to_markdown(pdf_path: str) -> str:
                """Extract the text of a PDF and return it as simple Markdown.

                Each page that yields non-empty text becomes a ``## Page N`` section,
                where ``N`` is the 1-based page number. Pages without extractable
                text are skipped.

                Args:
                    pdf_path: Filesystem path of the PDF to read.

                Returns:
                    The page sections joined by newlines. An empty string is returned
                    when nothing could be extracted, including when PyPDF2 rejects
                    the file before any page was read.

                Raises:
                    OSError: If ``pdf_path`` cannot be opened.
                """
                text_chunks = []
                with open(pdf_path, "rb") as f:
                    # Only the parsing is guarded: a malformed or encrypted upload is a
                    # user-data problem, not a reason to abort the whole chat turn.
                    # Sections collected before the failure are kept.
                    try:
                        reader = PyPDF2.PdfReader(f)
                        for page_num, page in enumerate(reader.pages, start=1):
                            page_text = (page.extract_text() or "").strip()
                            if page_text:
                                # One Markdown header per page, followed by its text.
                                text_chunks.append(f"## Page {page_num}\n\n{page_text}\n")
                    except PyPDF2.errors.PdfReadError as exc:
                        logger.warning("Could not read PDF {!r}: {}", pdf_path, exc)
                return "\n".join(text_chunks)
         | 
| 167 | 
            +
             | 
| 168 | 
            +
             | 
| 169 | 
             
            def process_new_user_message(message: dict) -> list[dict]:
         | 
| 170 | 
            +
                """
         | 
| 171 | 
            +
                ์ user message์์ text, ํ์ผ(์ด๋ฏธ์ง/๋น๋์ค/PDF)์ ์ฒ๋ฆฌ.
         | 
| 172 | 
            +
                """
         | 
| 173 | 
             
                if not message["files"]:
         | 
| 174 | 
             
                    return [{"type": "text", "text": message["text"]}]
         | 
| 175 |  | 
| 176 | 
            +
                # [PDF] PDF ํ์ผ ๋ชฉ๋ก
         | 
| 177 | 
            +
                pdf_files = [f for f in message["files"] if f.endswith(".pdf")]
         | 
| 178 | 
            +
                # ์ด๋ฏธ์งยท๋น๋์ค ๋ชฉ๋ก
         | 
| 179 | 
            +
                other_files = [f for f in message["files"] if not f.endswith(".pdf")]
         | 
| 180 | 
            +
             | 
| 181 | 
            +
                # ์ผ๋จ ์ฌ์ฉ์์ text๋ฅผ ๊ฐ์ฅ ๋จผ์  ๋ฃ๋๋ค
         | 
| 182 | 
            +
                content_list = [{"type": "text", "text": message["text"]}]
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                # PDF ๋ณํ ํ ์ถ๊ฐ
         | 
| 185 | 
            +
                for pdf_path in pdf_files:
         | 
| 186 | 
            +
                    pdf_markdown = pdf_to_markdown(pdf_path)
         | 
| 187 | 
            +
                    if pdf_markdown.strip():
         | 
| 188 | 
            +
                        content_list.append({"type": "text", "text": pdf_markdown})
         | 
| 189 | 
            +
                    else:
         | 
| 190 | 
            +
                        content_list.append({"type": "text", "text": "(PDF์์ ํ
์คํธ ์ถ์ถ ์คํจ)"})
         | 
| 191 |  | 
| 192 | 
            +
             | 
| 193 | 
            +
                # ์์์ด ์๋์ง ํ์ธ
         | 
| 194 | 
            +
                video_files = [f for f in other_files if f.endswith(".mp4")]
         | 
| 195 | 
            +
                if video_files:
         | 
| 196 | 
            +
                    # ๋น๋์ค๋ ํ ๊ฐ๋ง ์ฒ๋ฆฌํ๋ค๋ ์ ์  (validate_media_constraints์์ ์ด๋ฏธ ๊ฒ์ฌ)
         | 
| 197 | 
            +
                    # ์ฌ๋ฌ ๊ฐ์ผ ๊ฒฝ์ฐ ์ฒซ ๋ฒ์งธ ๊ฒ๋ง ์ฒ๋ฆฌํ๊ฑฐ๋, ๊ฒฝ๊ณ  ์ฒ๋ฆฌ
         | 
| 198 | 
            +
                    content_list += process_video(video_files[0])
         | 
| 199 | 
            +
                    return content_list
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                # interleaved ์ด๋ฏธ์ง
         | 
| 202 | 
             
                if "<image>" in message["text"]:
         | 
| 203 | 
             
                    return process_interleaved_images(message)
         | 
| 204 |  | 
| 205 | 
            +
                # ์ผ๋ฐ ์ด๋ฏธ์ง(์ฌ๋ฌ ์ฅ)
         | 
| 206 | 
            +
                image_files = [f for f in other_files if not f.endswith(".mp4")]
         | 
| 207 | 
            +
                if image_files:
         | 
| 208 | 
            +
                    content_list += [{"type": "image", "url": path} for path in image_files]
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                return content_list
         | 
| 211 |  | 
| 212 |  | 
| 213 | 
             
            def process_history(history: list[dict]) -> list[dict]:
         | 
|  | |
| 390 |  | 
| 391 | 
             
            This is a demo of Gemma 3 27B it, a vision language model with outstanding performance on a wide range of tasks.
         | 
| 392 | 
             
            You can upload images, interleaved images and videos. Note that video input only supports single-turn conversation and mp4 input.
         | 
| 393 | 
            +
             | 
| 394 | 
            +
            Also, PDF files are now supported: any uploaded PDF will be converted to Markdown text and passed into the conversation.
         | 
| 395 | 
             
            """
         | 
| 396 |  | 
| 397 | 
            +
            # [PDF] Allow .pdf
         | 
| 398 | 
             
            demo = gr.ChatInterface(
         | 
| 399 | 
             
                fn=run,
         | 
| 400 | 
             
                type="messages",
         | 
| 401 | 
             
                chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
         | 
| 402 | 
            +
                textbox=gr.MultimodalTextbox(
         | 
| 403 | 
            +
                    file_types=["image", ".mp4", ".pdf"],  # [PDF] ํ์ฉ
         | 
| 404 | 
            +
                    file_count="multiple",
         | 
| 405 | 
            +
                    autofocus=True
         | 
| 406 | 
            +
                ),
         | 
| 407 | 
             
                multimodal=True,
         | 
| 408 | 
             
                additional_inputs=[
         | 
| 409 | 
             
                    gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
         | 
 
			
