openfree committed on
Commit
f932d41
·
verified ·
1 Parent(s): 421d7b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -42
app.py CHANGED
@@ -145,7 +145,11 @@ def replace_image_with_base64(markdown_text, image_dir_path):
145
  def to_pdf(file_path):
146
  """
147
  이미지(JPG/PNG 등)를 PDF로 컨버팅.
 
148
  """
 
 
 
149
  with pymupdf.open(file_path) as f:
150
  if f.is_pdf:
151
  return file_path
@@ -159,50 +163,54 @@ def to_pdf(file_path):
159
 
160
  def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language, progress=gr.Progress(track_tqdm=False)):
161
  """
162
- 업로드된 PDF/이미지 -> PDF 변환 -> 마크다운 변환
163
  (프로그레스 바 표시용)
164
  """
165
- progress(0, "PDF둜 λ³€ν™˜ 쀑...")
166
- file_path = to_pdf(file_path)
167
- time.sleep(0.5)
168
-
169
- if end_pages > 20:
170
- end_pages = 20
171
-
172
- progress(20, "λ¬Έμ„œ νŒŒμ‹± 쀑...")
173
- local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
174
- layout_mode, formula_enable, table_enable, language)
175
- time.sleep(0.5)
176
-
177
- progress(50, "μ••μΆ•(zip) 생성 쀑...")
178
- archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
179
- zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
180
- if zip_archive_success == 0:
181
- logger.info("μ••μΆ• 성곡")
182
- status_message = "\n\n**λ³€ν™˜ μ™„λ£Œ (μ••μΆ• 성곡)**"
183
  else:
184
- logger.error("μ••μΆ• μ‹€νŒ¨")
185
- status_message = "\n\n**λ³€ν™˜ μ™„λ£Œ (μ••μΆ• μ‹€νŒ¨)**"
186
- time.sleep(0.5)
187
-
188
- progress(70, "λ§ˆν¬λ‹€μš΄ μ½λŠ” 쀑...")
189
- md_path = os.path.join(local_md_dir, file_name + ".md")
190
- with open(md_path, 'r', encoding='utf-8') as f:
191
- txt_content = f.read()
192
- time.sleep(0.5)
193
-
194
- progress(90, "이미지 base64 λ³€ν™˜ 쀑...")
195
- md_content = replace_image_with_base64(txt_content, local_md_dir)
196
- time.sleep(0.5)
197
-
198
- progress(100, "λ³€ν™˜ μ™„λ£Œ!")
199
- return md_content + status_message
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
  def to_markdown_comparison(file_a, file_b, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language, progress=gr.Progress(track_tqdm=False)):
202
  """
203
- 두 개의 파일을 변환하여 A/B 비교를 위한 마크다운을 생성한다.
204
- 각 파일의 변환 결과는 "문서 A", "문서 B" 헤더로 구분되며,
205
- 두 문서 모두 업로드된 경우 두 문서의 차이점, 장단점 및 주요 내용을 비교 분석하도록 추가 지시사항을 포함한다.
206
  """
207
  combined_md = ""
208
  if file_a is not None:
@@ -392,7 +400,7 @@ if __name__ == "__main__":
392
  gr.HTML("""
393
  <div class="title-area">
394
  <h1>VisionOCR</h1>
395
- <p>두 개의 PDF/이미지 νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ—¬ A/B 비ꡐ ν›„, μΆ”λ‘  LLMκ³Ό λŒ€ν™”ν•©λ‹ˆλ‹€.<br>
396
  ν•œ 파일만 μ—…λ‘œλ“œν•˜λ©΄ ν•΄λ‹Ή 파일둜 λΆ„μ„ν•©λ‹ˆλ‹€.</p>
397
  </div>
398
  """)
@@ -404,8 +412,8 @@ if __name__ == "__main__":
404
  hidden_chatbot = gr.Chatbot(visible=False)
405
 
406
  with gr.Row():
407
- file_a = gr.File(label="λ¬Έμ„œ A μ—…λ‘œλ“œ", file_types=[".pdf", ".png", ".jpeg", ".jpg"], interactive=True)
408
- file_b = gr.File(label="λ¬Έμ„œ B μ—…λ‘œλ“œ", file_types=[".pdf", ".png", ".jpeg", ".jpg"], interactive=True)
409
  convert_btn = gr.Button("λΉ„κ΅μš© λ³€ν™˜ν•˜κΈ°")
410
 
411
  # 파일 업로드 시 상태 초기화 (Chatbot은 숨김 처리)
@@ -447,7 +455,7 @@ if __name__ == "__main__":
447
  chat_input = gr.Textbox(lines=1, placeholder="μ§ˆλ¬Έμ„ μž…λ ₯ν•˜μ„Έμš”...")
448
  clear_btn = gr.Button("λŒ€ν™” μ΄ˆκΈ°ν™”")
449
 
450
- # 채팅 입력 후, LLM 응답은 hidden_chatbot에 저장(화면에 보이지 않음)
451
  chat_input.submit(
452
  fn=user_message,
453
  inputs=[chat_input, chat_history, md_state],
 
145
  def to_pdf(file_path):
146
  """
147
  이미지(JPG/PNG 등)를 PDF로 컨버팅.
148
+ TXT, CSV 파일인 경우 변환 없이 원본 경로를 반환한다.
149
  """
150
+ ext = Path(file_path).suffix.lower()
151
+ if ext in ['.txt', '.csv']:
152
+ return file_path
153
  with pymupdf.open(file_path) as f:
154
  if f.is_pdf:
155
  return file_path
 
163
 
164
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language, progress=gr.Progress(track_tqdm=False)):
    """
    Convert an uploaded PDF/image or TXT/CSV file into a markdown string.

    TXT/CSV files are read as text and returned inside a fenced code block.
    Any other file goes through ``to_pdf`` and ``parse_pdf``; the resulting
    output directory is zipped, the generated ``.md`` file is read back, and
    its image references are rewritten by ``replace_image_with_base64``.

    Args:
        file_path: Path of the uploaded file.
        end_pages: Number of pages to parse; capped at 20. ``parse_pdf``
            receives ``end_pages - 1``.
        is_ocr, layout_mode, formula_enable, table_enable, language:
            Forwarded unchanged to ``parse_pdf``.
        progress: Gradio progress tracker; it is fed fractions in [0, 1].

    Returns:
        The markdown text followed by a bold status line.
    """
    # os.path.splitext keeps this block independent of a `Path` import.
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix in ('.txt', '.csv'):
        progress(0, "파일 읽는 중...")
        # utf-8-sig drops a leading BOM (common in spreadsheet CSV exports);
        # errors='replace' keeps one stray byte from aborting the conversion.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            txt_content = f.read()
        time.sleep(0.5)
        progress(0.5, "파일 내용 처리 중...")
        progress(1.0, "변환 완료!")
        # The fence markers need their own lines; otherwise the first line of
        # the file is consumed as the fence's info string.
        return f"```\n{txt_content}\n```\n\n**변환 완료 (텍스트/CSV 파일)**"

    progress(0, "PDF로 변환 중...")
    file_path = to_pdf(file_path)
    time.sleep(0.5)

    # Never parse more than 20 pages per document.
    end_pages = min(end_pages, 20)

    progress(0.2, "문서 파싱 중...")
    local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
                                        layout_mode, formula_enable, table_enable, language)
    time.sleep(0.5)

    progress(0.5, "압축(zip) 생성 중...")
    archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
    # compress_directory_to_zip is treated as successful when it returns 0.
    if compress_directory_to_zip(local_md_dir, archive_zip_path) == 0:
        logger.info("압축 성공")
        status_message = "\n\n**변환 완료 (압축 성공)**"
    else:
        logger.error("압축 실패")
        status_message = "\n\n**변환 완료 (압축 실패)**"
    time.sleep(0.5)

    progress(0.7, "마크다운 읽는 중...")
    md_path = os.path.join(local_md_dir, file_name + ".md")
    with open(md_path, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    time.sleep(0.5)

    progress(0.9, "이미지 base64 변환 중...")
    md_content = replace_image_with_base64(txt_content, local_md_dir)
    time.sleep(0.5)

    progress(1.0, "변환 완료!")
    return md_content + status_message
208
 
209
  def to_markdown_comparison(file_a, file_b, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language, progress=gr.Progress(track_tqdm=False)):
210
  """
211
+ 두 개의 파일을 변환하여 A/B 비교용 마크다운 생성.
212
+ 각 파일은 "문서 A", "문서 B" 헤더로 구분되며,
213
+ 두 파일 모두 업로드된 경우 추가로 비교 분석 지시사항을 포함한다.
214
  """
215
  combined_md = ""
216
  if file_a is not None:
 
400
  gr.HTML("""
401
  <div class="title-area">
402
  <h1>VisionOCR</h1>
403
+ <p>두 개의 PDF/이미지/ν…μŠ€νŠΈ/CSV νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ—¬ A/B 비ꡐ ν›„, μΆ”λ‘  LLMκ³Ό λŒ€ν™”ν•©λ‹ˆλ‹€.<br>
404
  ν•œ 파일만 μ—…λ‘œλ“œν•˜λ©΄ ν•΄λ‹Ή 파일둜 λΆ„μ„ν•©λ‹ˆλ‹€.</p>
405
  </div>
406
  """)
 
412
  hidden_chatbot = gr.Chatbot(visible=False)
413
 
414
  with gr.Row():
415
+ file_a = gr.File(label="λ¬Έμ„œ A μ—…λ‘œλ“œ", file_types=[".pdf", ".png", ".jpeg", ".jpg", ".txt", ".csv"], interactive=True)
416
+ file_b = gr.File(label="λ¬Έμ„œ B μ—…λ‘œλ“œ", file_types=[".pdf", ".png", ".jpeg", ".jpg", ".txt", ".csv"], interactive=True)
417
  convert_btn = gr.Button("λΉ„κ΅μš© λ³€ν™˜ν•˜κΈ°")
418
 
419
  # 파일 업로드 시 상태 초기화 (Chatbot은 숨김 처리)
 
455
  chat_input = gr.Textbox(lines=1, placeholder="μ§ˆλ¬Έμ„ μž…λ ₯ν•˜μ„Έμš”...")
456
  clear_btn = gr.Button("λŒ€ν™” μ΄ˆκΈ°ν™”")
457
 
458
+ # 채팅 입력 후 LLM 응답은 hidden_chatbot에 저장(화면에 보이지 않음)
459
  chat_input.submit(
460
  fn=user_message,
461
  inputs=[chat_input, chat_history, md_state],