Spaces:

crystalchen
/

demo-fin-pdf-extraction

Runtime error

App Files Community

crystalchen committed on Aug 13, 2024

Commit

735fe06

verified ·

1 Parent(s): 9ab56a2

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -15

app.py CHANGED Viewed

@@ -1,11 +1,6 @@
 ## Set Environment
 import os
-#os.system('python -m venv .env')
-#os.system('source .env/bin/activate')
-## Install poppler in os
-#os.system('apt-get update')
-#os.system('apt-get install poppler-utils')
-##
 from pdf2image import convert_from_path
 import cv2
@@ -17,13 +12,6 @@ import json
 from anthropic import Anthropic, Client
 import gradio as gr
-def get_base64_encorded_image(image_path):
-  with open(image_path, "rb") as image_file:
-    binary_data = image_file.read()
-    base64_encorded_data = base64.b64encode(binary_data)
-    base64_string = base64_encorded_data.decode('utf-8')
-  return base64_string
 ## Set Environment
 os.system('python -m venv env')
 os.system('source env/bin/activate')
@@ -33,6 +21,12 @@ os.system('apt-get update')
 os.system('sudo apt-get install poppler-utils')
 ## The rest of your app.py code goes here
 ## Process pdf
@@ -120,7 +114,6 @@ def extract_table_info(image_path):
             "type": "text",
             "text": """
                  Please extract the table information of the image, keep the context in Traditional Chinese without translation.
-                 if you can not recognize the value precisely, please infer it and try to make a best guess.
                  If you can not make the best guess, please return “UNK”.
                  Create a structured set of data in json format providing key information about a table.
                  Keep the section titles in the table as a parts of json.
@@ -129,7 +122,7 @@ def extract_table_info(image_path):
                  Do not do any sort operation with all the rows.
                  Extract the text information of each cell precisely. Do not make inference between  "代碼" and "項目" if you can not extract it precisely.
                  Make sure the length of each cell you predict is the same as you extract.
-                 Please do not mix "代碼" and "項目" with other case.
                  JSON fields must be labelled as:
                  Example json structure is:
                  <json>

 ## Set Environment
 import os
 from pdf2image import convert_from_path
 import cv2
 from anthropic import Anthropic, Client
 import gradio as gr
 ## Set Environment
 os.system('python -m venv env')
 os.system('source env/bin/activate')
 os.system('sudo apt-get install poppler-utils')
 ## The rest of your app.py code goes here
+def get_base64_encorded_image(image_path):
+  with open(image_path, "rb") as image_file:
+    binary_data = image_file.read()
+    base64_encorded_data = base64.b64encode(binary_data)
+    base64_string = base64_encorded_data.decode('utf-8')
+  return base64_string
 ## Process pdf
             "type": "text",
             "text": """
                  Please extract the table information of the image, keep the context in Traditional Chinese without translation.
                  If you can not make the best guess, please return “UNK”.
                  Create a structured set of data in json format providing key information about a table.
                  Keep the section titles in the table as a parts of json.
                  Do not do any sort operation with all the rows.
                  Extract the text information of each cell precisely. Do not make inference between  "代碼" and "項目" if you can not extract it precisely.
                  Make sure the length of each cell you predict is the same as you extract.
+                 Please do not make any guess with "項目" based on the value of "代碼".
                  JSON fields must be labelled as:
                  Example json structure is:
                  <json>