Spaces:

crystalchen
/

demo-fin-pdf-extraction

Runtime error

App Files Files Community

crystalchen committed on Aug 14, 2024

Commit

f49dfa1

verified ·

1 Parent(s): 735fe06

Update app.py

Browse files

- Simplify message_list
- Output as markdown
- Limit output tokens

Files changed (1) hide show

app.py +48 -251

app.py CHANGED Viewed

@@ -109,250 +109,25 @@ def extract_table_info(image_path):
         "role": "user",
         "content": [
           {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
-          #{"type": "text", "text": "Please extract the table information of the image, keep the context in Traditional Chinese without translation, all the alphanumeric chararacter exressed in string, give me a json dictionary of the information extracted"},
           {
             "type": "text",
             "text": """
-                 Please extract the table information of the image, keep the context in Traditional Chinese without translation.
-                 If you can not make the best guess, please return “UNK”.
-                 Create a structured set of data in json format providing key information about a table.
-                 Keep the section titles in the table as a parts of json.
-                 Be sure to extract the information of "代碼", and save them as part of json.
-                 All the value extracted are string, including the "代碼".
-                 Do not do any sort operation with all the rows.
-                 Extract the text information of each cell precisely. Do not make inference between  "代碼" and "項目" if you can not extract it precisely.
-                 Make sure the length of each cell you predict is the same as you extract.
-                 Please do not make any guess with "項目" based on the value of "代碼".
-                 JSON fields must be labelled as:
-                 Example json structure is:
-                 <json>
-                 {
-                    "table meta": [
-                      {"企業名稱":  },
-                      {"表頭名稱":  },
-                      {"報表日期":  },
-                      {"幣別":      },
-                      ...
-                      ...
-                      ...
-                    ],
-                    "table detail": [
-                        {
-                          ...
-                          ...
-                          ...
-                        },
-                        {
-                          ...
-                          ...
-                          ...
-                        },
-                        ...
-                        ...
-                        ...
-                      ]
-                 }
-                 </json>
-                 Output the json structure as a string starting with <json> and ending with </json> XML tags.
-                 Do not return any narrative language. Look at the images in detail.
-                 Do not insert and control code, like line feed, tab indent: "\n"
-                 IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.
-                 Example:
-                 <json>
-                 {
-                    "table meta": [
-                      {"企業名稱": "台灣水泥股份有限公司"},
-                      {"表頭名稱": "個體資產負債表"},
-                      {"報表日期": "民國 112 年及 111 年 12 月 31 日"},
-                      {"幣別": "新台幣仟元"},
-                      ...
-                      ...
-                      ...
-                    ],
-                    "table detail": [
-                      {
-                        "資產": [
-                          { "流動資產":
-                            [
-                              {
-                                "代碼": "1100",
-                                "項目": "現金及約當現金（附註四及六）",
-                                "112年12月31日金額": "1,516,633",
-                                "112年12月31日%": "-",
-                                "111年12月31日金額": "4,243,295",
-                                "111年12月31日%": "1"
-                              },
-                              {
-                                "代碼": "1110",
-                                "項目": "透過損益按公允價值衡量之金融資產（附註四、七及二六）",
-                                "112年12月31日金額": "341,056",
-                                "112年12月31日%": "-",
-                                "111年12月31日金額": "259,919",
-                                "111年12月31日%": "-"
-                              },
-                              {
-                                "代碼": "1120",
-                                "項目": "透過其他綜合損益按公允價值衡量之金融資產（附註四、八及二六）",
-                                "112年12月31日金額": "4,333,594",
-                                "112年12月31日%": "1",
-                                "111年12月31日金額": "3,607,819",
-                                "111年12月31日%": "1"
-                              },
-                              {
-                                "代碼": "1150",
-                                "項目": "應收票據及帳款淨額（附註四及九）",
-                                "112年12月31日金額": "5,801,135",
-                                "112年12月31日%": "2",
-                                "111年12月31日金額": "5,319,368",
-                                "111年12月31日%": "1"
-                             },
-                              {
-                                "代碼": "1180",
-                                "項目": "應收票據及帳款－關係人（附註四及二七）",
-                                "112年12月31日金額": "572,118",
-                                "112年12月31日%": "-",
-                                "111年12月31日金額": "681,793",
-                                "111年12月31日%": "-"
-                              },
-                              {
-                                "代碼": "130X",
-                                "項目": "存貨（附註四及十）",
-                                "112年12月31日金額": "1,782,735",
-                                "112年12月31日%": "1",
-                                "111年12月31日金額": "2,321,850",
-                                "111年12月31日%": "1"
-                              },
-                              {
-                                "代碼": "1470",
-                                "項目": "其他流動資產（附註二一及二七）",
-                                "112年12月31日金額": "411,540",
-                                "112年12月31日%": "-",
-                                "111年12月31日金額": "248,683",
-                                "111年12月31日%": "-"
-                              },
-                              {
-                                "代碼": "11XX",
-                                "項目": "流動資產總計",
-                                "112年12月31日金額": "14,758,811",
-                                "112年12月31日%": "4",
-                                "111年12月31日金額": "16,682,727",
-                                "111年12月31日%": "4"
-                              }
-                            ]
-                        },
-                        {
-                          "非流動資產": [
-                          {
-                            "代碼": "1517",
-                            "項目": "透過其他綜合損益按公允價值衡量之金融資產（附註四、八及二六）",
-                            "112年12月31日金額": "9,638,255",
-                            "112年12月31日%": "3",
-                            "111年12月31日金額": "7,633,603",
-                            "111年12月31日%": "2"
-                          },
-                          {
-                            "代碼": "1550",
-                            "項目": "採用權益法之投資（附註四、五及十一）",
-                            "112年12月31日金額": "312,351,291",
-                            "112年12月31日%": "82",
-                            "111年12月31日金額": "307,101,709",
-                            "111年12月31日%": "82"
-                          },
-                          {
-                            "代碼": "1600",
-                            "項目": "不動產、廠房及設備（附註四、五、十二、十三及二八）",
-                            "112年12月31日金額": "28,052,603",
-                            "112年12月31日%": "7",
-                            "111年12月31日金額": "35,583,596",
-                            "111年12月31日%": "10"
-                          },
-                          {
-                            "代碼": "1755",
-                            "項目": "使用權資產（附註四、十五、二十、二七）",
-                            "112年12月31日金額": "1,797,820",
-                            "112年12月31日%": "1",
-                            "111年12月31日金額": "1,788,972",
-                            "111年12月31日%": "1"
-                          },
-                          {
-                            "代碼": "1760",
-                            "項目": "投資性不動產（附註四、十四及二十）",
-                            "112年12月31日金額": "13,042,677",
-                            "112年12月31日%": "3",
-                            "111年12月31日金額": "2,436,675",
-                            "111年12月31日%": "-"
-                          },
-                          {
-                            "代碼": "1821",
-                            "項目": "無形資產（附註四及二十）",
-                            "112年12月31日金額": "58,840",
-                            "112年12月31日%": "-",
-                            "111年12月31日金額": "64,956",
-                            "111年12月31日%": "-"
-                          },
-                          {
-                            "代碼": "1915",
-                            "項目": "預付設備款",
-                            "112年12月31日金額": "600,042",
-                            "112年12月31日%": "-",
-                            "111年12月31日金額": "682,765",
-                            "111年12月31日%": "-"
-                          },
-                          {
-                            "代碼": "1975",
-                            "項目": "淨確定福利資產（附註四及十八）",
-                            "112年12月31日金額": "1,507,153",
-                            "112年12月31日%": "-",
-                            "111年12月31日金額": "1,526,546",
-                            "111年12月31日%": "-"
-                          },
-                          {
-                            "代碼": "1990",
-                            "項目": "其他非流動資產（附註四、六、二一及二八）",
-                            "112年12月31日金額": "827,628",
-                            "112年12月31日%": "-",
-                            "111年12月31日金額": "840,688",
-                            "111年12月31日%": "1"
-                          },
-                          {
-                            "代碼": "15XX",
-                            "項目": "非流動資產總計",
-                            "112年12月31日金額": "367,876,309",
-                            "112年12月31日%": "96",
-                            "111年12月31日金額": "357,659,510",
-                            "111年12月31日%": "96"
-                          }]
-                        },
-                        {
-                          "代碼": "1XXX",
-                          "項目": "資產總計",
-                          "112年12月31日金額": "382,635,120",
-                          "112年12月31日%": "100",
-                          "111年12月31日金額": "374,342,237",
-                          "111年12月31日%": "100"
-                        }
-                      ]
-                      },
-                      {
-                        "負債": [
-                          ...
-                          ...
-                          ...
-                        ]
-                      },
-                      ...
-                      ...
-                      ...
-                    ]
-                  }
-                 </json>
               """
@@ -365,14 +140,14 @@ def extract_table_info(image_path):
   # Update how the API is called
   response = client.messages.create(
       model=MODEL_NAME,
-      max_tokens=8192, # limit the amount of response information
       messages=message_list,
-      temperature=0.7,
       extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}  # Changed to a dictionary
   )
   tokens = response.usage.output_tokens
   print(f"Generated Tokens: {tokens}")
-  #print(f"Response: {response}")
   return response
 ## Check Response
@@ -389,6 +164,26 @@ def check_response(response):
       print("Unexpected response format. Unable to extract text.")
   return None
 ## Extract Json data
 def extract_json(response):
     response_text = response.content[0].text  # Access the 'text' attribute of the TextBlock object
@@ -423,27 +218,29 @@ def pipeline(pdf_path):
   response = {}
   response = extract_table_info(destamp_img)
   check_response(response)
-  json_data = extract_json(response)
-  return len(pages), destamp_img, json_data
 ## Gradio Interface
 title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
 description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
-examples = ['text_pdf.pdf', 'image_pdf.pdf']
 pdf_file = gr.File(label="Upload PDF", type="filepath")
 pages = gr.File(label="Pages", type="filepath")
 num_pages = gr.Number(label="Number of Pages")
 destamp_img = gr.Image(type="numpy", label="De-stamped Image")
-json_data = gr.JSON(label="JSON Data")
 app = gr.Interface(fn=pipeline,
                      inputs=pdf_file,
-                     outputs=[num_pages, destamp_img, json_data],
                      title=title,
                      description=description,
                      examples=examples)
 app.queue()
-#app.launch(debug=True, share=True)
-app.launch(share=True)

         "role": "user",
         "content": [
           {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
           {
             "type": "text",
             "text": """
+                 You are analyzing an Financial Statement in traditional Chinese.
+                 Please extract all the information of the statement image, keep the context in Traditional Chinese without translation.
+                 Extract information row by row, and cell by cell.
+                 Keep document title, header, date, currency, section header, summary, footer, ... as part of the information.
+                 OCR all the cells precisely with the best accuracy. Any Chinese character, if you can not make the best guess, please return "?". Do not ignore it.
+                 Do not do any correction with the content of the cell related with "代碼", even it is not 100% correct from your experience. Keep as what it is.
+                 Makd sure the length of the string of each cell is same as the image.
+                 Save all the information as a markdown table.
+                 Keep alignment of each column with the image.
+                 Repsonse as below structure:
+                 <mark>
+                 ...
+                 ...
+                 ...
+                 </mark>
               """
   # Update how the API is called
   response = client.messages.create(
       model=MODEL_NAME,
+      max_tokens=3072, # limit the amount of response information
       messages=message_list,
+      temperature=0.6,
       extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"}  # Changed to a dictionary
   )
   tokens = response.usage.output_tokens
   print(f"Generated Tokens: {tokens}")
+  print(f"Response: {response}")
   return response
 ## Check Response
       print("Unexpected response format. Unable to extract text.")
   return None
## Extract markdown data
def extract_markdown(response):
    """Return the text enclosed by the first ``<mark>...</mark>`` pair.

    The text is read from ``response.content[0].text``.

    Args:
        response: Object whose ``content[0].text`` attribute holds the
            model's reply as a string.

    Returns:
        The substring strictly between ``<mark>`` and the first ``</mark>``
        that follows it, or ``None`` when the opening tag is missing, the
        closing tag is missing, or nothing lies between the two tags.
    """
    open_tag = "<mark>"
    close_tag = "</mark>"
    response_text = response.content[0].text  # Access the 'text' attribute of the TextBlock object

    # Test the raw find() result *before* adding the tag length: adding it
    # first would turn the "not found" value -1 into a valid-looking index.
    tag_pos = response_text.find(open_tag)
    if tag_pos < 0:
        print("Could not find valid Markdown object in response.")
        return None

    mark_start = tag_pos + len(open_tag)  # First character after <mark>
    # Search for the closing tag only after the opening tag so that a stray
    # </mark> earlier in the text cannot be paired with it.
    mark_end = response_text.find(close_tag, mark_start)
    print(f"mark_start: {mark_start}")
    print(f"mark_end: {mark_end}")

    # mark_end == -1 (no closing tag) and mark_end == mark_start (empty
    # payload) are both treated as "nothing to return".
    if mark_end > mark_start:
        mark_data = response_text[mark_start:mark_end]
        print(f"mark_data: {mark_data}")
        return mark_data

    print("Could not find valid Markdown object in response.")
    return None
 ## Extract Json data
 def extract_json(response):
     response_text = response.content[0].text  # Access the 'text' attribute of the TextBlock object
   response = {}
   response = extract_table_info(destamp_img)
   check_response(response)
+  mark_data = extract_markdown(response)
+  #json_data = extract_json(response)
+  return len(pages), destamp_img, mark_data
 ## Gradio Interface
 title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
 description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
+examples = [['text_pdf.pdf'], ['image_pdf.pdf']]
 pdf_file = gr.File(label="Upload PDF", type="filepath")
 pages = gr.File(label="Pages", type="filepath")
 num_pages = gr.Number(label="Number of Pages")
 destamp_img = gr.Image(type="numpy", label="De-stamped Image")
+#json_data = gr.JSON(label="JSON Data")
+mark_data = gr.Markdown(label="Markdown Data")
 app = gr.Interface(fn=pipeline,
                      inputs=pdf_file,
+                     outputs=[num_pages, destamp_img, mark_data],
                      title=title,
                      description=description,
                      examples=examples)
 app.queue()
+app.launch(debug=True, share=True)
+#app.launch()