crystalchen committed on
Commit
f49dfa1
·
verified ·
1 Parent(s): 735fe06

Update app.py

Browse files

- Simplify message_list
- Output as markdown
- limit output token

Files changed (1) hide show
  1. app.py +48 -251
app.py CHANGED
@@ -109,250 +109,25 @@ def extract_table_info(image_path):
109
  "role": "user",
110
  "content": [
111
  {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
112
- #{"type": "text", "text": "Please extract the table information of the image, keep the context in Traditional Chinese without translation, all the alphanumeric chararacter exressed in string, give me a json dictionary of the information extracted"},
113
  {
114
  "type": "text",
115
  "text": """
116
- Please extract the table information of the image, keep the context in Traditional Chinese without translation.
117
- If you can not make the best guess, please return “UNK”.
118
- Create a structured set of data in json format providing key information about a table.
119
- Keep the section titles in the table as a parts of json.
120
- Be sure to extract the information of "代碼", and save them as part of json.
121
- All the value extracted are string, including the "代碼".
122
- Do not do any sort operation with all the rows.
123
- Extract the text information of each cell precisely. Do not make inference between "代碼" and "項目" if you can not extract it precisely.
124
- Make sure the length of each cell you predict is the same as you extract.
125
- Please do not make any guess with "項目" based on the value of "代碼".
126
- JSON fields must be labelled as:
127
- Example json structure is:
128
- <json>
129
- {
130
- "table meta": [
131
- {"企業名稱": },
132
- {"表頭名稱": },
133
- {"報表日期": },
134
- {"幣別": },
135
- ...
136
- ...
137
- ...
138
- ],
139
- "table detail": [
140
- {
141
- ...
142
- ...
143
- ...
144
- },
145
- {
146
- ...
147
- ...
148
- ...
149
- },
150
- ...
151
- ...
152
- ...
153
- ]
154
- }
155
-
156
- </json>
157
-
158
-
159
- Output the json structure as a string starting with <json> and ending with </json> XML tags.
160
- Do not return any narrative language. Look at the images in detail.
161
- Do not insert and control code, like line feed, tab indent: "\n"
162
-
163
- IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.
164
-
165
- Example:
166
-
167
- <json>
168
- {
169
- "table meta": [
170
- {"企業名稱": "台灣水泥股份有限公司"},
171
- {"表頭名稱": "個體資產負債表"},
172
- {"報表日期": "民國 112 年及 111 年 12 月 31 日"},
173
- {"幣別": "新台幣仟元"},
174
- ...
175
- ...
176
- ...
177
-
178
- ],
179
- "table detail": [
180
- {
181
- "資產": [
182
- { "流動資產":
183
- [
184
- {
185
- "代碼": "1100",
186
- "項目": "現金及約當現金(附註四及六)",
187
- "112年12月31日金額": "1,516,633",
188
- "112年12月31日%": "-",
189
- "111年12月31日金額": "4,243,295",
190
- "111年12月31日%": "1"
191
- },
192
- {
193
- "代碼": "1110",
194
- "項目": "透過損益按公允價值衡量之金融資產(附註四、七及二六)",
195
- "112年12月31日金額": "341,056",
196
- "112年12月31日%": "-",
197
- "111年12月31日金額": "259,919",
198
- "111年12月31日%": "-"
199
- },
200
- {
201
- "代碼": "1120",
202
- "項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
203
- "112年12月31日金額": "4,333,594",
204
- "112年12月31日%": "1",
205
- "111年12月31日金額": "3,607,819",
206
- "111年12月31日%": "1"
207
- },
208
- {
209
- "代碼": "1150",
210
- "項目": "應收票據及帳款淨額(附註四及九)",
211
- "112年12月31日金額": "5,801,135",
212
- "112年12月31日%": "2",
213
- "111年12月31日金額": "5,319,368",
214
- "111年12月31日%": "1"
215
- },
216
- {
217
- "代碼": "1180",
218
- "項目": "應收票據及帳款-關係人(附註四及二七)",
219
- "112年12月31日金額": "572,118",
220
- "112年12月31日%": "-",
221
- "111年12月31日金額": "681,793",
222
- "111年12月31日%": "-"
223
- },
224
- {
225
- "代碼": "130X",
226
- "項目": "存貨(附註四及十)",
227
- "112年12月31日金額": "1,782,735",
228
- "112年12月31日%": "1",
229
- "111年12月31日金額": "2,321,850",
230
- "111年12月31日%": "1"
231
- },
232
- {
233
- "代碼": "1470",
234
- "項目": "其他流動資產(附註二一及二七)",
235
- "112年12月31日金額": "411,540",
236
- "112年12月31日%": "-",
237
- "111年12月31日金額": "248,683",
238
- "111年12月31日%": "-"
239
- },
240
- {
241
- "代碼": "11XX",
242
- "項目": "流動資產總計",
243
- "112年12月31日金額": "14,758,811",
244
- "112年12月31日%": "4",
245
- "111年12月31日金額": "16,682,727",
246
- "111年12月31日%": "4"
247
- }
248
- ]
249
- },
250
- {
251
- "非流動資產": [
252
- {
253
- "代碼": "1517",
254
- "項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
255
- "112年12月31日金額": "9,638,255",
256
- "112年12月31日%": "3",
257
- "111年12月31日金額": "7,633,603",
258
- "111年12月31日%": "2"
259
- },
260
- {
261
- "代碼": "1550",
262
- "項目": "採用權益法之投資(附註四、五及十一)",
263
- "112年12月31日金額": "312,351,291",
264
- "112年12月31日%": "82",
265
- "111年12月31日金額": "307,101,709",
266
- "111年12月31日%": "82"
267
- },
268
- {
269
- "代碼": "1600",
270
- "項目": "不動產、廠房及設備(附註四、五、十二、十三及二八)",
271
- "112年12月31日金額": "28,052,603",
272
- "112年12月31日%": "7",
273
- "111年12月31日金額": "35,583,596",
274
- "111年12月31日%": "10"
275
- },
276
- {
277
- "代碼": "1755",
278
- "項目": "使用權資產(附註四、十五、二十、二七)",
279
- "112年12月31日金額": "1,797,820",
280
- "112年12月31日%": "1",
281
- "111年12月31日金額": "1,788,972",
282
- "111年12月31日%": "1"
283
- },
284
- {
285
- "代碼": "1760",
286
- "項目": "投資性不動產(附註四、十四及二十)",
287
- "112年12月31日金額": "13,042,677",
288
- "112年12月31日%": "3",
289
- "111年12月31日金額": "2,436,675",
290
- "111年12月31日%": "-"
291
- },
292
- {
293
- "代碼": "1821",
294
- "項目": "無形資產(附註四及二十)",
295
- "112年12月31日金額": "58,840",
296
- "112年12月31日%": "-",
297
- "111年12月31日金額": "64,956",
298
- "111年12月31日%": "-"
299
- },
300
- {
301
- "代碼": "1915",
302
- "項目": "預付設備款",
303
- "112年12月31日金額": "600,042",
304
- "112年12月31日%": "-",
305
- "111年12月31日金額": "682,765",
306
- "111年12月31日%": "-"
307
- },
308
- {
309
- "代碼": "1975",
310
- "項目": "淨確定福利資產(附註四及十八)",
311
- "112年12月31日金額": "1,507,153",
312
- "112年12月31日%": "-",
313
- "111年12月31日金額": "1,526,546",
314
- "111年12月31日%": "-"
315
- },
316
- {
317
- "代碼": "1990",
318
- "項目": "其他非流動資產(附註四、六、二一及二八)",
319
- "112年12月31日金額": "827,628",
320
- "112年12月31日%": "-",
321
- "111年12月31日金額": "840,688",
322
- "111年12月31日%": "1"
323
- },
324
- {
325
- "代碼": "15XX",
326
- "項目": "非流動資產總計",
327
- "112年12月31日金額": "367,876,309",
328
- "112年12月31日%": "96",
329
- "111年12月31日金額": "357,659,510",
330
- "111年12月31日%": "96"
331
- }]
332
- },
333
- {
334
- "代碼": "1XXX",
335
- "項目": "資產總計",
336
- "112年12月31日金額": "382,635,120",
337
- "112年12月31日%": "100",
338
- "111年12月31日金額": "374,342,237",
339
- "111年12月31日%": "100"
340
- }
341
- ]
342
- },
343
- {
344
- "負債": [
345
- ...
346
- ...
347
- ...
348
- ]
349
- },
350
- ...
351
- ...
352
- ...
353
- ]
354
- }
355
- </json>
356
 
357
  """
358
 
@@ -365,14 +140,14 @@ def extract_table_info(image_path):
365
  # Update how the API is called
366
  response = client.messages.create(
367
  model=MODEL_NAME,
368
- max_tokens=8192, # limit the amount of response information
369
  messages=message_list,
370
- temperature=0.7,
371
  extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} # Changed to a dictionary
372
  )
373
  tokens = response.usage.output_tokens
374
  print(f"Generated Tokens: {tokens}")
375
- #print(f"Response: {response}")
376
  return response
377
 
378
  ## Check Response
@@ -389,6 +164,26 @@ def check_response(response):
389
  print("Unexpected response format. Unable to extract text.")
390
  return None
391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  ## Extract Json data
393
  def extract_json(response):
394
  response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
@@ -423,27 +218,29 @@ def pipeline(pdf_path):
423
  response = {}
424
  response = extract_table_info(destamp_img)
425
  check_response(response)
426
- json_data = extract_json(response)
427
- return len(pages), destamp_img, json_data
 
428
 
429
  ## Gradio Interface
430
  title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
431
  description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
432
- examples = ['text_pdf.pdf', 'image_pdf.pdf']
433
 
434
  pdf_file = gr.File(label="Upload PDF", type="filepath")
435
  pages = gr.File(label="Pages", type="filepath")
436
  num_pages = gr.Number(label="Number of Pages")
437
  destamp_img = gr.Image(type="numpy", label="De-stamped Image")
438
- json_data = gr.JSON(label="JSON Data")
 
439
 
440
 
441
  app = gr.Interface(fn=pipeline,
442
  inputs=pdf_file,
443
- outputs=[num_pages, destamp_img, json_data],
444
  title=title,
445
  description=description,
446
  examples=examples)
447
  app.queue()
448
- #app.launch(debug=True, share=True)
449
- app.launch(share=True)
 
109
  "role": "user",
110
  "content": [
111
  {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
 
112
  {
113
  "type": "text",
114
  "text": """
115
+ You are analyzing an Financial Statement in traditional Chinese.
116
+ Please extract all the information of the statement image, keep the context in Traditional Chinese without translation.
117
+ Extract information row by row, and cell by cell.
118
+ Keep document title, header, date, currency, section header, summary, footer, ... as part of the information.
119
+ OCR all the cells precisely with the best accuracy. Any Chinese character, if you can not make the best guess, please return "?". Do not ignore it.
120
+ Do not do any correction with the content of the cell related with "代碼", even it is not 100% correct from your experience. Keep as what it is.
121
+ Makd sure the length of the string of each cell is same as the image.
122
+ Save all the information as a markdown table.
123
+ Keep alignment of each column with the image.
124
+
125
+ Repsonse as below structure:
126
+ <mark>
127
+ ...
128
+ ...
129
+ ...
130
+ </mark>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  """
133
 
 
140
  # Update how the API is called
141
  response = client.messages.create(
142
  model=MODEL_NAME,
143
+ max_tokens=3072, # limit the amount of response information
144
  messages=message_list,
145
+ temperature=0.6,
146
  extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} # Changed to a dictionary
147
  )
148
  tokens = response.usage.output_tokens
149
  print(f"Generated Tokens: {tokens}")
150
+ print(f"Response: {response}")
151
  return response
152
 
153
  ## Check Response
 
164
  print("Unexpected response format. Unable to extract text.")
165
  return None
166
 
167
+ ## Extract markdown data
168
+ def extract_markdown(response):
169
+ response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
170
+ # Locate the start and end of the <mark>...</mark> block in the response text
171
+ # skip <mark>
172
+
173
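+ mark_start = response_text.find("<mark>")+6 # Skip the 6-character <mark> tag. NOTE(review): if "<mark>" is absent, find() returns -1 and mark_start becomes 5, so the ">= 0" check below cannot detect a missing opening tag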
174
+ mark_end = response_text.find("</mark>") # Index of the closing </mark> tag; the slice below stops before it
175
+ print(f"mark_start: {mark_start}")
176
+ print(f"mark_end: {mark_end}")
177
+ # Check if valid start and end indices were found
178
+ if mark_start >= 0 and mark_end > mark_start:
179
+ mark_data = response_text[mark_start:mark_end]
180
+ print(f"mark_data: {mark_data}")
181
+ return mark_data
182
+ else:
183
+ print("Could not find valid Markdown object in response.")
184
+ return
185
+
186
+
187
  ## Extract Json data
188
  def extract_json(response):
189
  response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
 
218
  response = {}
219
  response = extract_table_info(destamp_img)
220
  check_response(response)
221
+ mark_data = extract_markdown(response)
222
+ #json_data = extract_json(response)
223
+ return len(pages), destamp_img, mark_data
224
 
225
  ## Gradio Interface
226
  title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
227
  description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
228
+ examples = [['text_pdf.pdf'], ['image_pdf.pdf']]
229
 
230
  pdf_file = gr.File(label="Upload PDF", type="filepath")
231
  pages = gr.File(label="Pages", type="filepath")
232
  num_pages = gr.Number(label="Number of Pages")
233
  destamp_img = gr.Image(type="numpy", label="De-stamped Image")
234
+ #json_data = gr.JSON(label="JSON Data")
235
+ mark_data = gr.Markdown(label="Markdown Data")
236
 
237
 
238
  app = gr.Interface(fn=pipeline,
239
  inputs=pdf_file,
240
+ outputs=[num_pages, destamp_img, mark_data],
241
  title=title,
242
  description=description,
243
  examples=examples)
244
  app.queue()
245
+ app.launch(debug=True, share=True)
246
+ #app.launch()