Spaces:
Runtime error
Runtime error
Update app.py
Browse files- Simplify message_list
- Output as markdown
- limit output token
app.py
CHANGED
@@ -109,250 +109,25 @@ def extract_table_info(image_path):
|
|
109 |
"role": "user",
|
110 |
"content": [
|
111 |
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
|
112 |
-
#{"type": "text", "text": "Please extract the table information of the image, keep the context in Traditional Chinese without translation, all the alphanumeric chararacter exressed in string, give me a json dictionary of the information extracted"},
|
113 |
{
|
114 |
"type": "text",
|
115 |
"text": """
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
Keep
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
{"表頭名稱": },
|
133 |
-
{"報表日期": },
|
134 |
-
{"幣別": },
|
135 |
-
...
|
136 |
-
...
|
137 |
-
...
|
138 |
-
],
|
139 |
-
"table detail": [
|
140 |
-
{
|
141 |
-
...
|
142 |
-
...
|
143 |
-
...
|
144 |
-
},
|
145 |
-
{
|
146 |
-
...
|
147 |
-
...
|
148 |
-
...
|
149 |
-
},
|
150 |
-
...
|
151 |
-
...
|
152 |
-
...
|
153 |
-
]
|
154 |
-
}
|
155 |
-
|
156 |
-
</json>
|
157 |
-
|
158 |
-
|
159 |
-
Output the json structure as a string starting with <json> and ending with </json> XML tags.
|
160 |
-
Do not return any narrative language. Look at the images in detail.
|
161 |
-
Do not insert and control code, like line feed, tab indent: "\n"
|
162 |
-
|
163 |
-
IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.
|
164 |
-
|
165 |
-
Example:
|
166 |
-
|
167 |
-
<json>
|
168 |
-
{
|
169 |
-
"table meta": [
|
170 |
-
{"企業名稱": "台灣水泥股份有限公司"},
|
171 |
-
{"表頭名稱": "個體資產負債表"},
|
172 |
-
{"報表日期": "民國 112 年及 111 年 12 月 31 日"},
|
173 |
-
{"幣別": "新台幣仟元"},
|
174 |
-
...
|
175 |
-
...
|
176 |
-
...
|
177 |
-
|
178 |
-
],
|
179 |
-
"table detail": [
|
180 |
-
{
|
181 |
-
"資產": [
|
182 |
-
{ "流動資產":
|
183 |
-
[
|
184 |
-
{
|
185 |
-
"代碼": "1100",
|
186 |
-
"項目": "現金及約當現金(附註四及六)",
|
187 |
-
"112年12月31日金額": "1,516,633",
|
188 |
-
"112年12月31日%": "-",
|
189 |
-
"111年12月31日金額": "4,243,295",
|
190 |
-
"111年12月31日%": "1"
|
191 |
-
},
|
192 |
-
{
|
193 |
-
"代碼": "1110",
|
194 |
-
"項目": "透過損益按公允價值衡量之金融資產(附註四、七及二六)",
|
195 |
-
"112年12月31日金額": "341,056",
|
196 |
-
"112年12月31日%": "-",
|
197 |
-
"111年12月31日金額": "259,919",
|
198 |
-
"111年12月31日%": "-"
|
199 |
-
},
|
200 |
-
{
|
201 |
-
"代碼": "1120",
|
202 |
-
"項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
|
203 |
-
"112年12月31日金額": "4,333,594",
|
204 |
-
"112年12月31日%": "1",
|
205 |
-
"111年12月31日金額": "3,607,819",
|
206 |
-
"111年12月31日%": "1"
|
207 |
-
},
|
208 |
-
{
|
209 |
-
"代碼": "1150",
|
210 |
-
"項目": "應收票據及帳款淨額(附註四及九)",
|
211 |
-
"112年12月31日金額": "5,801,135",
|
212 |
-
"112年12月31日%": "2",
|
213 |
-
"111年12月31日金額": "5,319,368",
|
214 |
-
"111年12月31日%": "1"
|
215 |
-
},
|
216 |
-
{
|
217 |
-
"代碼": "1180",
|
218 |
-
"項目": "應收票據及帳款-關係人(附註四及二七)",
|
219 |
-
"112年12月31日金額": "572,118",
|
220 |
-
"112年12月31日%": "-",
|
221 |
-
"111年12月31日金額": "681,793",
|
222 |
-
"111年12月31日%": "-"
|
223 |
-
},
|
224 |
-
{
|
225 |
-
"代碼": "130X",
|
226 |
-
"項目": "存貨(附註四及十)",
|
227 |
-
"112年12月31日金額": "1,782,735",
|
228 |
-
"112年12月31日%": "1",
|
229 |
-
"111年12月31日金額": "2,321,850",
|
230 |
-
"111年12月31日%": "1"
|
231 |
-
},
|
232 |
-
{
|
233 |
-
"代碼": "1470",
|
234 |
-
"項目": "其他流動資產(附註二一及二七)",
|
235 |
-
"112年12月31日金額": "411,540",
|
236 |
-
"112年12月31日%": "-",
|
237 |
-
"111年12月31日金額": "248,683",
|
238 |
-
"111年12月31日%": "-"
|
239 |
-
},
|
240 |
-
{
|
241 |
-
"代碼": "11XX",
|
242 |
-
"項目": "流動資產總計",
|
243 |
-
"112年12月31日金額": "14,758,811",
|
244 |
-
"112年12月31日%": "4",
|
245 |
-
"111年12月31日金額": "16,682,727",
|
246 |
-
"111年12月31日%": "4"
|
247 |
-
}
|
248 |
-
]
|
249 |
-
},
|
250 |
-
{
|
251 |
-
"非流動資產": [
|
252 |
-
{
|
253 |
-
"代碼": "1517",
|
254 |
-
"項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
|
255 |
-
"112年12月31日金額": "9,638,255",
|
256 |
-
"112年12月31日%": "3",
|
257 |
-
"111年12月31日金額": "7,633,603",
|
258 |
-
"111年12月31日%": "2"
|
259 |
-
},
|
260 |
-
{
|
261 |
-
"代碼": "1550",
|
262 |
-
"項目": "採用權益法之投資(附註四、五及十一)",
|
263 |
-
"112年12月31日金額": "312,351,291",
|
264 |
-
"112年12月31日%": "82",
|
265 |
-
"111年12月31日金額": "307,101,709",
|
266 |
-
"111年12月31日%": "82"
|
267 |
-
},
|
268 |
-
{
|
269 |
-
"代碼": "1600",
|
270 |
-
"項目": "不動產、廠房及設備(附註四、五、十二、十三及二八)",
|
271 |
-
"112年12月31日金額": "28,052,603",
|
272 |
-
"112年12月31日%": "7",
|
273 |
-
"111年12月31日金額": "35,583,596",
|
274 |
-
"111年12月31日%": "10"
|
275 |
-
},
|
276 |
-
{
|
277 |
-
"代碼": "1755",
|
278 |
-
"項目": "使用權資產(附註四、十五、二十、二七)",
|
279 |
-
"112年12月31日金額": "1,797,820",
|
280 |
-
"112年12月31日%": "1",
|
281 |
-
"111年12月31日金額": "1,788,972",
|
282 |
-
"111年12月31日%": "1"
|
283 |
-
},
|
284 |
-
{
|
285 |
-
"代碼": "1760",
|
286 |
-
"項目": "投資性不動產(附註四、十四及二十)",
|
287 |
-
"112年12月31日金額": "13,042,677",
|
288 |
-
"112年12月31日%": "3",
|
289 |
-
"111年12月31日金額": "2,436,675",
|
290 |
-
"111年12月31日%": "-"
|
291 |
-
},
|
292 |
-
{
|
293 |
-
"代碼": "1821",
|
294 |
-
"項目": "無形資產(附註四及二十)",
|
295 |
-
"112年12月31日金額": "58,840",
|
296 |
-
"112年12月31日%": "-",
|
297 |
-
"111年12月31日金額": "64,956",
|
298 |
-
"111年12月31日%": "-"
|
299 |
-
},
|
300 |
-
{
|
301 |
-
"代碼": "1915",
|
302 |
-
"項目": "預付設備款",
|
303 |
-
"112年12月31日金額": "600,042",
|
304 |
-
"112年12月31日%": "-",
|
305 |
-
"111年12月31日金額": "682,765",
|
306 |
-
"111年12月31日%": "-"
|
307 |
-
},
|
308 |
-
{
|
309 |
-
"代碼": "1975",
|
310 |
-
"項目": "淨確定福利資產(附註四及十八)",
|
311 |
-
"112年12月31日金額": "1,507,153",
|
312 |
-
"112年12月31日%": "-",
|
313 |
-
"111年12月31日金額": "1,526,546",
|
314 |
-
"111年12月31日%": "-"
|
315 |
-
},
|
316 |
-
{
|
317 |
-
"代碼": "1990",
|
318 |
-
"項目": "其他非流動資產(附註四、六、二一及二八)",
|
319 |
-
"112年12月31日金額": "827,628",
|
320 |
-
"112年12月31日%": "-",
|
321 |
-
"111年12月31日金額": "840,688",
|
322 |
-
"111年12月31日%": "1"
|
323 |
-
},
|
324 |
-
{
|
325 |
-
"代碼": "15XX",
|
326 |
-
"項目": "非流動資產總計",
|
327 |
-
"112年12月31日金額": "367,876,309",
|
328 |
-
"112年12月31日%": "96",
|
329 |
-
"111年12月31日金額": "357,659,510",
|
330 |
-
"111年12月31日%": "96"
|
331 |
-
}]
|
332 |
-
},
|
333 |
-
{
|
334 |
-
"代碼": "1XXX",
|
335 |
-
"項目": "資產總計",
|
336 |
-
"112年12月31日金額": "382,635,120",
|
337 |
-
"112年12月31日%": "100",
|
338 |
-
"111年12月31日金額": "374,342,237",
|
339 |
-
"111年12月31日%": "100"
|
340 |
-
}
|
341 |
-
]
|
342 |
-
},
|
343 |
-
{
|
344 |
-
"負債": [
|
345 |
-
...
|
346 |
-
...
|
347 |
-
...
|
348 |
-
]
|
349 |
-
},
|
350 |
-
...
|
351 |
-
...
|
352 |
-
...
|
353 |
-
]
|
354 |
-
}
|
355 |
-
</json>
|
356 |
|
357 |
"""
|
358 |
|
@@ -365,14 +140,14 @@ def extract_table_info(image_path):
|
|
365 |
# Update how the API is called
|
366 |
response = client.messages.create(
|
367 |
model=MODEL_NAME,
|
368 |
-
max_tokens=
|
369 |
messages=message_list,
|
370 |
-
temperature=0.
|
371 |
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} # Changed to a dictionary
|
372 |
)
|
373 |
tokens = response.usage.output_tokens
|
374 |
print(f"Generated Tokens: {tokens}")
|
375 |
-
|
376 |
return response
|
377 |
|
378 |
## Check Response
|
@@ -389,6 +164,26 @@ def check_response(response):
|
|
389 |
print("Unexpected response format. Unable to extract text.")
|
390 |
return None
|
391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
## Extract Json data
|
393 |
def extract_json(response):
|
394 |
response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
|
@@ -423,27 +218,29 @@ def pipeline(pdf_path):
|
|
423 |
response = {}
|
424 |
response = extract_table_info(destamp_img)
|
425 |
check_response(response)
|
426 |
-
|
427 |
-
|
|
|
428 |
|
429 |
## Gradio Interface
|
430 |
title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
|
431 |
description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
|
432 |
-
examples = ['text_pdf.pdf', 'image_pdf.pdf']
|
433 |
|
434 |
pdf_file = gr.File(label="Upload PDF", type="filepath")
|
435 |
pages = gr.File(label="Pages", type="filepath")
|
436 |
num_pages = gr.Number(label="Number of Pages")
|
437 |
destamp_img = gr.Image(type="numpy", label="De-stamped Image")
|
438 |
-
json_data = gr.JSON(label="JSON Data")
|
|
|
439 |
|
440 |
|
441 |
app = gr.Interface(fn=pipeline,
|
442 |
inputs=pdf_file,
|
443 |
-
outputs=[num_pages, destamp_img,
|
444 |
title=title,
|
445 |
description=description,
|
446 |
examples=examples)
|
447 |
app.queue()
|
448 |
-
|
449 |
-
app.launch(
|
|
|
109 |
"role": "user",
|
110 |
"content": [
|
111 |
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
|
|
|
112 |
{
|
113 |
"type": "text",
|
114 |
"text": """
|
115 |
+
You are analyzing an Financial Statement in traditional Chinese.
|
116 |
+
Please extract all the information of the statement image, keep the context in Traditional Chinese without translation.
|
117 |
+
Extract information row by row, and cell by cell.
|
118 |
+
Keep document title, header, date, currency, section header, summary, footer, ... as part of the information.
|
119 |
+
OCR all the cells precisely with the best accuracy. Any Chinese character, if you can not make the best guess, please return "?". Do not ignore it.
|
120 |
+
Do not do any correction with the content of the cell related with "代碼", even it is not 100% correct from your experience. Keep as what it is.
|
121 |
+
Makd sure the length of the string of each cell is same as the image.
|
122 |
+
Save all the information as a markdown table.
|
123 |
+
Keep alignment of each column with the image.
|
124 |
+
|
125 |
+
Repsonse as below structure:
|
126 |
+
<mark>
|
127 |
+
...
|
128 |
+
...
|
129 |
+
...
|
130 |
+
</mark>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
"""
|
133 |
|
|
|
140 |
# Update how the API is called
|
141 |
response = client.messages.create(
|
142 |
model=MODEL_NAME,
|
143 |
+
max_tokens=3072, # limit the amount of response information
|
144 |
messages=message_list,
|
145 |
+
temperature=0.6,
|
146 |
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} # Changed to a dictionary
|
147 |
)
|
148 |
tokens = response.usage.output_tokens
|
149 |
print(f"Generated Tokens: {tokens}")
|
150 |
+
print(f"Response: {response}")
|
151 |
return response
|
152 |
|
153 |
## Check Response
|
|
|
164 |
print("Unexpected response format. Unable to extract text.")
|
165 |
return None
|
166 |
|
167 |
+
## Extract markdown data
|
168 |
+
def extract_markdown(response):
    """Return the text enclosed by the first ``<mark>...</mark>`` pair.

    The prompt asks the model to wrap its markdown table in ``<mark>`` tags;
    this pulls that payload out of the first content block of the response.

    Args:
        response: An object whose ``content[0].text`` attribute holds the
            model's reply as a string.

    Returns:
        The non-empty text between ``<mark>`` and the following ``</mark>``,
        or ``None`` when either tag is missing or nothing lies between them
        (for example when the reply was cut off before the closing tag).
    """
    open_tag = "<mark>"
    close_tag = "</mark>"

    response_text = response.content[0].text  # 'text' attribute of the first content block

    # Test the raw find() result *before* adding the tag length: find()
    # returns -1 when the tag is absent, and -1 + len(open_tag) would
    # otherwise look like a valid positive offset.
    open_pos = response_text.find(open_tag)
    mark_start = open_pos + len(open_tag) if open_pos != -1 else -1

    # Only a closing tag that comes after the opening tag can delimit the payload.
    mark_end = response_text.find(close_tag, mark_start) if mark_start != -1 else -1

    print(f"mark_start: {mark_start}")
    print(f"mark_end: {mark_end}")

    # Both tags must be present and enclose at least one character.
    if mark_start == -1 or mark_end <= mark_start:
        print("Could not find valid Markdown object in response.")
        return None

    mark_data = response_text[mark_start:mark_end]
    print(f"mark_data: {mark_data}")
    return mark_data
|
185 |
+
|
186 |
+
|
187 |
## Extract Json data
|
188 |
def extract_json(response):
|
189 |
response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
|
|
|
218 |
response = {}
|
219 |
response = extract_table_info(destamp_img)
|
220 |
check_response(response)
|
221 |
+
mark_data = extract_markdown(response)
|
222 |
+
#json_data = extract_json(response)
|
223 |
+
return len(pages), destamp_img, mark_data
|
224 |
|
225 |
## Gradio Interface
|
226 |
title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
|
227 |
description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
|
228 |
+
examples = [['text_pdf.pdf'], ['image_pdf.pdf']]
|
229 |
|
230 |
pdf_file = gr.File(label="Upload PDF", type="filepath")
|
231 |
pages = gr.File(label="Pages", type="filepath")
|
232 |
num_pages = gr.Number(label="Number of Pages")
|
233 |
destamp_img = gr.Image(type="numpy", label="De-stamped Image")
|
234 |
+
#json_data = gr.JSON(label="JSON Data")
|
235 |
+
mark_data = gr.Markdown(label="Markdown Data")
|
236 |
|
237 |
|
238 |
app = gr.Interface(fn=pipeline,
|
239 |
inputs=pdf_file,
|
240 |
+
outputs=[num_pages, destamp_img, mark_data],
|
241 |
title=title,
|
242 |
description=description,
|
243 |
examples=examples)
|
244 |
app.queue()
|
245 |
+
app.launch(debug=True, share=True)
|
246 |
+
#app.launch()
|