Spaces:
Runtime error
Runtime error
Update app.py
Browse files
- Simplify message_list
- Output as Markdown
- Limit output tokens
app.py
CHANGED
|
@@ -109,250 +109,25 @@ def extract_table_info(image_path):
|
|
| 109 |
"role": "user",
|
| 110 |
"content": [
|
| 111 |
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
|
| 112 |
-
#{"type": "text", "text": "Please extract the table information of the image, keep the context in Traditional Chinese without translation, all the alphanumeric chararacter exressed in string, give me a json dictionary of the information extracted"},
|
| 113 |
{
|
| 114 |
"type": "text",
|
| 115 |
"text": """
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
Keep
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
{"表頭名稱": },
|
| 133 |
-
{"報表日期": },
|
| 134 |
-
{"幣別": },
|
| 135 |
-
...
|
| 136 |
-
...
|
| 137 |
-
...
|
| 138 |
-
],
|
| 139 |
-
"table detail": [
|
| 140 |
-
{
|
| 141 |
-
...
|
| 142 |
-
...
|
| 143 |
-
...
|
| 144 |
-
},
|
| 145 |
-
{
|
| 146 |
-
...
|
| 147 |
-
...
|
| 148 |
-
...
|
| 149 |
-
},
|
| 150 |
-
...
|
| 151 |
-
...
|
| 152 |
-
...
|
| 153 |
-
]
|
| 154 |
-
}
|
| 155 |
-
|
| 156 |
-
</json>
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
Output the json structure as a string starting with <json> and ending with </json> XML tags.
|
| 160 |
-
Do not return any narrative language. Look at the images in detail.
|
| 161 |
-
Do not insert and control code, like line feed, tab indent: "\n"
|
| 162 |
-
|
| 163 |
-
IF YOU COULD NOT FIND THE RIGHT INFORMATION JUST RETURN THIS VALUE “UNK”.
|
| 164 |
-
|
| 165 |
-
Example:
|
| 166 |
-
|
| 167 |
-
<json>
|
| 168 |
-
{
|
| 169 |
-
"table meta": [
|
| 170 |
-
{"企業名稱": "台灣水泥股份有限公司"},
|
| 171 |
-
{"表頭名稱": "個體資產負債表"},
|
| 172 |
-
{"報表日期": "民國 112 年及 111 年 12 月 31 日"},
|
| 173 |
-
{"幣別": "新台幣仟元"},
|
| 174 |
-
...
|
| 175 |
-
...
|
| 176 |
-
...
|
| 177 |
-
|
| 178 |
-
],
|
| 179 |
-
"table detail": [
|
| 180 |
-
{
|
| 181 |
-
"資產": [
|
| 182 |
-
{ "流動資產":
|
| 183 |
-
[
|
| 184 |
-
{
|
| 185 |
-
"代碼": "1100",
|
| 186 |
-
"項目": "現金及約當現金(附註四及六)",
|
| 187 |
-
"112年12月31日金額": "1,516,633",
|
| 188 |
-
"112年12月31日%": "-",
|
| 189 |
-
"111年12月31日金額": "4,243,295",
|
| 190 |
-
"111年12月31日%": "1"
|
| 191 |
-
},
|
| 192 |
-
{
|
| 193 |
-
"代碼": "1110",
|
| 194 |
-
"項目": "透過損益按公允價值衡量之金融資產(附註四、七及二六)",
|
| 195 |
-
"112年12月31日金額": "341,056",
|
| 196 |
-
"112年12月31日%": "-",
|
| 197 |
-
"111年12月31日金額": "259,919",
|
| 198 |
-
"111年12月31日%": "-"
|
| 199 |
-
},
|
| 200 |
-
{
|
| 201 |
-
"代碼": "1120",
|
| 202 |
-
"項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
|
| 203 |
-
"112年12月31日金額": "4,333,594",
|
| 204 |
-
"112年12月31日%": "1",
|
| 205 |
-
"111年12月31日金額": "3,607,819",
|
| 206 |
-
"111年12月31日%": "1"
|
| 207 |
-
},
|
| 208 |
-
{
|
| 209 |
-
"代碼": "1150",
|
| 210 |
-
"項目": "應收票據及帳款淨額(附註四及九)",
|
| 211 |
-
"112年12月31日金額": "5,801,135",
|
| 212 |
-
"112年12月31日%": "2",
|
| 213 |
-
"111年12月31日金額": "5,319,368",
|
| 214 |
-
"111年12月31日%": "1"
|
| 215 |
-
},
|
| 216 |
-
{
|
| 217 |
-
"代碼": "1180",
|
| 218 |
-
"項目": "應收票據及帳款-關係人(附註四及二七)",
|
| 219 |
-
"112年12月31日金額": "572,118",
|
| 220 |
-
"112年12月31日%": "-",
|
| 221 |
-
"111年12月31日金額": "681,793",
|
| 222 |
-
"111年12月31日%": "-"
|
| 223 |
-
},
|
| 224 |
-
{
|
| 225 |
-
"代碼": "130X",
|
| 226 |
-
"項目": "存貨(附註四及十)",
|
| 227 |
-
"112年12月31日金額": "1,782,735",
|
| 228 |
-
"112年12月31日%": "1",
|
| 229 |
-
"111年12月31日金額": "2,321,850",
|
| 230 |
-
"111年12月31日%": "1"
|
| 231 |
-
},
|
| 232 |
-
{
|
| 233 |
-
"代碼": "1470",
|
| 234 |
-
"項目": "其他流動資產(附註二一及二七)",
|
| 235 |
-
"112年12月31日金額": "411,540",
|
| 236 |
-
"112年12月31日%": "-",
|
| 237 |
-
"111年12月31日金額": "248,683",
|
| 238 |
-
"111年12月31日%": "-"
|
| 239 |
-
},
|
| 240 |
-
{
|
| 241 |
-
"代碼": "11XX",
|
| 242 |
-
"項目": "流動資產總計",
|
| 243 |
-
"112年12月31日金額": "14,758,811",
|
| 244 |
-
"112年12月31日%": "4",
|
| 245 |
-
"111年12月31日金額": "16,682,727",
|
| 246 |
-
"111年12月31日%": "4"
|
| 247 |
-
}
|
| 248 |
-
]
|
| 249 |
-
},
|
| 250 |
-
{
|
| 251 |
-
"非流動資產": [
|
| 252 |
-
{
|
| 253 |
-
"代碼": "1517",
|
| 254 |
-
"項目": "透過其他綜合損益按公允價值衡量之金融資產(附註四、八及二六)",
|
| 255 |
-
"112年12月31日金額": "9,638,255",
|
| 256 |
-
"112年12月31日%": "3",
|
| 257 |
-
"111年12月31日金額": "7,633,603",
|
| 258 |
-
"111年12月31日%": "2"
|
| 259 |
-
},
|
| 260 |
-
{
|
| 261 |
-
"代碼": "1550",
|
| 262 |
-
"項目": "採用權益法之投資(附註四、五及十一)",
|
| 263 |
-
"112年12月31日金額": "312,351,291",
|
| 264 |
-
"112年12月31日%": "82",
|
| 265 |
-
"111年12月31日金額": "307,101,709",
|
| 266 |
-
"111年12月31日%": "82"
|
| 267 |
-
},
|
| 268 |
-
{
|
| 269 |
-
"代碼": "1600",
|
| 270 |
-
"項目": "不動產、廠房及設備(附註四、五、十二、十三及二八)",
|
| 271 |
-
"112年12月31日金額": "28,052,603",
|
| 272 |
-
"112年12月31日%": "7",
|
| 273 |
-
"111年12月31日金額": "35,583,596",
|
| 274 |
-
"111年12月31日%": "10"
|
| 275 |
-
},
|
| 276 |
-
{
|
| 277 |
-
"代碼": "1755",
|
| 278 |
-
"項目": "使用權資產(附註四、十五、二十、二七)",
|
| 279 |
-
"112年12月31日金額": "1,797,820",
|
| 280 |
-
"112年12月31日%": "1",
|
| 281 |
-
"111年12月31日金額": "1,788,972",
|
| 282 |
-
"111年12月31日%": "1"
|
| 283 |
-
},
|
| 284 |
-
{
|
| 285 |
-
"代碼": "1760",
|
| 286 |
-
"項目": "投資性不動產(附註四、十四及二十)",
|
| 287 |
-
"112年12月31日金額": "13,042,677",
|
| 288 |
-
"112年12月31日%": "3",
|
| 289 |
-
"111年12月31日金額": "2,436,675",
|
| 290 |
-
"111年12月31日%": "-"
|
| 291 |
-
},
|
| 292 |
-
{
|
| 293 |
-
"代碼": "1821",
|
| 294 |
-
"項目": "無形資產(附註四及二十)",
|
| 295 |
-
"112年12月31日金額": "58,840",
|
| 296 |
-
"112年12月31日%": "-",
|
| 297 |
-
"111年12月31日金額": "64,956",
|
| 298 |
-
"111年12月31日%": "-"
|
| 299 |
-
},
|
| 300 |
-
{
|
| 301 |
-
"代碼": "1915",
|
| 302 |
-
"項目": "預付設備款",
|
| 303 |
-
"112年12月31日金額": "600,042",
|
| 304 |
-
"112年12月31日%": "-",
|
| 305 |
-
"111年12月31日金額": "682,765",
|
| 306 |
-
"111年12月31日%": "-"
|
| 307 |
-
},
|
| 308 |
-
{
|
| 309 |
-
"代碼": "1975",
|
| 310 |
-
"項目": "淨確定福利資產(附註四及十八)",
|
| 311 |
-
"112年12月31日金額": "1,507,153",
|
| 312 |
-
"112年12月31日%": "-",
|
| 313 |
-
"111年12月31日金額": "1,526,546",
|
| 314 |
-
"111年12月31日%": "-"
|
| 315 |
-
},
|
| 316 |
-
{
|
| 317 |
-
"代碼": "1990",
|
| 318 |
-
"項目": "其他非流動資產(附註四、六、二一及二八)",
|
| 319 |
-
"112年12月31日金額": "827,628",
|
| 320 |
-
"112年12月31日%": "-",
|
| 321 |
-
"111年12月31日金額": "840,688",
|
| 322 |
-
"111年12月31日%": "1"
|
| 323 |
-
},
|
| 324 |
-
{
|
| 325 |
-
"代碼": "15XX",
|
| 326 |
-
"項目": "非流動資產總計",
|
| 327 |
-
"112年12月31日金額": "367,876,309",
|
| 328 |
-
"112年12月31日%": "96",
|
| 329 |
-
"111年12月31日金額": "357,659,510",
|
| 330 |
-
"111年12月31日%": "96"
|
| 331 |
-
}]
|
| 332 |
-
},
|
| 333 |
-
{
|
| 334 |
-
"代碼": "1XXX",
|
| 335 |
-
"項目": "資產總計",
|
| 336 |
-
"112年12月31日金額": "382,635,120",
|
| 337 |
-
"112年12月31日%": "100",
|
| 338 |
-
"111年12月31日金額": "374,342,237",
|
| 339 |
-
"111年12月31日%": "100"
|
| 340 |
-
}
|
| 341 |
-
]
|
| 342 |
-
},
|
| 343 |
-
{
|
| 344 |
-
"負債": [
|
| 345 |
-
...
|
| 346 |
-
...
|
| 347 |
-
...
|
| 348 |
-
]
|
| 349 |
-
},
|
| 350 |
-
...
|
| 351 |
-
...
|
| 352 |
-
...
|
| 353 |
-
]
|
| 354 |
-
}
|
| 355 |
-
</json>
|
| 356 |
|
| 357 |
"""
|
| 358 |
|
|
@@ -365,14 +140,14 @@ def extract_table_info(image_path):
|
|
| 365 |
# Update how the API is called
|
| 366 |
response = client.messages.create(
|
| 367 |
model=MODEL_NAME,
|
| 368 |
-
max_tokens=
|
| 369 |
messages=message_list,
|
| 370 |
-
temperature=0.
|
| 371 |
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} # Changed to a dictionary
|
| 372 |
)
|
| 373 |
tokens = response.usage.output_tokens
|
| 374 |
print(f"Generated Tokens: {tokens}")
|
| 375 |
-
|
| 376 |
return response
|
| 377 |
|
| 378 |
## Check Response
|
|
@@ -389,6 +164,26 @@ def check_response(response):
|
|
| 389 |
print("Unexpected response format. Unable to extract text.")
|
| 390 |
return None
|
| 391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
## Extract Json data
|
| 393 |
def extract_json(response):
|
| 394 |
response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
|
|
@@ -423,27 +218,29 @@ def pipeline(pdf_path):
|
|
| 423 |
response = {}
|
| 424 |
response = extract_table_info(destamp_img)
|
| 425 |
check_response(response)
|
| 426 |
-
|
| 427 |
-
|
|
|
|
| 428 |
|
| 429 |
## Gradio Interface
|
| 430 |
title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
|
| 431 |
description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
|
| 432 |
-
examples = ['text_pdf.pdf', 'image_pdf.pdf']
|
| 433 |
|
| 434 |
pdf_file = gr.File(label="Upload PDF", type="filepath")
|
| 435 |
pages = gr.File(label="Pages", type="filepath")
|
| 436 |
num_pages = gr.Number(label="Number of Pages")
|
| 437 |
destamp_img = gr.Image(type="numpy", label="De-stamped Image")
|
| 438 |
-
json_data = gr.JSON(label="JSON Data")
|
|
|
|
| 439 |
|
| 440 |
|
| 441 |
app = gr.Interface(fn=pipeline,
|
| 442 |
inputs=pdf_file,
|
| 443 |
-
outputs=[num_pages, destamp_img,
|
| 444 |
title=title,
|
| 445 |
description=description,
|
| 446 |
examples=examples)
|
| 447 |
app.queue()
|
| 448 |
-
|
| 449 |
-
app.launch(
|
|
|
|
| 109 |
"role": "user",
|
| 110 |
"content": [
|
| 111 |
{"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": get_base64_encorded_image(image_path)}},
|
|
|
|
| 112 |
{
|
| 113 |
"type": "text",
|
| 114 |
"text": """
|
| 115 |
+
You are analyzing an Financial Statement in traditional Chinese.
|
| 116 |
+
Please extract all the information of the statement image, keep the context in Traditional Chinese without translation.
|
| 117 |
+
Extract information row by row, and cell by cell.
|
| 118 |
+
Keep document title, header, date, currency, section header, summary, footer, ... as part of the information.
|
| 119 |
+
OCR all the cells precisely with the best accuracy. Any Chinese character, if you can not make the best guess, please return "?". Do not ignore it.
|
| 120 |
+
Do not do any correction with the content of the cell related with "代碼", even it is not 100% correct from your experience. Keep as what it is.
|
| 121 |
+
Makd sure the length of the string of each cell is same as the image.
|
| 122 |
+
Save all the information as a markdown table.
|
| 123 |
+
Keep alignment of each column with the image.
|
| 124 |
+
|
| 125 |
+
Repsonse as below structure:
|
| 126 |
+
<mark>
|
| 127 |
+
...
|
| 128 |
+
...
|
| 129 |
+
...
|
| 130 |
+
</mark>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
"""
|
| 133 |
|
|
|
|
| 140 |
# Update how the API is called
|
| 141 |
response = client.messages.create(
|
| 142 |
model=MODEL_NAME,
|
| 143 |
+
max_tokens=3072, # limit the amount of response information
|
| 144 |
messages=message_list,
|
| 145 |
+
temperature=0.6,
|
| 146 |
extra_headers={"anthropic-beta": "max-tokens-3-5-sonnet-2024-07-15"} # Changed to a dictionary
|
| 147 |
)
|
| 148 |
tokens = response.usage.output_tokens
|
| 149 |
print(f"Generated Tokens: {tokens}")
|
| 150 |
+
print(f"Response: {response}")
|
| 151 |
return response
|
| 152 |
|
| 153 |
## Check Response
|
|
|
|
| 164 |
print("Unexpected response format. Unable to extract text.")
|
| 165 |
return None
|
| 166 |
|
| 167 |
+
## Extract markdown data
|
| 168 |
+
def extract_markdown(response):
    """Return the text enclosed by ``<mark>`` and ``</mark>`` in a response.

    Args:
        response: Object whose ``content[0].text`` attribute holds the
            reply text (read exactly that way below).

    Returns:
        The substring strictly between the first ``<mark>`` tag and the
        first ``</mark>`` tag that follows it, or ``None`` when the opening
        tag is missing, the closing tag is missing, or nothing lies
        between the two tags.
    """
    response_text = response.content[0].text  # Access the 'text' attribute of the TextBlock object
    open_tag = "<mark>"
    close_tag = "</mark>"

    # str.find returns -1 when the tag is absent; test that *before* adding
    # the tag length, otherwise the "not found" case looks like index 5.
    tag_pos = response_text.find(open_tag)
    if tag_pos < 0:
        print("Could not find valid Markdown object in response.")
        return None

    mark_start = tag_pos + len(open_tag)  # first character after <mark>
    # Only a closing tag located after the opening tag can delimit the payload.
    mark_end = response_text.find(close_tag, mark_start)
    print(f"mark_start: {mark_start}")
    print(f"mark_end: {mark_end}")

    # A missing closing tag (-1) or an empty payload is treated as "no data".
    if mark_end > mark_start:
        mark_data = response_text[mark_start:mark_end]
        print(f"mark_data: {mark_data}")
        return mark_data

    print("Could not find valid Markdown object in response.")
    return None
|
| 185 |
+
|
| 186 |
+
|
| 187 |
## Extract Json data
|
| 188 |
def extract_json(response):
|
| 189 |
response_text = response.content[0].text # Access the 'text' attribute of the TextBlock object
|
|
|
|
| 218 |
response = {}
|
| 219 |
response = extract_table_info(destamp_img)
|
| 220 |
check_response(response)
|
| 221 |
+
mark_data = extract_markdown(response)
|
| 222 |
+
#json_data = extract_json(response)
|
| 223 |
+
return len(pages), destamp_img, mark_data
|
| 224 |
|
| 225 |
## Gradio Interface
|
| 226 |
title = "Demo: Financial Statement(PDF) information Extraction - Traditional Chinese"
|
| 227 |
description = """Demo pdf, either editable or scanned image, information extraction for Traditional Chinese without OCR"""
|
| 228 |
+
examples = [['text_pdf.pdf'], ['image_pdf.pdf']]
|
| 229 |
|
| 230 |
pdf_file = gr.File(label="Upload PDF", type="filepath")
|
| 231 |
pages = gr.File(label="Pages", type="filepath")
|
| 232 |
num_pages = gr.Number(label="Number of Pages")
|
| 233 |
destamp_img = gr.Image(type="numpy", label="De-stamped Image")
|
| 234 |
+
#json_data = gr.JSON(label="JSON Data")
|
| 235 |
+
mark_data = gr.Markdown(label="Markdown Data")
|
| 236 |
|
| 237 |
|
| 238 |
app = gr.Interface(fn=pipeline,
|
| 239 |
inputs=pdf_file,
|
| 240 |
+
outputs=[num_pages, destamp_img, mark_data],
|
| 241 |
title=title,
|
| 242 |
description=description,
|
| 243 |
examples=examples)
|
| 244 |
app.queue()
|
| 245 |
+
app.launch(debug=True, share=True)
|
| 246 |
+
#app.launch()
|