Spaces:
Sleeping
Sleeping
updated model to extract bank_name and cheque_date
Browse files- predict_cheque_parser.py +25 -23
predict_cheque_parser.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
| 1 |
from transformers import DonutProcessor, VisionEncoderDecoderModel
|
|
|
|
|
|
|
| 2 |
from word2number import w2n
|
| 3 |
from dateutil import relativedelta
|
| 4 |
from datetime import datetime
|
| 5 |
from word2number import w2n
|
| 6 |
-
from textblob import Word
|
| 7 |
from PIL import Image
|
| 8 |
import torch
|
| 9 |
import re
|
| 10 |
|
| 11 |
-
CHEQUE_PARSER_MODEL = "shivi/donut-
|
| 12 |
-
TASK_PROMPT = "<
|
| 13 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 14 |
|
| 15 |
def load_donut_model_and_processor():
|
|
@@ -21,7 +22,6 @@ def load_donut_model_and_processor():
|
|
| 21 |
def prepare_data_using_processor(donut_processor,image_path):
|
| 22 |
## Pass image through donut processor's feature extractor and retrieve image tensor
|
| 23 |
image = load_image(image_path)
|
| 24 |
-
print("type image:", type(image))
|
| 25 |
pixel_values = donut_processor(image, return_tensors="pt").pixel_values
|
| 26 |
pixel_values = pixel_values.to(device)
|
| 27 |
|
|
@@ -70,28 +70,31 @@ def parse_cheque_with_donut(input_image_path):
|
|
| 70 |
|
| 71 |
payee_name = cheque_details_json['cheque_details'][2]['payee_name']
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
stale_cheque = check_if_cheque_is_stale(cheque_date)
|
| 77 |
|
| 78 |
-
return payee_name,amt_in_words,amt_in_figures,cheque_date,macthing_amts,stale_cheque
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
words = [word.lower() for word in words]
|
| 84 |
-
for word in words:
|
| 85 |
-
word = Word(word)
|
| 86 |
-
corrected_word = word.correct()+' '
|
| 87 |
-
corrected_amt_in_words += corrected_word
|
| 88 |
-
return corrected_amt_in_words
|
| 89 |
|
| 90 |
def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
|
| 91 |
macthing_amts = False
|
| 92 |
if len(legal_amount) == 0:
|
| 93 |
return macthing_amts
|
| 94 |
-
|
|
|
|
| 95 |
print("corrected_amt_in_words:",corrected_amt_in_words)
|
| 96 |
|
| 97 |
numeric_legal_amt = w2n.word_to_num(corrected_amt_in_words)
|
|
@@ -102,13 +105,12 @@ def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
|
|
| 102 |
|
| 103 |
def check_if_cheque_is_stale(cheque_issue_date):
|
| 104 |
stale_check = False
|
| 105 |
-
current_date = datetime.now().strftime('%d/%m/%
|
| 106 |
-
current_date_ = datetime.strptime(current_date, "%d/%m/%
|
| 107 |
-
cheque_issue_date_ = datetime.strptime(cheque_issue_date, "%d/%m/%
|
| 108 |
relative_diff = relativedelta.relativedelta(current_date_, cheque_issue_date_)
|
| 109 |
months_difference = (relative_diff.years * 12) + relative_diff.months
|
| 110 |
print("months_difference:",months_difference)
|
| 111 |
if months_difference > 3:
|
| 112 |
stale_check = True
|
| 113 |
-
return stale_check
|
| 114 |
-
|
|
|
|
| 1 |
from transformers import DonutProcessor, VisionEncoderDecoderModel
|
| 2 |
+
import pkg_resources
|
| 3 |
+
from symspellpy import SymSpell
|
| 4 |
from word2number import w2n
|
| 5 |
from dateutil import relativedelta
|
| 6 |
from datetime import datetime
|
| 7 |
from word2number import w2n
|
|
|
|
| 8 |
from PIL import Image
|
| 9 |
import torch
|
| 10 |
import re
|
| 11 |
|
| 12 |
+
CHEQUE_PARSER_MODEL = "shivi/donut-cheque-parser"
|
| 13 |
+
TASK_PROMPT = "<parse-cheque>"
|
| 14 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 15 |
|
| 16 |
def load_donut_model_and_processor():
|
|
|
|
| 22 |
def prepare_data_using_processor(donut_processor,image_path):
|
| 23 |
## Pass image through donut processor's feature extractor and retrieve image tensor
|
| 24 |
image = load_image(image_path)
|
|
|
|
| 25 |
pixel_values = donut_processor(image, return_tensors="pt").pixel_values
|
| 26 |
pixel_values = pixel_values.to(device)
|
| 27 |
|
|
|
|
| 70 |
|
| 71 |
payee_name = cheque_details_json['cheque_details'][2]['payee_name']
|
| 72 |
|
| 73 |
+
bank_name = cheque_details_json['cheque_details'][3]['bank_name']
|
| 74 |
+
cheque_date = cheque_details_json['cheque_details'][4]['cheque_date']
|
| 75 |
+
|
| 76 |
stale_cheque = check_if_cheque_is_stale(cheque_date)
|
| 77 |
|
| 78 |
+
return payee_name,amt_in_words,amt_in_figures,bank_name,cheque_date,macthing_amts,stale_cheque
|
| 79 |
+
|
| 80 |
+
# Lazily-built SymSpell instance shared by every spell_check() call, so the
# frequency dictionaries are read from disk once instead of on each request.
_SYM_SPELL = None


def _get_sym_spell():
    """Return the shared SymSpell instance, loading its dictionaries on first use."""
    global _SYM_SPELL
    if _SYM_SPELL is None:
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # The unigram dictionary bundled with symspellpy carries an "_en_"
        # infix, just like the bigram dictionary below.
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt"
        )
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
        )

        # load_dictionary() reports a missing file by returning False instead
        # of raising; fail loudly so correction is never silently skipped.
        if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
            raise FileNotFoundError(
                "symspellpy frequency dictionary not found: " + dictionary_path
            )
        sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

        # Publish only after the dictionaries are loaded.
        _SYM_SPELL = sym_spell
    return _SYM_SPELL


def spell_check(amt_in_words):
    """Spell-correct the amount-in-words text read from a cheque.

    Args:
        amt_in_words: Text to correct; passed as-is to SymSpell's compound
            lookup with a maximum edit distance of 2.

    Returns:
        The ``term`` of the first suggestion returned by ``lookup_compound``.

    Raises:
        FileNotFoundError: If the symspellpy unigram frequency dictionary
            cannot be loaded.
    """
    suggestions = _get_sym_spell().lookup_compound(amt_in_words, max_edit_distance=2)
    return suggestions[0].term
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def match_legal_and_courstesy_amount(legal_amount,courtesy_amount):
|
| 93 |
macthing_amts = False
|
| 94 |
if len(legal_amount) == 0:
|
| 95 |
return macthing_amts
|
| 96 |
+
|
| 97 |
+
corrected_amt_in_words = spell_check(legal_amount)
|
| 98 |
print("corrected_amt_in_words:",corrected_amt_in_words)
|
| 99 |
|
| 100 |
numeric_legal_amt = w2n.word_to_num(corrected_amt_in_words)
|
|
|
|
| 105 |
|
| 106 |
def check_if_cheque_is_stale(cheque_issue_date):
    """Return True when the cheque was issued more than three months ago.

    Args:
        cheque_issue_date: Issue date as a ``dd/mm/yy`` string (two-digit
            year), parsed with the ``"%d/%m/%y"`` format.

    Returns:
        bool: True if more than three calendar months separate the issue date
        from today's date, False otherwise (including future-dated cheques).

    Raises:
        ValueError: If ``cheque_issue_date`` does not match ``"%d/%m/%y"``.
    """
    # Compare dates only: drop the time-of-day from "now" directly instead of
    # formatting it to a string and parsing it back.
    today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    issued_on = datetime.strptime(cheque_issue_date, "%d/%m/%y")

    elapsed = relativedelta.relativedelta(today, issued_on)
    months_difference = (elapsed.years * 12) + elapsed.months
    print("months_difference:", months_difference)

    # Whole months alone would let a cheque that is three months plus some
    # days old pass as valid, so the leftover days must be considered too.
    return months_difference > 3 or (months_difference == 3 and elapsed.days > 0)
|
|
|