import os
import pickle
import re
import subprocess
import sys
import tempfile

import easyocr
import spacy
import streamlit as st
@st.cache_resource
def load_nlp():
    """Return the spaCy ``en_core_web_sm`` pipeline, downloading it if needed.

    The pipeline is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: ``cache_data`` serialises the return value and hands
    back a fresh copy on every call, which is wasteful for a large model
    object, whereas ``cache_resource`` shares the single loaded instance.

    Returns:
        The loaded spaCy language pipeline.

    Raises:
        OSError: If the model is still unavailable after the download
            attempt (raised by ``spacy.load``).
    """
    model_name = 'en_core_web_sm'
    try:
        # Fast path: the model is already installed, no download required.
        return spacy.load(model_name)
    except OSError:
        # spaCy raises OSError when the model package cannot be found.
        # Use the interpreter running this app so the model is installed
        # into the same environment instead of whatever "python" is on PATH.
        subprocess.run([sys.executable, "-m", "spacy", "download", model_name])
        return spacy.load(model_name)
@st.cache_resource
def load_ocr_engine():
    """Create the EasyOCR reader configured for English.

    Wrapped in ``st.cache_resource`` so the reader is built once and the
    same instance is reused afterwards.

    Returns:
        An ``easyocr.Reader`` constructed with the language list ``['en']``.
    """
    languages = ['en']
    reader = easyocr.Reader(languages)
    return reader
@st.cache_resource
def load_model():
    """Load the classifier artifacts from ``models/autoclassifier.pkl``.

    The file contains three pickled objects stored back to back. They are
    read in file order and returned as a 3-tuple, which the script unpacks
    as ``(stopwords, punctuations, model_pipe)``.

    Returns:
        A tuple of the three unpickled objects, in the order they were read.
    """
    # NOTE: unpickling executes code embedded in the file, so this model
    # file must only ever come from a trusted source.
    with open('models/autoclassifier.pkl', 'rb') as handle:
        artifacts = tuple(pickle.load(handle) for _ in range(3))
    return artifacts
def tokenizer(sentence):
    """Split *sentence* into normalised tokens using the global spaCy pipeline.

    Each token is replaced by its lower-cased, stripped lemma, except when
    the lemma is the placeholder ``"-PRON-"``, in which case the token's own
    lower-cased text is used. Tokens found in the module-level ``stopwords``
    or ``punctuations`` collections are discarded.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of the remaining normalised token strings, in document order.
    """
    kept = []
    for tok in nlp(sentence):
        if tok.lemma_ == "-PRON-":
            normalised = tok.lower_
        else:
            normalised = tok.lemma_.lower().strip()
        if normalised in stopwords or normalised in punctuations:
            continue
        kept.append(normalised)
    return kept
def _confident_text(raw_ocr):
    """Join OCR fragments whose confidence exceeds 0.5, with digits removed.

    Each kept fragment is prefixed with a single space, so a non-empty
    result starts with a space.
    """
    cleaned = (
        re.sub(r'[0-9]+', '', text)
        for _bbox, text, prob in raw_ocr
        if prob > 0.5
    )
    # Fragments that consisted only of digits are empty now; skip them.
    return ''.join(' ' + fragment for fragment in cleaned if fragment)


@st.cache_data
def autoclassifier(images):
    """Run OCR on each uploaded image and display its predicted document type.

    For every upload the bytes are written to a private temporary file,
    read by the global ``ocr_engine``, reduced to the confidently recognised
    non-numeric text, and passed to the global ``model_pipe``. The result is
    reported with ``st.info``.

    Args:
        images: Uploaded files as returned by ``st.file_uploader``; each
            must provide ``name`` and ``getvalue()``.

    Returns:
        None. Results are rendered in the Streamlit page.
    """
    with st.spinner("Processing Images"):
        for image in images:
            # Never write to a path built from the client-supplied file name:
            # it could escape the working directory, overwrite project files,
            # or collide with another session. Only the extension is reused.
            suffix = os.path.splitext(image.name)[1]
            tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
            try:
                with tmp:
                    # getvalue() returns the whole buffer regardless of the
                    # current read position, unlike read().
                    tmp.write(image.getvalue())
                # The file is closed before the OCR engine opens it by path.
                raw_ocr = ocr_engine.readtext(tmp.name)
            finally:
                # Clean up even when writing or OCR raises.
                os.remove(tmp.name)

            words = _confident_text(raw_ocr)
            doc_type = model_pipe.predict([words])
            st.info(f"filename: '{image.name}', doc_type: '{doc_type[0]}'")
if __name__ == "__main__":
    # Page title and status banner.
    st.header('Document Classifier', divider='green')
    st.warning("Work in Progress")

    # Background paragraphs, rendered in order as separate markdown elements.
    for paragraph in (
        "#### What is OCR?",
        "OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.",
        "In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.",
        "After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.",
        "This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`",
        "*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*",
        "*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*",
    ):
        st.markdown(paragraph)
    st.divider()

    # Module-level names read by tokenizer() and autoclassifier().
    nlp = load_nlp()
    ocr_engine = load_ocr_engine()
    stopwords, punctuations, model_pipe = load_model()

    # Collect the uploads and classify them.
    accepted_types = ['png', 'jpg', 'jpeg']
    images = st.file_uploader(
        "Choose an image to classify",
        type=accepted_types,
        accept_multiple_files=True,
    )
    autoclassifier(images)