portfolio / projects /01_Document_Classifier.py
Christopher Capobianco
Try embedding document classifier inside main block
b1eea1f
raw
history blame
4.43 kB
import os
import pickle
import re
import subprocess
import sys

import easyocr
import spacy
import streamlit as st
# Function to load the spaCy tokenizer.
# Cached with st.cache_resource (not st.cache_data): the returned Language
# object is a live, non-serializable resource, matching the sibling loaders.
@st.cache_resource
def load_nlp():
    """Download (if missing) and return the spaCy 'en_core_web_sm' pipeline.

    Raises:
        subprocess.CalledProcessError: if the model download fails.
        OSError: if the model cannot be loaded after download.
    """
    # Use the running interpreter so the model installs into this exact
    # environment; check=True surfaces download failures immediately
    # instead of failing later inside spacy.load().
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    return spacy.load('en_core_web_sm')
# Function to initialize the OCR engine.
# Cached as a resource so the (heavy) EasyOCR reader is constructed once
# per server process rather than on every rerun.
@st.cache_resource
def load_ocr_engine():
    """Return an EasyOCR reader configured for English text."""
    reader = easyocr.Reader(['en'])
    return reader
# Function to load the trained classifier artifacts (cached across reruns).
@st.cache_resource
def load_model():
    """Unpickle and return (stopwords, punctuations, model_pipe).

    The pickle file stores three objects sequentially, so they must be
    read back in exactly the order they were written.
    """
    with open('models/autoclassifier.pkl', 'rb') as fh:
        stop_words = pickle.load(fh)
        punct_chars = pickle.load(fh)
        pipeline = pickle.load(fh)
    return stop_words, punct_chars, pipeline
# Function to tokenize the text.
# NOTE(review): the pickled model pipeline presumably references this
# function by name, so it must stay importable from __main__ — verify
# before moving or renaming it.
def tokenizer(sentence):
    """Lemmatize, lowercase, and filter a sentence into a token list.

    Pronoun tokens (lemma '-PRON-') keep their lowercased surface form;
    every other token uses its lowercased, stripped lemma. Stop words
    and punctuation characters are then dropped.
    """
    # NOTE(review): '-PRON-' is a spaCy 2.x lemma convention; spaCy 3.x
    # never emits it — confirm which spaCy version trained the model.
    lemmas = []
    for token in nlp(sentence):
        if token.lemma_ == "-PRON-":
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_.lower().strip())
    return [t for t in lemmas if t not in stopwords and t not in punctuations]
# Function to process uploaded images.
@st.cache_data
def autoclassifier(images):
    """OCR each uploaded image and display its predicted document type.

    For every uploaded file: the bytes are written to a temporary file on
    disk (EasyOCR reads from a path), text is extracted, low-confidence
    fragments and digits are filtered out, and the remaining words are fed
    to the classifier. The result is reported via st.info.

    Args:
        images: list of Streamlit UploadedFile objects (may be empty/None).
    """
    # Nothing uploaded yet — skip the spinner entirely.
    if not images:
        return
    # Original had an extraneous f-prefix on this placeholder-free string.
    with st.spinner("Processing Images"):
        for image in images:
            # Write bytes to disk so EasyOCR can read the image by path
            with open(image.name, 'wb') as f:
                f.write(image.read())
            try:
                # Load image into OCR Engine and extract text
                raw_ocr = ocr_engine.readtext(image.name)
                # Extract relevant words from raw OCR
                words = ''
                for (bbox, text, prob) in raw_ocr:
                    # Only keep OCR text with 50% probability or higher
                    if prob > 0.5:
                        # Filter out any digits
                        text = re.sub('[0-9]+', '', text)
                        # If we have any characters left, append to string
                        if text != '':
                            words += ' ' + text
                # Pass filtered OCR string to the model
                doc_type = model_pipe.predict([words])
                # Report filename and document class
                st.info(f"filename: '{image.name}', doc_type: '{doc_type[0]}'")
            finally:
                # Always delete the temp image file, even if OCR or the
                # prediction raised — the original leaked it on error.
                os.remove(image.name)
if __name__ == "__main__":
    # --- Page header and project write-up ---
    st.header('Document Classifier', divider='green')
    st.warning("Work in Progress")
    st.markdown("#### What is OCR?")
    st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
    st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
    st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
    st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
    st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
    st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
    st.divider()

    # --- Load cached resources (module-level globals used by tokenizer
    # and autoclassifier): spaCy pipeline, OCR engine, trained model ---
    nlp = load_nlp()
    ocr_engine = load_ocr_engine()
    stopwords, punctuations, model_pipe = load_model()

    # --- Collect uploads and classify them ---
    images = st.file_uploader(
        "Choose an image to classify",
        type=['png', 'jpg', 'jpeg'],
        accept_multiple_files=True,
    )
    autoclassifier(images)