Spaces:

yuvaranianandhan24
/

invoice_extractor

Sleeping

App Files Files Community

invoice_extractor / app.py

yuvaranianandhan24

Update app.py

a813767 verified 10 months ago

raw

history blame

2.66 kB



	import streamlit as st
	import pytesseract
	from tempfile import NamedTemporaryFile
	from langchain.document_loaders import PyPDFLoader
	from langchain.llms import CTransformers
	from langchain.chains import LLMChain
	from langchain.prompts import PromptTemplate
	from PIL import Image

	def main():
	    """Render the page title and sidebar uploaders, then dispatch the upload.

	    A PDF upload takes precedence: the image uploader's value is only
	    consulted when no PDF has been provided.
	    """
	    st.title("Invoice Entity Extractor 📚")

	    sidebar = st.sidebar
	    pdf_upload = sidebar.file_uploader("Upload a PDF file", type="pdf")
	    image_upload = sidebar.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

	    # Guard-clause form of the original if/elif: handle the PDF and stop.
	    if pdf_upload is not None:
	        process_pdf(pdf_upload)
	        return

	    if image_upload is not None:
	        process_image(image_upload)

	def process_pdf(uploaded_file):
	    """Display an uploaded PDF's text and a table of LLM-extracted entities.

	    The upload is written to a temporary file (PyPDFLoader is given a path),
	    every page's text is shown, and the first page's text is sent to the
	    model. The model's reply is parsed line by line as ``entity: type`` and
	    rendered as a two-column table.

	    Args:
	        uploaded_file: Object whose ``read()`` returns the PDF bytes, as
	            passed in from the sidebar file uploader.
	    """
	    import os  # local import: only needed here for temp-file cleanup

	    # delete=False so the file can be reopened by path after this handle is
	    # closed; it is removed explicitly in the ``finally`` below.
	    with NamedTemporaryFile(delete=False) as temp_file:
	        temp_file.write(uploaded_file.read())
	        temp_file_path = temp_file.name

	    try:
	        loader = PyPDFLoader(temp_file_path)
	        pages = loader.load()
	    finally:
	        # Previously the temp file was never removed and leaked per upload.
	        os.unlink(temp_file_path)

	    st.write(f"Number of pages: {len(pages)}")

	    if not pages:
	        # Avoid IndexError on pages[0] below when nothing could be loaded.
	        st.warning("No pages could be read from the uploaded PDF.")
	        return

	    for page in pages:
	        st.write(page.page_content)

	    llm = CTransformers(
	        model="llama-2-7b-chat.ggmlv3.q4_0.bin",
	        model_type="llama",
	        config={'max_new_tokens': 128, 'temperature': 0.01},
	    )

	    template = """Extract invoice number, name of organization, address, date,
	Qty, Rate, Tax, Amount {pages}
	Output: entity: type
	"""
	    prompt_template = PromptTemplate(input_variables=["pages"], template=template)
	    chain = LLMChain(llm=llm, prompt=prompt_template)

	    # Only the first page is sent to the model.
	    result = chain.run(pages=pages[0].page_content)

	    st.write("Extracted entities:")
	    table_data = []
	    for line in result.strip().splitlines():
	        if not line.strip():
	            continue  # skip blank lines instead of emitting empty rows
	        # Split on the first colon only so values that themselves contain
	        # ":" (times, URLs, ...) stay in one cell and rows keep two columns.
	        entity, _, value = line.partition(":")
	        table_data.append([entity.strip(), value.strip()])
	    st.table(table_data)

	def process_image(uploaded_image):
	    """OCR an uploaded image, show the text, and tabulate LLM-extracted entities.

	    The image is opened with PIL and passed to pytesseract; the recognised
	    text is displayed and then sent to the model. The model's reply is parsed
	    line by line as ``entity: type`` and rendered as a two-column table.

	    Args:
	        uploaded_image: File-like object accepted by ``Image.open``, as
	            passed in from the sidebar image uploader.
	    """
	    image = Image.open(uploaded_image)
	    text = pytesseract.image_to_string(image)

	    st.write("Extracted text from the image:")
	    st.write(text)

	    if not text.strip():
	        # Nothing was recognised; skip loading and running the model on an
	        # empty prompt.
	        st.warning("No text could be extracted from the image.")
	        return

	    # Apply entity extraction to the OCR text.
	    llm = CTransformers(
	        model="llama-2-7b-chat.ggmlv3.q4_0.bin",
	        model_type="llama",
	        config={'max_new_tokens': 128, 'temperature': 0.01},
	    )

	    template = """Extract invoice number, name of organization, address, date,
	Qty, Rate, Tax, Amount {text}
	Output: entity: type
	"""
	    prompt_template = PromptTemplate(input_variables=["text"], template=template)
	    chain = LLMChain(llm=llm, prompt=prompt_template)

	    result = chain.run(text=text)

	    st.write("Extracted entities:")
	    table_data = []
	    for line in result.strip().splitlines():
	        if not line.strip():
	            continue  # skip blank lines instead of emitting empty rows
	        # Split on the first colon only so values that themselves contain
	        # ":" (times, URLs, ...) stay in one cell and rows keep two columns.
	        entity, _, value = line.partition(":")
	        table_data.append([entity.strip(), value.strip()])
	    st.table(table_data)

	# Entry point: call main() only when this module's __name__ is "__main__".
	if __name__ == "__main__":
	main()