|
|
|
|
|
import os
from tempfile import NamedTemporaryFile

import pytesseract
import streamlit as st
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from PIL import Image
|
|
|
def main():
    """Render the page title and sidebar uploaders, then dispatch the upload.

    Two sidebar uploaders are created: one for PDF files and one for
    PNG/JPG/JPEG images. If a PDF has been uploaded it is passed to
    ``process_pdf``; otherwise, if an image has been uploaded, it is passed
    to ``process_image``. When nothing is uploaded, only the title and the
    uploaders are shown.
    """
    st.title("Invoice Entity Extractor π")

    sidebar = st.sidebar
    pdf_upload = sidebar.file_uploader("Upload a PDF file", type="pdf")
    image_upload = sidebar.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

    # The PDF is checked first, so it wins when both uploaders hold a file.
    if pdf_upload is not None:
        process_pdf(pdf_upload)
        return

    if image_upload is not None:
        process_image(image_upload)
|
|
|
def process_pdf(uploaded_file):
    """Display an uploaded PDF's text and the invoice entities an LLM extracts.

    The upload is written to a temporary file because ``PyPDFLoader`` is
    constructed from a filesystem path. The page count and every page's text
    are written to the app; the text of the first page only is sent to the
    model, and the model's reply is rendered as a two-column table.

    Args:
        uploaded_file: The object returned by the PDF ``file_uploader``; only
            its ``read()`` method is used here.
    """
    # delete=False lets the loader reopen the file by path after our handle
    # is closed; the ``finally`` below is then responsible for removing it.
    temp_file = NamedTemporaryFile(delete=False)
    temp_file_path = temp_file.name
    try:
        with temp_file:
            temp_file.write(uploaded_file.read())
        pages = PyPDFLoader(temp_file_path).load()
    finally:
        # Without this, every upload leaves a stray file in the temp directory.
        os.unlink(temp_file_path)

    st.write(f"Number of pages: {len(pages)}")

    for page in pages:
        st.write(page.page_content)

    # Guard against PDFs that yield no pages; ``pages[0]`` below would raise.
    if not pages:
        st.warning("No pages could be read from the PDF.")
        return

    llm = CTransformers(
        model="llama-2-7b-chat.ggmlv3.q4_0.bin",
        model_type="llama",
        config={'max_new_tokens': 128, 'temperature': 0.01},
    )

    template = (
        "Extract invoice number, name of organization, address, date,\n"
        "Qty, Rate, Tax, Amount {pages}\n"
        "Output: entity: type\n"
    )
    prompt_template = PromptTemplate(input_variables=["pages"], template=template)
    chain = LLMChain(llm=llm, prompt=prompt_template)

    result = chain.run(pages=pages[0].page_content)

    st.write("Extracted entities:")

    # Split on the first colon only so values that contain colons (times,
    # addresses, ...) stay in one cell, and skip blank lines so they do not
    # become empty rows.
    table_data = []
    for line in result.splitlines():
        if not line.strip():
            continue
        entity, _, value = line.partition(":")
        table_data.append([entity.strip(), value.strip()])

    if table_data:
        st.table(table_data)
    else:
        st.write("No entities were extracted.")
|
|
|
def process_image(uploaded_image):
    """Display an uploaded image's OCR text and the invoice entities an LLM extracts.

    The image is opened with PIL and passed to
    ``pytesseract.image_to_string``. The resulting text is written to the app
    and sent to the model; the model's reply is rendered as a two-column
    table.

    Args:
        uploaded_image: The object returned by the image ``file_uploader``;
            it is passed directly to ``Image.open``.
    """
    # The context manager releases the image's resources once OCR has finished.
    with Image.open(uploaded_image) as image:
        text = pytesseract.image_to_string(image)

    st.write("Extracted text from the image:")
    st.write(text)

    # Nothing to extract from; skip loading the model for an empty prompt.
    if not text.strip():
        st.warning("No text could be extracted from the image.")
        return

    llm = CTransformers(
        model="llama-2-7b-chat.ggmlv3.q4_0.bin",
        model_type="llama",
        config={'max_new_tokens': 128, 'temperature': 0.01},
    )

    template = (
        "Extract invoice number, name of organization, address, date,\n"
        "Qty, Rate, Tax, Amount {text}\n"
        "Output: entity: type\n"
    )
    prompt_template = PromptTemplate(input_variables=["text"], template=template)
    chain = LLMChain(llm=llm, prompt=prompt_template)

    # Keyword form matches the prompt's declared input variable.
    result = chain.run(text=text)

    st.write("Extracted entities:")

    # Split on the first colon only so values that contain colons (times,
    # addresses, ...) stay in one cell, and skip blank lines so they do not
    # become empty rows.
    table_data = []
    for line in result.splitlines():
        if not line.strip():
            continue
        entity, _, value = line.partition(":")
        table_data.append([entity.strip(), value.strip()])

    if table_data:
        st.table(table_data)
    else:
        st.write("No entities were extracted.")
|
|
|
# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()