|
|
|
|
|
import streamlit as st |
|
import pytesseract |
|
from tempfile import NamedTemporaryFile |
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
|
from langchain.chains import LLMChain |
|
from langchain.prompts import PromptTemplate |
|
from langchain import HuggingFaceHub |
|
from PIL import Image |
|
import os |
|
|
|
def main():
    """Render the page and dispatch whichever upload the user provided.

    Two sidebar uploaders are shown, one for a PDF and one for an image.
    When both hold a file, the PDF is handled and the image is ignored.
    """
    st.title("Invoice Entity Extractor π")

    sidebar = st.sidebar
    pdf_upload = sidebar.file_uploader("Upload a PDF file", type="pdf")
    image_upload = sidebar.file_uploader(
        "Upload an image", type=["png", "jpg", "jpeg"]
    )

    # The PDF takes precedence over the image.
    if pdf_upload is not None:
        process_pdf(pdf_upload)
        return

    if image_upload is not None:
        process_image(image_upload)
|
|
|
# Hugging Face access token read from the environment once at import time;
# it is None when HF_TOKEN is not set.
api_token = os.environ.get("HF_TOKEN")
|
|
|
def process_pdf(uploaded_file):
    """Display an uploaded PDF's text and the invoice entities an LLM extracts.

    The upload is spooled to a temporary file so ``PyPDFLoader`` can open it
    by path. Every page's text is written to the page, but only the first
    page's text is sent to the model.

    Args:
        uploaded_file: File-like object returned by the Streamlit uploader;
            only its ``read()`` method is used.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # delete=False keeps the file on disk after the handle is closed so the
    # loader can reopen it by name; it is removed explicitly below.
    with NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name

    try:
        loader = PyPDFLoader(temp_file_path)
        pages = loader.load()
    finally:
        # Without this, one temp file would be left behind per upload.
        os.remove(temp_file_path)

    st.write(f"Number of pages: {len(pages)}")

    for page in pages:
        st.write(page.page_content)

    # Nothing to extract from: avoid indexing pages[0] on an empty document
    # and avoid an LLM call with an empty prompt body.
    if not pages or not pages[0].page_content.strip():
        st.warning("No text could be read from this PDF.")
        return

    model = "meta-llama/Meta-Llama-3-8B-Instruct"
    llm = HuggingFaceHub(
        huggingfacehub_api_token=api_token,
        repo_id=model,
        verbose=False,
        model_kwargs={"temperature": 0.01, "max_new_tokens": 128},
    )

    template = (
        "Extract invoice number, name of organization, address, date,\n"
        "Qty, Rate, Tax, Amount {pages}\n"
        "Output: entity: type\n"
    )
    prompt_template = PromptTemplate(input_variables=["pages"], template=template)
    chain = LLMChain(llm=llm, prompt=prompt_template)

    result = chain.run(pages=pages[0].page_content)

    st.write("Extracted entities:")
    table_data = []
    for line in result.strip().splitlines():
        if not line.strip():
            continue
        # Split on the first colon only, so values that themselves contain
        # ":" (times, URLs, addresses) stay in a single cell.
        entity, _, value = line.partition(":")
        table_data.append([entity.strip(), value.strip()])
    st.table(table_data)
|
|
|
def process_image(uploaded_image):
    """OCR an uploaded image and display the invoice entities an LLM extracts.

    The image is opened with PIL, converted to text with pytesseract, and
    that text is both shown on the page and sent to the model.

    Args:
        uploaded_image: File-like object returned by the Streamlit uploader,
            passed straight to ``Image.open``.

    Returns:
        None. All output goes to the Streamlit page.
    """
    image = Image.open(uploaded_image)
    text = pytesseract.image_to_string(image)

    st.write("Extracted text from the image:")
    st.write(text)

    # Skip the LLM call when OCR produced nothing to extract from.
    if not text.strip():
        st.warning("No text could be recognised in this image.")
        return

    # Same Hub model id that process_pdf uses. The previous value,
    # "llama-2-7b-chat.ggmlv3.q4_0.bin", looks like a local weights filename
    # rather than an "owner/name" repo id.
    # NOTE(review): confirm this is the intended model for images.
    model = "meta-llama/Meta-Llama-3-8B-Instruct"
    llm = HuggingFaceHub(
        huggingfacehub_api_token=api_token,
        repo_id=model,
        verbose=False,
        model_kwargs={"temperature": 0.01, "max_new_tokens": 128},
    )

    template = (
        "Extract invoice number, name of organization, address, date,\n"
        "Qty, Rate, Tax, Amount {text}\n"
        "Output: entity: type\n"
    )
    prompt_template = PromptTemplate(input_variables=["text"], template=template)
    chain = LLMChain(llm=llm, prompt=prompt_template)

    result = chain.run(text=text)

    st.write("Extracted entities:")
    table_data = []
    for line in result.strip().splitlines():
        if not line.strip():
            continue
        # Split on the first colon only, so values that themselves contain
        # ":" (times, URLs, addresses) stay in a single cell.
        entity, _, value = line.partition(":")
        table_data.append([entity.strip(), value.strip()])
    st.table(table_data)
|
|
|
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()