satyam007's picture
Update app.py
3cf7dad verified
import gradio as gr
from transformers import AutoModelForConditionalGeneration, AutoProcessor
import torch
import pandas as pd
import pytesseract
import cv2
# Point pytesseract at the system Tesseract binary (only works if Tesseract is
# already installed on the hosting server). The setting lives on the inner
# ``pytesseract.pytesseract`` module as ``tesseract_cmd``; assigning
# ``pytesseract.pytesseract_cmd`` merely creates an unused attribute on the
# package, so the configured path would be silently ignored.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Initialize the model and processor from Hugging Face Hub
model_name = "Qwen/Qwen2-VL-2B-Instruct-AWQ"
# NOTE(review): transformers does not appear to export a class named
# ``AutoModelForConditionalGeneration``; Qwen2-VL checkpoints are normally
# loaded with ``Qwen2VLForConditionalGeneration`` -- confirm the import at the
# top of this file actually resolves.
# NOTE(review): this is an AWQ-quantized checkpoint that is then moved to CPU;
# verify that the installed quantization backend supports CPU inference.
model = AutoModelForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto"
)
model.to("cpu")
processor = AutoProcessor.from_pretrained(model_name)
# Preprocessing image for OCR
def preprocess_image(image_path, threshold=150):
    """Load an image from disk and binarize it for OCR.

    Args:
        image_path: Path of the image file to read with OpenCV.
        threshold: Grayscale cut-off passed to ``cv2.threshold``; pixels above
            it become 255, the rest 0. Defaults to 150.

    Returns:
        The single-channel binary image produced by ``cv2.threshold``.

    Raises:
        ValueError: If OpenCV cannot read or decode ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; without
    # this check the error surfaces later as an opaque assertion in cvtColor.
    if image is None:
        raise ValueError(f"Could not read or decode image: {image_path!r}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    return binary
# OCR-based text extraction
def ocr_extract_text(image_path):
    """Return the text Tesseract recognizes in the image at ``image_path``.

    The image is first run through ``preprocess_image`` and the resulting
    array is handed to ``pytesseract.image_to_string``.
    """
    binarized = preprocess_image(image_path)
    recognized_text = pytesseract.image_to_string(binarized)
    return recognized_text
# Model-based image processing
def process_image(image_path):
    """Extract invoice fields from an image, preferring the vision model.

    The image and an extraction prompt are sent to the vision-language model;
    only the newly generated tokens are decoded and handed to
    ``parse_details``. If anything in the model path raises, the function
    falls back to Tesseract OCR on the same image.

    Args:
        image_path: Path of the invoice image on disk.

    Returns:
        The dictionary produced by ``parse_details`` for either the model
        answer or, on failure, the OCR text.
    """
    prompt = (
        "Extract the following details from the invoice:\n"
        "- 'invoice_number'\n"
        "- 'date'\n"
        "- 'place'\n"
        "- 'amount' (monetary value in the relevant currency)\n"
        "- 'category' (based on the invoice type)"
    )
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": prompt},
        ],
    }]
    try:
        # Local import (inside the try) so a missing helper also triggers the
        # OCR fallback instead of crashing the request.
        from transformers.image_utils import load_image

        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # The chat template only inserts an image placeholder; the pixels have
        # to be passed to the processor explicitly or the model never sees them.
        image = load_image(image_path)
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # generate() returns prompt + completion; drop the prompt tokens so the
        # instruction text itself is not parsed as if it were the answer.
        prompt_length = inputs.input_ids.shape[1]
        completion_ids = generated_ids[:, prompt_length:]
        output_text = processor.batch_decode(completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        return parse_details(output_text[0])
    except Exception as e:
        # Deliberate best-effort boundary: any model failure degrades to OCR.
        print(f"Model failed, falling back to OCR: {e}")
        ocr_text = ocr_extract_text(image_path)
        return parse_details(ocr_text)
# Parsing details from text
def parse_details(details):
    """Parse "label: value" lines of free text into invoice fields.

    Each line is matched (case-insensitively, by substring) against a small
    keyword table. The value is everything after the FIRST colon, so values
    that themselves contain colons (e.g. times) are preserved; a line with no
    colon contributes the whole stripped line. Later non-empty matches
    overwrite earlier ones, while empty values never erase a parsed value.

    Args:
        details: Text to parse (model answer or OCR output). ``None`` or an
            empty string yields the defaults.

    Returns:
        A dict with the keys "Invoice Number", "Date", "Place", "Amount" and
        "Category". Unmatched fields are ``None``, except "Category", which
        falls back to "General" when no category line is present.
    """
    parsed_data = {
        "Invoice Number": None,
        "Date": None,
        "Place": None,
        "Amount": None,
        "Category": None,
    }
    # Order matters: the first entry whose keyword occurs in the line wins.
    # "category" and "date" are tested before "invoice" so that lines such as
    # "Category: utility invoice" or "Invoice Date: ..." land in the right field.
    field_keywords = (
        ("Category", ("category",)),
        ("Date", ("date",)),
        ("Invoice Number", ("invoice",)),
        ("Place", ("place",)),
        ("Amount", ("total", "amount", "cost")),
    )
    for line in (details or "").split("\n"):
        lower_line = line.lower()
        for field, keywords in field_keywords:
            if any(keyword in lower_line for keyword in keywords):
                _, colon, tail = line.partition(":")
                value = (tail if colon else line).strip()
                if value:
                    parsed_data[field] = value
                break
    if parsed_data["Category"] is None:
        parsed_data["Category"] = "General"
    return parsed_data
# Gradio Interface
def gradio_interface(image_files):
    """Run invoice extraction over the uploaded files and tabulate the results.

    Args:
        image_files: What the Gradio file input delivers: ``None`` when
            nothing was uploaded, a single upload, or a list of uploads. Each
            upload may be a plain path string or an object exposing the path
            as ``.name``.

    Returns:
        A ``pandas.DataFrame`` with one row per processed image (empty when
        there were no uploads).
    """
    # Normalize to a list so a single upload is not iterated character by
    # character (path string) or line by line (file object).
    if image_files is None:
        image_files = []
    elif not isinstance(image_files, (list, tuple)):
        image_files = [image_files]

    results = []
    for image_file in image_files:
        image_path = image_file if isinstance(image_file, str) else image_file.name
        results.append(process_image(image_path))
    return pd.DataFrame(results)
# Launch Gradio App
grpc_interface = gr.Interface(
    fn=gradio_interface,
    # file_count="multiple" makes Gradio hand the callback a list of uploads;
    # the default ("single") passes one file, which the callback's loop over
    # its argument is not written for.
    inputs=gr.File(
        label="Upload Invoice Images",
        file_types=["image"],
        file_count="multiple",
    ),
    outputs=gr.Dataframe(interactive=True),
    title="Invoice Extraction System"
)
if __name__ == "__main__":
    grpc_interface.launch(share=True)