"""Gradio app that extracts key invoice fields from uploaded images.

Each image is first sent to a Qwen2-VL vision-language model; if that path
fails for any reason, the image is binarized and read with Tesseract OCR
instead. The resulting text is parsed line by line into a fixed set of fields.
"""

import cv2
import gradio as gr
import pandas as pd
import pytesseract
import torch

from transformers import AutoProcessor

try:
    from transformers import AutoModelForConditionalGeneration
except ImportError:
    # NOTE(review): transformers does not appear to export a class under this
    # name; fall back to the concrete Qwen2-VL class so the module can load.
    # Confirm against the installed transformers version.
    from transformers import (
        Qwen2VLForConditionalGeneration as AutoModelForConditionalGeneration,
    )

# Set Tesseract command (only works if Tesseract is already installed on the
# hosting server). The configurable attribute lives on the inner
# ``pytesseract.pytesseract`` module.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Initialize the model and processor from Hugging Face Hub
model_name = "Qwen/Qwen2-VL-2B-Instruct-AWQ"
model = AutoModelForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto"
)
model.to("cpu")
processor = AutoProcessor.from_pretrained(model_name)

# Output columns, in display order.
_FIELDS = ("Invoice Number", "Date", "Place", "Amount", "Category")

# (field, keywords) pairs checked in order. Specific attributes come before the
# generic "invoice" keyword so that labels such as "Invoice Date" or
# "Invoice Total" are not mistaken for the invoice number.
_FIELD_KEYWORDS = (
    ("Category", ("category",)),
    ("Date", ("date",)),
    ("Place", ("place",)),
    ("Amount", ("total", "amount", "cost")),
    ("Invoice Number", ("invoice",)),
)

# Characters stripped from extracted values: whitespace plus the quoting,
# list and emphasis punctuation that model output tends to wrap values in.
_VALUE_STRIP_CHARS = " \t\r\n\"'*,"


# Preprocessing image for OCR
def preprocess_image(image_path):
    """Load an image and return a binarized grayscale version for OCR.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The thresholded (black/white) image array produced by OpenCV.

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return binary


# OCR-based text extraction
def ocr_extract_text(image_path):
    """Return the text Tesseract reads from the preprocessed image."""
    preprocessed_image = preprocess_image(image_path)
    return pytesseract.image_to_string(preprocessed_image)


# Model-based image processing
def process_image(image_path):
    """Extract invoice details from one image.

    The vision-language model is tried first; on any failure the function
    logs the error and falls back to Tesseract OCR.

    Args:
        image_path: Path to the invoice image.

    Returns:
        A dict with the keys "Invoice Number", "Date", "Place", "Amount"
        and "Category", as produced by ``parse_details``.
    """
    try:
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": (
                    "Extract the following details from the invoice:\n"
                    "- 'invoice_number'\n"
                    "- 'date'\n"
                    "- 'place'\n"
                    "- 'amount' (monetary value in the relevant currency)\n"
                    "- 'category' (based on the invoice type)"
                )}
            ]
        }]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # The chat template only inserts an image placeholder; the pixels have
        # to be handed to the processor explicitly or the model never sees the
        # invoice. OpenCV loads BGR, so convert to RGB first.
        bgr_image = cv2.imread(image_path)
        if bgr_image is None:
            raise ValueError(f"Could not read image: {image_path}")
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

        inputs = processor(
            text=[text], images=[rgb_image], padding=True, return_tensors="pt"
        )
        inputs = inputs.to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # generate() returns prompt + completion. Drop the prompt tokens so the
        # field list in the instructions is not parsed as if it were the answer.
        completion_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            completion_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return parse_details(output_text[0])
    except Exception as e:
        # Deliberate best-effort boundary: any model failure degrades to OCR.
        print(f"Model failed, falling back to OCR: {e}")
        ocr_text = ocr_extract_text(image_path)
        return parse_details(ocr_text)


# Parsing details from text
def parse_details(details):
    """Parse free-form ``label: value`` text into the invoice fields.

    Each non-blank line is split at its first colon. The label (or the whole
    line when there is no colon) is matched case-insensitively against the
    keywords for each field, and the text after the colon (or the whole line)
    becomes the value. Later matching lines overwrite earlier ones.

    Args:
        details: Text produced by the model or by OCR; may be empty or None.

    Returns:
        A dict with the keys "Invoice Number", "Date", "Place", "Amount" and
        "Category". Unmatched fields are None, except "Category", which
        defaults to "General".
    """
    parsed_data = dict.fromkeys(_FIELDS)

    for line in (details or "").splitlines():
        if not line.strip():
            continue
        # Split on the first colon only, so values containing colons
        # (times, URLs) stay intact.
        label, separator, remainder = line.partition(":")
        search_text = (label if separator else line).lower()
        value = (remainder if separator else line).strip(_VALUE_STRIP_CHARS)

        for field, keywords in _FIELD_KEYWORDS:
            if any(keyword in search_text for keyword in keywords):
                parsed_data[field] = value
                break

    if not parsed_data["Category"]:
        parsed_data["Category"] = "General"
    return parsed_data


# Gradio Interface
def gradio_interface(image_files):
    """Run extraction on every uploaded file and tabulate the results.

    Args:
        image_files: The upload(s) from the Gradio File component. Accepts
            None, a single file, or a list; each item may be a path string or
            an object exposing the path as ``.name``.

    Returns:
        A pandas DataFrame with one row per image and one column per field.
    """
    if not image_files:
        return pd.DataFrame(columns=list(_FIELDS))
    if not isinstance(image_files, (list, tuple)):
        image_files = [image_files]

    results = [
        process_image(getattr(image_file, "name", image_file))
        for image_file in image_files
    ]
    return pd.DataFrame(results, columns=list(_FIELDS))


# Launch Gradio App
grpc_interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(
        label="Upload Invoice Images",
        file_types=["image"],
        # The handler iterates over the uploads, so request a list of files.
        file_count="multiple",
    ),
    outputs=gr.Dataframe(interactive=True),
    title="Invoice Extraction System"
)

if __name__ == "__main__":
    grpc_interface.launch(share=True)