"""
OCR Models Module
Contains all OCR-related functions for different AI models.
"""

import google.generativeai as genai
from mistralai import Mistral
from PIL import Image
import io
import base64
import logging
import openai
import os

# Configure logging
logger = logging.getLogger(__name__)

# Single source of truth for the instruction sent to the prompt-driven
# models (Gemini, GPT-4o, GPT-5), so that they all receive identical text.
_OCR_PROMPT = (
    "Extract and transcribe all text from this image. "
    "Return only the transcribed text in markdown format, "
    "preserving any formatting like headers, lists, etc."
)


def _encode_image_base64(image: Image.Image, image_format: str) -> str:
    """Serialize *image* in *image_format* and return it base64-encoded.

    Args:
        image: The PIL image to serialize.
        image_format: A Pillow format name such as ``"JPEG"`` or ``"PNG"``.

    Returns:
        The encoded image bytes as a base64 ASCII string.

    Raises:
        OSError: If Pillow cannot write the image even after the JPEG
            fallback described below.
    """
    buffered = io.BytesIO()
    try:
        image.save(buffered, format=image_format)
    except OSError:
        # Pillow's JPEG writer rejects modes such as RGBA, LA and P
        # ("cannot write mode RGBA as JPEG"), which are common for PNG
        # uploads. Retry once as RGB (alpha is dropped); anything else is
        # a genuine failure and is re-raised unchanged.
        if image_format != "JPEG" or image.mode == "RGB":
            raise
        buffered = io.BytesIO()  # discard anything partially written
        image.convert("RGB").save(buffered, format=image_format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')


def gemini_ocr(image: Image.Image):
    """Process OCR using Google's Gemini 2.0 Flash model.

    Args:
        image: The PIL image to transcribe.

    Returns:
        The model's markdown transcription, or a string starting with
        ``"Gemini OCR error:"`` if initialization or the request failed.
        Exceptions are logged and reported through the return value, never
        raised.
    """
    try:
        # Initialize Gemini model
        gemini_model = initialize_gemini()
        if not gemini_model:
            return "Gemini OCR error: Failed to initialize Gemini model"

        # Create the image part for Gemini
        image_part = {
            "mime_type": "image/jpeg",
            "data": _encode_image_base64(image, "JPEG"),
        }

        # Generate content with Gemini
        response = gemini_model.generate_content([_OCR_PROMPT, image_part])

        markdown_text = response.text
        logger.info("Gemini OCR completed successfully")
        return markdown_text
    except Exception as e:
        logger.exception(f"Gemini OCR error: {e}")
        return f"Gemini OCR error: {e}"


def mistral_ocr(image: Image.Image):
    """Process OCR using Mistral AI's OCR model.

    Args:
        image: The PIL image to transcribe.

    Returns:
        The markdown of the first page of the OCR response; if that is
        missing or empty, ``str()`` of the whole response. On failure, a
        string starting with ``"Mistral OCR error:"``. Exceptions are logged
        and reported through the return value, never raised.
    """
    try:
        base64_image = _encode_image_base64(image, "JPEG")

        client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "image_url",
                "image_url": f"data:image/jpeg;base64,{base64_image}",
            },
        )

        # Extract markdown from the first page if available
        markdown_text = ""
        if hasattr(ocr_response, 'pages') and ocr_response.pages:
            page = ocr_response.pages[0]
            markdown_text = getattr(page, 'markdown', "")
        if not markdown_text:
            # Fall back to the raw response so the caller still gets output.
            markdown_text = str(ocr_response)

        logger.info("Mistral OCR completed successfully")
        return markdown_text
    except Exception as e:
        logger.exception(f"Mistral OCR error: {e}")
        return f"Mistral OCR error: {e}"


def _openai_chat_ocr(image: Image.Image, model: str, label: str):
    """Run the shared OCR prompt against an OpenAI chat-completions model.

    Args:
        image: The PIL image to transcribe.
        model: The OpenAI model identifier, e.g. ``"gpt-4o"``.
        label: Human-readable name used in log and error messages.

    Returns:
        The content of the first choice's message, or a string of the form
        ``"<label> OCR error: ..."`` on failure.
    """
    try:
        # The image is sent inline as a PNG data URL.
        base64_image = _encode_image_base64(image, "PNG")
        image_data_url = f"data:image/png;base64,{base64_image}"

        response = openai.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _OCR_PROMPT},
                        {"type": "image_url", "image_url": {"url": image_data_url}},
                    ],
                }
            ],
        )

        markdown_text = response.choices[0].message.content
        logger.info(f"{label} OCR completed successfully")
        return markdown_text
    except Exception as e:
        logger.exception(f"{label} OCR error: {e}")
        return f"{label} OCR error: {e}"


def openai_ocr(image: Image.Image):
    """Process OCR using OpenAI's GPT-4o model.

    Args:
        image: The PIL image to transcribe.

    Returns:
        The model's markdown transcription, or a string starting with
        ``"OpenAI OCR error:"`` on failure.
    """
    return _openai_chat_ocr(image, "gpt-4o", "OpenAI")


def gpt5_ocr(image: Image.Image):
    """Process OCR using OpenAI's GPT-5 model with the same prompt.

    Args:
        image: The PIL image to transcribe.

    Returns:
        The model's markdown transcription, or a string starting with
        ``"GPT-5 OCR error:"`` on failure.
    """
    return _openai_chat_ocr(image, "gpt-5", "GPT-5")


def process_model_ocr(image, model_name):
    """Process OCR for a specific model.

    Args:
        image: The PIL image to transcribe.
        model_name: One of ``"gemini"``, ``"mistral"``, ``"openai"`` or
            ``"gpt5"``.

    Returns:
        The result of the selected OCR function, or
        ``"Unknown model: <model_name>"`` for any other name.
    """
    if model_name == "gemini":
        return gemini_ocr(image)
    if model_name == "mistral":
        return mistral_ocr(image)
    if model_name == "openai":
        return openai_ocr(image)
    if model_name == "gpt5":
        return gpt5_ocr(image)
    return f"Unknown model: {model_name}"


# Initialize Gemini model
def initialize_gemini():
    """Initialize the Gemini model with API key.

    Reads ``GEMINI_API_KEY`` from the environment and configures the
    ``google.generativeai`` package with it.

    Returns:
        A ``genai.GenerativeModel`` for ``gemini-2.0-flash-exp``, or ``None``
        when the key is not set.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if gemini_api_key:
        genai.configure(api_key=gemini_api_key)
        logger.info("✅ GEMINI_API_KEY loaded successfully")
        return genai.GenerativeModel('gemini-2.0-flash-exp')
    else:
        logger.error("❌ GEMINI_API_KEY not found in environment variables")
        return None


# Initialize OpenAI
def initialize_openai():
    """Initialize OpenAI with API key.

    Copies ``OPENAI_API_KEY`` from the environment onto ``openai.api_key``
    and logs whether the key was found. Returns nothing.
    """
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if openai_api_key:
        openai.api_key = openai_api_key
        logger.info("✅ OPENAI_API_KEY loaded successfully")
    else:
        logger.error("❌ OPENAI_API_KEY not found in environment variables")


# Initialize Mistral
def initialize_mistral():
    """Initialize Mistral with API key.

    Only checks and logs whether ``MISTRAL_API_KEY`` is present in the
    environment; no client is created or stored here. Returns nothing.
    """
    mistral_api_key = os.getenv("MISTRAL_API_KEY")
    if mistral_api_key:
        logger.info("✅ MISTRAL_API_KEY loaded successfully")
    else:
        logger.error("❌ MISTRAL_API_KEY not found in environment variables")