"""
OCR Models Module
Contains all OCR-related functions for different AI models.
"""
import base64
import io
import logging
import os

import google.generativeai as genai
import openai
from mistralai import Mistral
from PIL import Image

# Configure logging
logger = logging.getLogger(__name__)


def gemini_ocr(image: Image.Image):
"""Process OCR using Google's Gemini 2.0 Flash model."""
try:
# Initialize Gemini model
gemini_model = initialize_gemini()
if not gemini_model:
return "Gemini OCR error: Failed to initialize Gemini model"
# Convert image to base64
buffered = io.BytesIO()
image.save(buffered, format="JPEG")
img_bytes = buffered.getvalue()
base64_image = base64.b64encode(img_bytes).decode('utf-8')
# Create the image part for Gemini
image_part = {
"mime_type": "image/jpeg",
"data": base64_image
}
# Generate content with Gemini
response = gemini_model.generate_content([
"Extract and transcribe all text from this image. Return only the transcribed text in markdown format, preserving any formatting like headers, lists, etc.",
image_part
])
markdown_text = response.text
logger.info("Gemini OCR completed successfully")
return markdown_text
except Exception as e:
logger.error(f"Gemini OCR error: {e}")
return f"Gemini OCR error: {e}"
def mistral_ocr(image: Image.Image):
"""Process OCR using Mistral AI's OCR model."""
try:
# Convert image to base64
buffered = io.BytesIO()
image.save(buffered, format="JPEG")
img_bytes = buffered.getvalue()
base64_image = base64.b64encode(img_bytes).decode('utf-8')
client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
ocr_response = client.ocr.process(
model="mistral-ocr-latest",
document={
"type": "image_url",
"image_url": f"data:image/jpeg;base64,{base64_image}"
}
)
# Extract markdown from the first page if available
markdown_text = ""
if hasattr(ocr_response, 'pages') and ocr_response.pages:
page = ocr_response.pages[0]
markdown_text = getattr(page, 'markdown', "")
if not markdown_text:
markdown_text = str(ocr_response)
logger.info("Mistral OCR completed successfully")
return markdown_text
except Exception as e:
logger.error(f"Mistral OCR error: {e}")
return f"Mistral OCR error: {e}"
def openai_ocr(image: Image.Image):
"""Process OCR using OpenAI's GPT-4o model."""
try:
# Convert image to base64
buffered = io.BytesIO()
image.save(buffered, format="PNG")
img_bytes = buffered.getvalue()
base64_image = base64.b64encode(img_bytes).decode('utf-8')
image_data_url = f"data:image/png;base64,{base64_image}"
# Send request to GPT-4o for OCR
response = openai.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Extract and transcribe all text from this image. Return only the transcribed text in markdown format, preserving any formatting like headers, lists, etc."},
{"type": "image_url", "image_url": {"url": image_data_url}}
]
}
]
)
markdown_text = response.choices[0].message.content
logger.info("OpenAI OCR completed successfully")
return markdown_text
except Exception as e:
logger.error(f"OpenAI OCR error: {e}")
return f"OpenAI OCR error: {e}"
def gpt5_ocr(image: Image.Image):
"""Process OCR using OpenAI's GPT-5 model with the same prompt."""
try:
# Convert image to base64 (PNG) and use as data URL
buffered = io.BytesIO()
image.save(buffered, format="PNG")
img_bytes = buffered.getvalue()
base64_image = base64.b64encode(img_bytes).decode('utf-8')
image_data_url = f"data:image/png;base64,{base64_image}"
# Use Chat Completions style content for multimodal reliability
response = openai.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Extract and transcribe all text from this image. Return only the transcribed text in markdown format, preserving any formatting like headers, lists, etc."},
{"type": "image_url", "image_url": {"url": image_data_url}}
]
}
]
)
markdown_text = response.choices[0].message.content
logger.info("GPT-5 OCR completed successfully")
return markdown_text
except Exception as e:
logger.error(f"GPT-5 OCR error: {e}")
return f"GPT-5 OCR error: {e}"
def process_model_ocr(image, model_name):
"""Process OCR for a specific model."""
if model_name == "gemini":
return gemini_ocr(image)
elif model_name == "mistral":
return mistral_ocr(image)
elif model_name == "openai":
return openai_ocr(image)
elif model_name == "gpt5":
return gpt5_ocr(image)
else:
return f"Unknown model: {model_name}"
# Initialize Gemini model
def initialize_gemini():
"""Initialize the Gemini model with API key."""
gemini_api_key = os.getenv("GEMINI_API_KEY")
if gemini_api_key:
genai.configure(api_key=gemini_api_key)
logger.info("✅ GEMINI_API_KEY loaded successfully")
return genai.GenerativeModel('gemini-2.0-flash-exp')
else:
logger.error("❌ GEMINI_API_KEY not found in environment variables")
return None
# Initialize OpenAI
def initialize_openai():
"""Initialize OpenAI with API key."""
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key:
openai.api_key = openai_api_key
logger.info("✅ OPENAI_API_KEY loaded successfully")
else:
logger.error("❌ OPENAI_API_KEY not found in environment variables")
# Initialize Mistral
def initialize_mistral():
"""Initialize Mistral with API key."""
mistral_api_key = os.getenv("MISTRAL_API_KEY")
if mistral_api_key:
logger.info("✅ MISTRAL_API_KEY loaded successfully")
else:
logger.error("❌ MISTRAL_API_KEY not found in environment variables") |