listing-gen / gpt_vision.py
Ahmad Shahzad
app added
e3d7308
from langchain_core.output_parsers import JsonOutputParser
import base64
from langchain.chains import TransformChain
from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain import globals
from langchain_core.runnables import chain
from langchain_core.pydantic_v1 import BaseModel, Field
import os
from dotenv import load_dotenv
load_dotenv()
# Set up your OpenAI API key
os.environ["OPENAI_API_KEY"] = os.getenv('gpt_api_key')
def load_image(inputs: dict) -> dict:
"""Load image from file and encode it as base64."""
image_path = inputs["image_path"]
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
image_base64 = encode_image(image_path)
return {"image": image_base64}
load_image_chain = TransformChain(
input_variables=["image_path"],
output_variables=["image"],
transform=load_image
)
class ImageInformation(BaseModel):
"""Information about an image."""
Title: str = Field(description="Suitable title for the given product in image")
image_description: str = Field(description="a short description of the image")
# main_objects: list[str] = Field(description="list of the main objects on the picture")
# Set verbose
# globals.set_debug(True)
@chain
def image_model(inputs: dict) -> str | list[str] | dict:
"""Invoke model with image and prompt."""
model = ChatOpenAI(temperature=0.5, model="gpt-4-vision-preview", max_tokens=1024)
msg = model.invoke(
[HumanMessage(
content=[
{"type": "text", "text": inputs["prompt"]},
{"type": "text", "text": parser.get_format_instructions()},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{inputs['image']}"}},
])]
)
return msg.content
parser = JsonOutputParser(pydantic_object=ImageInformation)
def get_image_informations(image_path: str) -> dict:
vision_prompt = """
Given the image, provide the following information:
- Title of the product in image
- A description of the product in image based on the text written in image
"""
vision_chain = load_image_chain | image_model | parser
return vision_chain.invoke({'image_path': f'{image_path}',
'prompt': vision_prompt})
gpt_vision_result = get_image_informations("sampleImages/edited3.jpg")
print(gpt_vision_result)