import textwrap

import numpy as np
import pandas as pd
import torch
from PIL import Image
from PyPDF2 import PdfReader
from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection

import google.generativeai as genai
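# The Gemini calls below require the client to be configured first; a minimal
# sketch, assuming the API key lives in an environment variable (the variable
# name is an assumption, not part of the original code):
#
#   import os
#   genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
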
class ImageProcessor:
    def __init__(self, image_path):
        self.image_path = image_path

    def get_caption(self, image_path=None):
        """
        Generates a short caption for the provided image.

        Args:
            image_path (str, optional): The path to the image file. Defaults
                to the path given at construction time.

        Returns:
            str: A string representing the caption for the image.
        """
        image_path = image_path or self.image_path
        image = Image.open(image_path).convert('RGB')

        model_name = "Salesforce/blip-image-captioning-large"
        device = "cpu"

        # Load the BLIP captioning model and its preprocessor.
        processor = BlipProcessor.from_pretrained(model_name)
        model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)

        # Generate a caption of at most 20 new tokens.
        inputs = processor(image, return_tensors='pt').to(device)
        output = model.generate(**inputs, max_new_tokens=20)

        caption = processor.decode(output[0], skip_special_tokens=True)

        return caption

    def detect_objects(self, image_path=None):
        """
        Detects objects in the provided image.

        Args:
            image_path (str, optional): The path to the image file. Defaults
                to the path given at construction time.

        Returns:
            str: One line per detected object, formatted as
                '[x1, y1, x2, y2] class_name confidence_score'.
        """
        image_path = image_path or self.image_path
        image = Image.open(image_path).convert('RGB')

        # Load the DETR object-detection model and its preprocessor.
        processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
        model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

        inputs = processor(images=image, return_tensors="pt")
        outputs = model(**inputs)

        # Convert the raw outputs to absolute pixel coordinates, keeping only
        # detections above a 0.9 confidence threshold.
        target_sizes = torch.tensor([image.size[::-1]])
        results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

        detections = ""
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
            detections += '[{}, {}, {}, {}]'.format(int(box[0]), int(box[1]), int(box[2]), int(box[3]))
            detections += ' {}'.format(model.config.id2label[int(label)])
            detections += ' {}\n'.format(float(score))

        return detections

    def make_prompt(self, query, image_captions, objects_detections):
        """Builds a grounded prompt from the query, captions, and detections."""
        escaped_captions = image_captions.replace("'", "").replace('"', "").replace("\n", " ")
        escaped_objects = objects_detections.replace("'", "").replace('"', "").replace("\n", " ")
        prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the image captions and objects detected included below. \
        Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
        However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
        strike a friendly and conversational tone. \
        If the image captions or objects detected are irrelevant to the answer, you may ignore them.
        QUESTION: '{query}'
        IMAGE CAPTIONS: '{image_captions}'
        OBJECTS DETECTED: '{objects_detected}'

        ANSWER:
        """).format(query=query, image_captions=escaped_captions, objects_detected=escaped_objects)

        return prompt

    def generate_answer(self, prompt):
        """Sends the prompt to Gemini and returns the text of the response."""
        model = genai.GenerativeModel('gemini-pro')
        answer = model.generate_content(prompt)

        return answer.text
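
# A minimal usage sketch for ImageProcessor, assuming a local image
# 'example.jpg' and that genai.configure(api_key=...) has already been
# called; the file name and question are illustrative placeholders, not part
# of the original code:
#
#   ip = ImageProcessor("example.jpg")
#   caption = ip.get_caption()
#   objects = ip.detect_objects()
#   prompt = ip.make_prompt("What is shown in the image?", caption, objects)
#   print(ip.generate_answer(prompt))
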
class PDFProcessor:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path

    def create_embedding_df(self, pdf_path=None):
        """
        Extracts the text of each PDF page and embeds it with Gemini.

        Args:
            pdf_path (str, optional): The path to the PDF file. Defaults to
                the path given at construction time.

        Returns:
            pd.DataFrame: One row per non-empty page, with 'Title', 'Text',
                and 'Embeddings' columns.
        """
        pdf_path = pdf_path or self.pdf_path
        pdfreader = PdfReader(pdf_path)

        # Collect the text of each page, skipping pages with no extractable text.
        documents = []
        for i, page in enumerate(pdfreader.pages):
            content = page.extract_text()
            if content:
                document = {
                    "Title": f"Page {i+1}",
                    "Text": content
                }
                documents.append(document)

        df = pd.DataFrame(documents)

        model = 'models/embedding-001'

        # Embed each page as a retrieval document.
        def embed_fn(title, text):
            return genai.embed_content(
                model=model,
                content=text,
                task_type="retrieval_document",
                title=title
            )["embedding"]

        df['Embeddings'] = df.apply(lambda row: embed_fn(row['Title'], row['Text']), axis=1)

        return df

    def find_best_passage(self, query, dataframe):
        """
        Computes the similarity between the query and each document in the
        dataframe using the dot product, and returns the text of the most
        similar passage.
        """
        model = 'models/embedding-001'
        query_embedding = genai.embed_content(model=model,
                                              content=query,
                                              task_type="retrieval_query")
        dot_products = np.dot(np.stack(dataframe['Embeddings']), query_embedding["embedding"])
        idx = np.argmax(dot_products)

        return dataframe.iloc[idx]['Text']

    def make_prompt(self, query, relevant_passage):
        """Builds a grounded prompt from the query and the retrieved passage."""
        escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
        prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
        Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
        However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
        strike a friendly and conversational tone. \
        If the passage is irrelevant to the answer, you may ignore it.
        QUESTION: '{query}'
        PASSAGE: '{relevant_passage}'

        ANSWER:
        """).format(query=query, relevant_passage=escaped)

        return prompt

    def generate_answer(self, prompt):
        """Sends the prompt to Gemini and returns the text of the response."""
        model = genai.GenerativeModel('gemini-pro')
        answer = model.generate_content(prompt)

        return answer.text
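
# A minimal end-to-end sketch, assuming 'example.pdf' exists locally and a
# Gemini API key is available; the file name, query, and environment variable
# are illustrative placeholders, not part of the original code.
if __name__ == "__main__":
    import os

    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

    pdf = PDFProcessor("example.pdf")
    df = pdf.create_embedding_df()
    passage = pdf.find_best_passage("What is this document about?", df)
    prompt = pdf.make_prompt("What is this document about?", passage)
    print(pdf.generate_answer(prompt))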