import gradio as gr
from openai import OpenAI
from PyPDF2 import PdfReader
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
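# .env is expected to contain a line like (placeholder value):
#   OPENAI_API_KEY=sk-...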
# Set up the OpenAI client
api_key = os.getenv("OPENAI_API_KEY")  # make sure your OpenAI API key is in .env
client = OpenAI(api_key=api_key)

# Function to extract text from a PDF
def extract_pdf_text(pdf_file):
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text

# Function to split text into overlapping chunks
def split_text(text, chunk_size=500, overlap=50):
    chunks = []
    # step by chunk_size - overlap so consecutive chunks share `overlap` characters
    for i in range(0, len(text), chunk_size - overlap):
        chunks.append(text[i:i + chunk_size])
    return chunks
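
# With the defaults above, chunks start at offsets 0, 450, 900, ... and are 500
# characters each, so adjacent chunks share 50 characters of context.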

# Function to generate an embedding for a single string
def get_embeddings(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return np.array(response.data[0].embedding)
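
# Optional sketch: the embeddings endpoint also accepts a list of strings, so
# all chunks could be embedded in a single request instead of one call per
# chunk. This helper is illustrative and is not wired into the app below.
def get_embeddings_batch(texts):
    response = client.embeddings.create(
        input=texts,
        model="text-embedding-3-small"
    )
    # each item in response.data carries an .index; sort defensively so the
    # embeddings line up with the input order
    ordered = sorted(response.data, key=lambda d: d.index)
    return [np.array(item.embedding) for item in ordered]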

# Function to retrieve the most relevant chunks for a query
def retrieve_relevant_chunks(query, chunks, chunk_embeddings):
    query_embedding = get_embeddings(query)
    similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # argsort ascending, then reverse: indices of the top 3 most similar chunks
    top_indices = np.argsort(similarities)[::-1][:3]
    # map the indices back to chunk text; these chunks form the context for the answer
    return [chunks[i] for i in top_indices]

# Function to generate a response
def generate_response(context, query):
    messages = [
        {"role": "system", "content": "You are an assistant that answers questions based on the provided context in 30 words."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"}
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=300
    )
    return response.choices[0].message.content
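
# Example call (hypothetical values):
#   generate_response("Gradio builds web UIs for Python functions.", "What is Gradio?")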

# Global variables to store chunks and their embeddings
chunks = []
chunk_embeddings = []

# Gradio interface functions
def process_pdf(pdf_file):
    global chunks, chunk_embeddings
    text = extract_pdf_text(pdf_file)
    chunks = split_text(text)
    # embed each chunk once at upload time so queries only need to embed the question
    chunk_embeddings = [get_embeddings(chunk) for chunk in chunks]
    return "PDF processed successfully! You can now chat with it."

def chat_with_pdf(query):
    global chunks, chunk_embeddings
    if not chunks or not chunk_embeddings:
        return "Please upload and process a PDF first."
    relevant_chunks = retrieve_relevant_chunks(query, chunks, chunk_embeddings)
    context = "\n".join(relevant_chunks)
    return generate_response(context, query)

# Gradio app
with gr.Blocks() as app:
    gr.Markdown("# Chat with Your PDF 📄🤖")
    pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
    process_button = gr.Button("Process PDF")
    process_status = gr.Textbox(label="Status", interactive=False)
    query = gr.Textbox(label="Ask a Question")
    chat_button = gr.Button("Chat with PDF")
    response = gr.Textbox(label="Response", interactive=False)

    process_button.click(process_pdf, inputs=pdf_file, outputs=process_status)
    chat_button.click(chat_with_pdf, inputs=query, outputs=response)

app.launch()
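
# Run locally with `python app.py`; Gradio serves the UI at http://127.0.0.1:7860 by default.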