Spaces:
Running
Running
| import os | |
| import PyPDF2 | |
| from groq import Groq | |
| import streamlit as st | |
| from collections import defaultdict | |
class Model:
    """
    Generates responses to user queries grounded in text extracted from
    uploaded PDF documents, using the Groq chat-completion API.

    Attributes:
        client: The Groq API client.
        contexts: One extracted-text string per uploaded PDF.
        cache: Maps query -> {"context": ..., "response": ...} for repeat queries.
    """

    def __init__(self):
        """
        Initializes the Model object and sets up the Groq client.

        Raises:
            ValueError: If GROQ_API_KEY is missing or empty in Streamlit secrets.
        """
        # api_key = os.getenv("GROQ_API_KEY")
        # Use .get() so a missing key reaches the explicit ValueError below;
        # st.secrets["..."] would raise KeyError first, making the guard unreachable.
        api_key = st.secrets.get("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY is not set in Streamlit secrets.")
        self.client = Groq(api_key=api_key)
        self.contexts = []
        self.cache = defaultdict(dict)  # Caching for repeated queries

    def extract_text_from_pdf(self, pdf_file):
        """
        Extracts text from a PDF file.

        Args:
            pdf_file: The binary file-like object of the PDF.

        Returns:
            text: The extracted text from the PDF file, all pages concatenated.

        Raises:
            ValueError: If the PDF cannot be read or parsed.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # extract_text() may return None for pages without a text layer;
            # coalesce to "" so concatenation never raises TypeError.
            # "".join avoids quadratic string concatenation on large PDFs.
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)
        except Exception as e:
            raise ValueError(f"Error extracting text: {str(e)}")

    def generate_response(self, context, query, temperature, max_tokens, model):
        """
        Generates a response based on the given context and query.

        Args:
            context: The context for generating the response.
            query: The query or question.
            temperature: The sampling temperature for response generation.
            max_tokens: The maximum number of tokens for the response.
            model: The model ID to be used for generating the response.

        Returns:
            response: The generated response, or an "API request failed: ..."
            string on error (best-effort contract; no exception is raised).
        """
        # Serve from cache only when both the query and its context match;
        # .get() avoids auto-creating an entry in the defaultdict.
        cached = self.cache.get(query)
        if cached and cached.get("context") == context:
            return cached["response"]
        messages = [
            {"role": "system", "content": f"Context: {context}"},
            {"role": "user", "content": query},
        ]
        try:
            completion = self.client.chat.completions.create(
                model=model,  # Model ID
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            response = completion.choices[0].message.content
        except Exception as e:
            # Best-effort: report failure as a string; failures are not cached.
            return f"API request failed: {str(e)}"
        # Write the entry atomically so a concurrent read never sees a
        # context/response pair from two different calls.
        self.cache[query] = {"context": context, "response": response}
        return response

    def add_to_context(self, file_path: str):
        """
        Reads a PDF file and appends its content to the context for generating responses.

        Args:
            file_path: The path to the PDF file.

        Raises:
            ValueError: If the file cannot be opened or its text extracted.
        """
        try:
            with open(file_path, "rb") as pdf_file:
                context = self.extract_text_from_pdf(pdf_file)
                self.contexts.append(context)
        except Exception as e:
            raise ValueError(f"Error processing PDF: {str(e)}")

    def remove_from_context(self, index: int):
        """
        Removes a document from the context based on its index.

        Args:
            index: The index of the document to remove.

        Raises:
            ValueError: If the index is out of range.
        """
        if 0 <= index < len(self.contexts):
            self.contexts.pop(index)
        else:
            raise ValueError("Invalid index for removing context.")

    def get_combined_context(self):
        """
        Combines all contexts into a single context string.

        Returns:
            combined_context: The combined context from all documents,
            joined with newlines.
        """
        return "\n".join(self.contexts)

    def get_response(self, question: str, temperature: float, max_tokens: int, model: str):
        """
        Generates a response based on the given question and the current combined context.

        Args:
            question: The user's question.
            temperature: The sampling temperature for response generation.
            max_tokens: The maximum number of tokens for the response.
            model: The model ID to be used for generating the response.

        Returns:
            response: The generated response, or a prompt to upload a document
            when no context has been added yet.
        """
        if not self.contexts:
            return "Please upload a document."
        combined_context = self.get_combined_context()
        return self.generate_response(combined_context, question, temperature, max_tokens, model)

    def clear(self):
        """
        Clears the current contexts and the response cache.
        """
        self.contexts = []
        self.cache.clear()