Spaces:

ADOPLE
/

Contract_Management

Runtime error

App Files Files Community

Contract_Management / summary_extractor.py

robertselvam

Upload 5 files

a2780b1 about 2 years ago

raw

history blame

4.22 kB

	import openai
	import json
	from typing import Dict
	import os
	from typing import List
	from langchain.chat_models import ChatOpenAI
	from langchain.document_loaders import PyPDFLoader
	from langchain.chains.mapreduce import MapReduceChain
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.chains.summarize import load_summarize_chain
	from langchain.prompts import PromptTemplate


	class Extractor:
	    """Summarize a PDF document with an OpenAI chat model through LangChain.

	    The pipeline has three stages, each building on the previous one:

	    1. ``_document_loader`` loads the PDF and splits it into documents.
	    2. ``_document_text_spilliter`` re-splits those documents into smaller,
	       overlapping chunks.
	    3. ``_refine_summary`` runs a "refine" summarization chain over the
	       chunks and returns the final summary text.

	    Every stage is best-effort: a failure is reported on stdout and the
	    stage returns ``None`` instead of raising, so callers must be prepared
	    for a ``None`` result. The instance keeps no state; the PDF to process
	    is passed to each method.
	    """

	    # Prompt handed to the chain as ``question_prompt``. Built from explicit
	    # "\n" pieces so its whitespace does not depend on source indentation.
	    _SUMMARY_TEMPLATE = (
	        "Write a concise summary of the following:\n"
	        "{text}\n"
	        "CONCISE SUMMARY:"
	    )

	    # Prompt handed to the chain as ``refine_prompt``. Adjacent literals are
	    # concatenated verbatim, so every piece ends with an explicit space or
	    # newline; otherwise sentences fuse together ("summary(only if needed)").
	    _REFINE_TEMPLATE = (
	        "Your job is to produce a final summary\n"
	        "We have provided an existing summary up to a certain point: {existing_answer}\n"
	        "We have the opportunity to refine the existing summary "
	        "(only if needed) with some more context below.\n"
	        "------------\n"
	        "{text}\n"
	        "------------\n"
	        "Given the new context, refine the original summary.\n"
	        "If the context isn't useful, return the original summary."
	    )

	    # Chunking parameters passed to ``from_tiktoken_encoder`` (token-based
	    # according to the LangChain text-splitter documentation).
	    _CHUNK_SIZE = 1000
	    _CHUNK_OVERLAP = 200

	    def __init__(self):
	        """Initialize the Extractor; there is no per-instance state to set up.

	        The OpenAI API key is not configured here. NOTE(review): it is
	        presumably supplied through the ``OPENAI_API_KEY`` environment
	        variable of the deployment -- confirm.
	        """

	    @staticmethod
	    def _resolve_pdf_path(pdf_file_path) -> str:
	        """Return the filesystem path designated by ``pdf_file_path``.

	        Args:
	            pdf_file_path: Either a path (``str`` or ``os.PathLike``) or an
	                uploaded-file style object exposing the path as ``.name``.

	        Returns:
	            str: The path as a string.

	        Raises:
	            AttributeError: If the value is neither a path nor an object
	                with a ``name`` attribute.
	        """
	        # Paths are checked first: ``pathlib.Path.name`` is only the final
	        # component, so reading ``.name`` from a Path would drop the directory.
	        if isinstance(pdf_file_path, (str, os.PathLike)):
	            return os.fspath(pdf_file_path)
	        return pdf_file_path.name

	    def _document_loader(self, pdf_file_path) -> List:
	        """Load the PDF and split it into documents.

	        Args:
	            pdf_file_path: Path to the PDF, or a file object whose ``.name``
	                is that path (see ``_resolve_pdf_path``).

	        Returns:
	            List: The documents produced by ``PyPDFLoader.load_and_split()``,
	            or ``None`` if loading failed (the error is printed).
	        """
	        try:
	            loader = PyPDFLoader(self._resolve_pdf_path(pdf_file_path))
	            return loader.load_and_split()
	        except Exception as e:
	            # Deliberate best-effort boundary: report and signal failure
	            # with None rather than propagate.
	            print(f"Error while loading and splitting the document: {e}")
	            return None

	    def _document_text_spilliter(self, pdf_file_path) -> List:
	        """Split the document into smaller, overlapping chunks.

	        The misspelled method name is kept for backward compatibility.

	        Args:
	            pdf_file_path: Path to the PDF, or a file object whose ``.name``
	                is that path.

	        Returns:
	            List: The chunked documents, or ``None`` if loading or splitting
	            failed (the error is printed).
	        """
	        docs = self._document_loader(pdf_file_path)
	        if docs is None:
	            # The loader already reported its failure; do not pile a second,
	            # misleading splitter error on top of it.
	            return None

	        try:
	            text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
	                chunk_size=self._CHUNK_SIZE,
	                chunk_overlap=self._CHUNK_OVERLAP,
	            )
	            return text_splitter.split_documents(docs)
	        except Exception as e:
	            print(f"Error while splitting document text: {e}")
	            return None

	    def _refine_summary(self, pdf_file_path) -> str:
	        """Generate a refined summary of the document with a chat model.

	        Args:
	            pdf_file_path: Path to the PDF, or a file object whose ``.name``
	                is that path.

	        Returns:
	            str: The chain's ``output_text``, or ``None`` if any stage failed
	            or the document produced no chunks (the error is printed).
	        """
	        split_docs = self._document_text_spilliter(pdf_file_path)
	        if split_docs is None:
	            # An earlier stage failed and has already printed why.
	            return None
	        if not split_docs:
	            print(
	                "Error while generating refined summary: "
	                "no text could be extracted from the document"
	            )
	            return None

	        try:
	            prompt = PromptTemplate.from_template(self._SUMMARY_TEMPLATE)
	            refine_prompt = PromptTemplate.from_template(self._REFINE_TEMPLATE)

	            # "refine" chain: summarizes with ``question_prompt`` and updates
	            # the running summary with ``refine_prompt``. Intermediate steps
	            # are requested, but only the final text is returned below.
	            chain = load_summarize_chain(
	                llm=ChatOpenAI(temperature=0),
	                chain_type="refine",
	                question_prompt=prompt,
	                refine_prompt=refine_prompt,
	                return_intermediate_steps=True,
	                input_key="input_documents",
	                output_key="output_text",
	            )

	            result = chain({"input_documents": split_docs}, return_only_outputs=True)
	            return result["output_text"]
	        except Exception as e:
	            print(f"Error while generating refined summary: {e}")
	            return None