Spaces:
Runtime error
Runtime error
import json
import os
from typing import Dict
from typing import List
from typing import Optional

import openai
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
| class Extractor: | |
| """ | |
| This class handles the extraction of tags from a PDF document. | |
| Attributes: | |
| config (dict): Configuration settings loaded from a JSON file. | |
| pdf_file_path (str): Path to the input PDF file. | |
| """ | |
def __init__(self):
    """
    Create an Extractor instance.

    No instance attributes are assigned here; the methods of this class
    receive the PDF to process as an argument.
    """
    # API key setup is disabled in code; the line below is kept only as a
    # reminder of the environment variable name.
    # os.environ["OPENAI_API_KEY"] = ""
def _document_loader(self, pdf_file_path) -> Optional[list]:
    """
    Load the PDF document and split it into page-level documents.

    Args:
        pdf_file_path: Either a filesystem path (``str`` or
            ``os.PathLike``) or a file-like object that exposes its path
            as ``.name`` (presumably an uploaded temporary file -- confirm
            against the caller).

    Returns:
        The list produced by ``PyPDFLoader.load_and_split()`` (document
        objects, not plain strings), or ``None`` when loading fails. The
        failure is printed, not raised.
    """
    try:
        if isinstance(pdf_file_path, (str, os.PathLike)):
            # For a pathlib.Path, ``.name`` is only the final component,
            # so convert the whole path instead of reading the attribute.
            source_path = os.fspath(pdf_file_path)
        else:
            source_path = pdf_file_path.name
        loader = PyPDFLoader(source_path)
        return loader.load_and_split()
    except Exception as e:
        print(f"Error while loading and splitting the document: {str(e)}")
        # Explicit so callers can test for failure with ``is None``.
        return None
def _document_text_spilliter(
    self, pdf_file_path, chunk_size=1000, chunk_overlap=200
) -> Optional[list]:
    """
    Load the document and split its text into smaller chunks.

    Args:
        pdf_file_path: Passed unchanged to ``_document_loader``.
        chunk_size: Chunk size handed to the tiktoken-based splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter.

    Returns:
        The list returned by ``split_documents`` for the loaded pages, or
        ``None`` when loading or splitting fails. Failures are printed,
        not raised.
    """
    try:
        # Load the page-level documents
        docs = self._document_loader(pdf_file_path)
        if docs is None:
            # The loader has already printed its error; passing None to
            # the splitter would only add a second, misleading one.
            return None
        # Initialize the text splitter with the requested size and overlap
        text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        # Split the documents into chunks
        return text_splitter.split_documents(docs)
    except Exception as e:
        print(f"Error while splitting document text: {str(e)}")
        return None
def _refine_summary(self, pdf_file_path) -> Optional[str]:
    """
    Generate a refined summary of the document with a "refine" chain.

    The document is chunked by ``_document_text_spilliter``; the first
    chunk is summarised with the initial prompt and every following chunk
    is used to refine the running summary.

    Args:
        pdf_file_path: Passed unchanged to ``_document_text_spilliter``.

    Returns:
        The chain's ``output_text``, or ``None`` when the document could
        not be loaded or split, produced no chunks, or the chain failed.
        Failures are printed, not raised.
    """
    try:
        # Split documents into chunks for efficient processing
        split_docs = self._document_text_spilliter(pdf_file_path)
        if split_docs is None:
            # An earlier stage already printed the cause; do not build the
            # model or call the chain with unusable input.
            return None
        if not split_docs:
            print(
                "Error while generating refined summary: "
                "no text chunks were produced from the document"
            )
            return None

        # Prompt used for the first chunk
        prompt_template = """Write a concise summary of the following:
{text}
CONCISE SUMMARY:"""
        prompt = PromptTemplate.from_template(prompt_template)

        # Prompt used to refine the summary with each further chunk.
        # Adjacent literals are concatenated verbatim, so each fragment
        # must carry its own trailing space or newline.
        refine_template = (
            "Your job is to produce a final summary\n"
            "We have provided an existing summary up to a certain point: {existing_answer}\n"
            "We have the opportunity to refine the existing summary "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{text}\n"
            "------------\n"
            "Given the new context, refine the original summary. "
            "If the context isn't useful, return the original summary."
        )
        refine_prompt = PromptTemplate.from_template(refine_template)

        # Load the summarization chain using the ChatOpenAI language model
        chain = load_summarize_chain(
            llm=ChatOpenAI(temperature=0),
            chain_type="refine",
            question_prompt=prompt,
            refine_prompt=refine_prompt,
            return_intermediate_steps=True,
            input_key="input_documents",
            output_key="output_text",
        )

        # Run the chain; only the final summary is returned to the caller
        result = chain({"input_documents": split_docs}, return_only_outputs=True)
        return result["output_text"]
    except Exception as e:
        print(f"Error while generating refined summary: {str(e)}")
        return None