"""Load a PDF, split it into token chunks, and build numbered reference snippets.

Running this module reads the PDF at ``filepath``, concatenates the text of
all pages, splits it with ``TokenTextSplitter``, prefixes every chunk with a
1-based index and its source file name, and prints a preview of the first
three chunks before and after formatting.
"""

import os
import shutil
from typing import List, Optional

import gradio as gr
import pinecone
import PyPDF2
from langchain.embeddings import OpenAIEmbeddings
from langchain.load.serializable import Serializable
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import Pinecone
from pydantic import Field
from tqdm import tqdm

from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, SAVE_DIR
from config import OPENAI_API_BASE, OPENAI_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID


class Document(Serializable):
    """Class for storing a piece of text and associated metadata."""

    page_content: str
    """String text."""
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
    documents, etc.).
    """


# Built with os.path.join instead of a hard-coded "documents\..." literal:
# "\S" is an invalid escape sequence (a warning today, an error in future
# Python versions), and a literal backslash is only a separator on Windows,
# which would make os.path.basename() below return the whole path elsewhere.
filepath = os.path.join("documents", "STANDARD_SOFTWARE LIFECYCLES.pdf")
pdftext = ""
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)
doc_chunks = []
documents = []

with open(filepath, "rb") as pdfFileObj:
    pdf_reader = PyPDF2.PdfReader(pdfFileObj)
    # Join once instead of repeated "+=" on a growing string. The "or ''" is
    # defensive: a page for which no text comes back contributes nothing.
    pdftext = "".join(
        page.extract_text() or "" for page in tqdm(pdf_reader.pages)
    )

# The whole PDF becomes a single Document, which the splitter then cuts into
# chunks (chunk_size=500, chunk_overlap=30, as configured above).
texts = [Document(page_content=pdftext, metadata={"source": filepath})]
texts = text_splitter.split_documents(texts)
documents.extend(texts)
print(documents[:3])

# for (idx, docs) in enumerate(documents):
#     docs.page_content = f"[{idx}] " + docs.page_content


def add_source_numbers(lst, source_name="Source", use_source=True):
    """Return the entries of ``lst`` as numbered, quoted strings.

    Args:
        lst: When ``use_source`` is true, a sequence of ``(text, source)``
            pairs (only indexes 0 and 1 are read); otherwise a sequence of
            values that are formatted directly.
        source_name: Label written in front of the source on the second line.
        use_source: Whether each entry carries a source to be appended.

    Returns:
        A list of strings shaped like ``[1]<TAB> "text"`` and, when
        ``use_source`` is true, followed by a ``<source_name>: <source>`` line.
        Numbering starts at 1.
    """
    if use_source:
        return [
            f'[{idx+1}]\t "{item[0]}"\n{source_name}: {item[1]}'
            for idx, item in enumerate(lst)
        ]
    return [f'[{idx+1}]\t "{item}"' for idx, item in enumerate(lst)]


# Rewrite every chunk in place as `[n]<TAB> "<text>"` plus a "Source:" line
# holding the file name, the same layout add_source_numbers() produces with
# its default label.
for idx, d in enumerate(documents):
    # Strip U+FFFD replacement characters from both ends of the chunk.
    chunk_text = d.page_content.strip("�")
    source_file = os.path.basename(d.metadata["source"])
    d.page_content = f'[{idx+1}]\t "{chunk_text}"\nSource: {source_file}'

# print(reference_results)
# print("----------------")
# print(documents[:3])


def add_details(lst):
    """Wrap every text in ``lst`` in a short-preview-plus-full-text block.

    Args:
        lst: Iterable of strings.

    Returns:
        A list with one string per input. Each string consists of a preview
        (the first 25 characters of the text with newlines removed, followed
        by ``...``), a blank line, the full text, and a trailing blank line.
    """
    nodes = []
    for txt in lst:
        brief = txt[:25].replace("\n", "")
        # The original literal was split across physical lines, which is not
        # valid for a single-quoted f-string; the same text is written here
        # with explicit "\n" escapes.
        # NOTE(review): the layout looks like HTML markup (e.g. details /
        # summary tags) may have been stripped from this template - confirm
        # against the upstream source before relying on the plain-text form.
        nodes.append(f"\n{brief}...\n\n{txt}\n\n")
    return nodes


reference_results = [d.page_content for d in documents[:3]]
display_append = add_details(reference_results)
print(display_append)