# Spaces:
# Runtime error
# Runtime error
import pinecone | |
import os | |
import PyPDF2 | |
import shutil | |
import gradio as gr | |
from tqdm import tqdm | |
from pydantic import Field | |
from typing import List, Optional | |
from langchain.load.serializable import Serializable | |
from langchain.vectorstores import Pinecone | |
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, SAVE_DIR | |
from config import OPENAI_API_BASE, OPENAI_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.text_splitter import TokenTextSplitter | |
class Document(Serializable):
    """Class for storing a piece of text and associated metadata.

    NOTE(review): this looks like a local copy of LangChain's own document
    schema -- confirm whether the library class can be imported instead.
    """

    page_content: str
    """String text."""
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
    documents, etc.).
    """
# Location of the PDF to index. Built with os.path.join so the separator is
# right on every platform: the previous hard-coded backslash path resolved
# only on Windows and relied on an invalid string escape sequence.
filepath = os.path.join("documents", "STANDARD_SOFTWARE LIFECYCLES.pdf")
pdftext = ""
# Splitter configured with chunk_size=500 and chunk_overlap=30.
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)
doc_chunks = []
documents = []

with open(filepath, "rb") as pdfFileObj:
    pdf_reader = PyPDF2.PdfReader(pdfFileObj)
    # Join once instead of growing the string with += for every page;
    # `or ""` keeps the join safe if a page yields no extractable text.
    pdftext = "".join(
        page.extract_text() or "" for page in tqdm(pdf_reader.pages)
    )

# Wrap the whole PDF text in a single Document, then let the splitter
# break it into chunks that are collected in `documents`.
texts = [Document(page_content=pdftext, metadata={"source": filepath})]
texts = text_splitter.split_documents(texts)
documents.extend(texts)
print(documents[:3])
# for (idx, docs) in enumerate(documents): | |
# docs.page_content = f"[{idx}] " + docs.page_content | |
def add_source_numbers(lst, source_name="Source", use_source=True):
    """Prefix each entry of ``lst`` with a 1-based ``[n]`` reference number.

    Args:
        lst: Items to number. When ``use_source`` is true, each item must be
            indexable with the text at index 0 and its source at index 1;
            otherwise each item is formatted as-is.
        source_name: Label written before the source on the second line.
        use_source: Whether to append a ``"<source_name>: <source>"`` line
            after the quoted text.

    Returns:
        A new list of formatted strings, one per input item.
    """
    if use_source:
        return [
            f'[{number}]\t "{item[0]}"\n{source_name}: {item[1]}'
            for number, item in enumerate(lst, start=1)
        ]
    return [f'[{number}]\t "{item}"' for number, item in enumerate(lst, start=1)]
# Rewrite every chunk in place as a numbered, quoted reference followed by the
# base name of the file it came from.
for (idx, d) in enumerate(documents):
    # Strip the U+FFFD replacement character from both ends of the chunk text.
    item = [d.page_content.strip("�"), os.path.basename(d.metadata["source"])]
    d.page_content = f'[{idx+1}]\t "{item[0]}"\nSource: {item[1]}'
# print(reference_results) | |
# print("----------------") | |
# print(documents[:3]) | |
def add_details(lst):
    """Wrap each string of ``lst`` in a collapsible HTML ``<details>`` element.

    The ``<summary>`` shows the first 25 characters of the text with newlines
    removed, followed by ``...``; the full text goes inside a ``<p>``.

    Args:
        lst: Iterable of strings to wrap.

    Returns:
        A new list of HTML snippets, one per input string, in input order.
    """
    return [
        "<details><summary>{}...</summary><p>{}</p></details>".format(
            txt[:25].replace("\n", ""), txt
        )
        for txt in lst
    ]
# Preview: take the text of the first three chunks and render each one as a
# collapsible HTML snippet.
reference_results = [d.page_content for d in documents[:3]]
display_append = add_details(reference_results)
print(display_append)