Chat_QnA_v2 / test.py
binh99's picture
abcd
8c5ce8c
raw
history blame
2.38 kB
import pinecone
import os
import PyPDF2
import shutil
import gradio as gr
from tqdm import tqdm
from pydantic import Field
from typing import List, Optional
from langchain.load.serializable import Serializable
from langchain.vectorstores import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, SAVE_DIR
from config import OPENAI_API_BASE, OPENAI_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
class Document(Serializable):
    """A chunk of text bundled with the metadata that describes it."""

    # The text itself; declared without a default value.
    page_content: str

    # Free-form information about the text (e.g., its source or its
    # relationships to other documents). Declared with ``default_factory=dict``
    # so the default is a newly created empty dict.
    metadata: dict = Field(default_factory=dict)
# Ingest one PDF: extract the text of every page, wrap it in a single
# Document, split that into token-based chunks, and collect the chunks in
# ``documents``.

# Build the path with os.path.join instead of a backslash literal: the
# backslash form contains the invalid escape sequence "\S" and is only
# understood as a path separator on Windows.
filepath = os.path.join("documents", "STANDARD_SOFTWARE LIFECYCLES.pdf")
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)
doc_chunks = []
documents = []

with open(filepath, "rb") as pdfFileObj:
    pdf_reader = PyPDF2.PdfReader(pdfFileObj)
    # Join the per-page text once rather than growing a string with "+=";
    # a page that yields no text contributes an empty string.
    pdftext = "".join(
        page.extract_text() or "" for page in tqdm(pdf_reader.pages)
    )

# The whole PDF becomes one Document whose "source" is the file path; the
# splitter then breaks it into chunks of 500 tokens with 30 tokens of overlap.
texts = [Document(page_content=pdftext, metadata={"source": filepath})]
texts = text_splitter.split_documents(texts)
documents.extend(texts)
print(documents[:3])
# for (idx, docs) in enumerate(documents):
# docs.page_content = f"[{idx}] " + docs.page_content
def add_source_numbers(lst, source_name="Source", use_source=True):
    """Prefix every entry of ``lst`` with a 1-based ``[n]`` marker.

    Args:
        lst: With ``use_source`` true, a sequence of entries indexed as
            ``entry[0]`` (text) and ``entry[1]`` (source); otherwise a
            sequence of values that are formatted as-is.
        source_name: Label written before the source on the second line.
        use_source: Whether each entry carries a source to append.

    Returns:
        A list of formatted strings, one per entry, in input order.
    """
    if not use_source:
        return [
            f'[{number}]\t "{entry}"'
            for number, entry in enumerate(lst, start=1)
        ]

    numbered = []
    for number, entry in enumerate(lst, start=1):
        text, source = entry[0], entry[1]
        numbered.append(f'[{number}]\t "{text}"\n{source_name}: {source}')
    return numbered
# Rewrite each chunk in place as a numbered, quoted reference followed by the
# base name of the file recorded in its "source" metadata.
for position, doc in enumerate(documents, start=1):
    quoted_text = doc.page_content.strip("�")
    origin = os.path.basename(doc.metadata["source"])
    doc.page_content = f'[{position}]\t "{quoted_text}"\nSource: {origin}'
# print(reference_results)
# print("----------------")
# print(documents[:3])
def add_details(lst):
    """Wrap each text in a collapsible HTML ``<details>`` element.

    The ``<summary>`` is the first 25 characters of the text with newlines
    removed, followed by "..."; the full text goes inside a ``<p>`` tag.

    Args:
        lst: Iterable of strings to wrap.

    Returns:
        A list of HTML snippets, one per input string, in input order.
    """
    def _as_details(text):
        # Slice first, then drop newlines, so the summary may be shorter
        # than 25 characters.
        summary = text[:25].replace("\n", "")
        return f"<details><summary>{summary}...</summary><p>{text}</p></details>"

    return [_as_details(text) for text in lst]
# Preview: take the page content of the first three chunks, wrap each in a
# collapsible HTML element, and print the result.
preview_docs = documents[:3]
reference_results = [chunk.page_content for chunk in preview_docs]
display_append = add_details(reference_results)
print(display_append)