Spaces:
Runtime error
Runtime error
import ast
import json
import os

import PyPDF2
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
def read_pages(pdf_file):
    """Return the extracted text of every page in a PDF.

    Args:
        pdf_file: The PDF source, passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A list with one entry per page, in document order, holding the
        result of ``extract_text()`` for that page.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    return [pdf_page.extract_text() for pdf_page in pdf.pages]
def get_chunks(file_path):
    """Load a PDF and split its documents into overlapping chunks.

    Args:
        file_path: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` from the
        loaded documents.
    """
    loaded_docs = PyPDFLoader(file_path).load()

    # 300-character chunks (measured with len) overlapping by 100;
    # add_start_index=True is requested so each chunk records its offset.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents(loaded_docs)
def get_vectordb(chunks, CHROMA_PATH):
    """Open the Chroma store for a collection, building it on first use.

    Args:
        chunks: Documents to embed when the store does not exist yet.
        CHROMA_PATH: Collection name; the store lives under
            ``../../data/chroma/<CHROMA_PATH>`` relative to the working
            directory.

    Returns:
        The ``Chroma`` instance backed by that directory.
    """
    store_dir = f"../../data/chroma/{CHROMA_PATH}"

    if not os.path.exists(store_dir):
        # First run for this collection: embed the chunks and write them out.
        store = Chroma.from_documents(
            chunks, OpenAIEmbeddings(), persist_directory=store_dir
        )
        store.persist()
        print(f"Saved {len(chunks)} chunks to {store_dir}.")
        return store

    # The directory already exists, so reuse it; `chunks` is not read here.
    return Chroma(persist_directory=store_dir, embedding_function=OpenAIEmbeddings())
def _parse_summary_response(response_text):
    """Decode the model's reply into a Python object without executing it."""
    cleaned = response_text.strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # SECURITY NOTE: this reply used to be passed to eval(), which runs
        # whatever code the model emits and also rejects valid JSON containing
        # true/false/null. literal_eval still accepts replies written as a
        # Python literal (e.g. single-quoted keys) but cannot execute code.
        return ast.literal_eval(cleaned)


def gen_summary(text, db):
    """Summarize a story chunk using context retrieved from a vector store.

    Args:
        text: The story chunk to summarize; it is embedded verbatim in the
            instructions sent to the model.
        db: Vector store exposing ``similarity_search_with_relevance_scores``.

    Returns:
        The parsed model reply. The prompt asks for an object with
        ``"summary"`` and ``"text"`` keys, but the reply is not validated
        against that shape.

    Raises:
        ValueError, SyntaxError: If the reply is neither valid JSON nor a
            Python literal.
    """
    PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
---
Answer the question based on the above context: {question}
"""
    # Doubled braces render as literal braces in the f-string. The example
    # object must itself be valid JSON, since the model tends to copy it.
    query_text = f"""
Summarize the given chunk from a story. The summary should be of narrartive nature and be around 5-7 sentences long.
```{text}```
Generate response in the following JSON format:
{{
"summary": "Your summary here.",
"text": "The original text here."
}}
"""
    # The full instruction text (not just the chunk) is the retrieval query.
    results = db.similarity_search_with_relevance_scores(query_text, k=5)
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    model = ChatOpenAI()
    response_text = model.predict(prompt)
    return _parse_summary_response(response_text)