Support txt and pdf
Files changed:
- aimakerspace/openai_utils/embedding.py +12 -0
- aimakerspace/pdfloader.py +60 -0
- aimakerspace/semantic_chunking.py +60 -0
- app.py +41 -13
- requirements.txt +6 -1
aimakerspace/openai_utils/embedding.py
CHANGED
@@ -48,6 +48,18 @@ class EmbeddingModel:
 
         return embedding.data[0].embedding
 
+    def embed_documents(self, list_of_text: List[str]) -> List[List[float]]:
+        """
+        Embed a list of documents (text strings) using the OpenAI embeddings model.
+
+        Args:
+            list_of_text (List[str]): A list of text strings to be embedded.
+
+        Returns:
+            List[List[float]]: A list of embedding vectors.
+        """
+        return self.get_embeddings(list_of_text)
+
 
 if __name__ == "__main__":
     embedding_model = EmbeddingModel()
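The new `embed_documents` method gives `EmbeddingModel` the method name that LangChain's embeddings interface expects, which is what lets an instance be handed to `SemanticChunker` below. A minimal usage sketch (assumes `OPENAI_API_KEY` is set in the environment; the sample strings are illustrative):

    from aimakerspace.openai_utils.embedding import EmbeddingModel

    embedding_model = EmbeddingModel()
    # One embedding vector comes back per input string.
    vectors = embedding_model.embed_documents(["hello world", "goodbye moon"])
    print(len(vectors))     # 2
    print(len(vectors[0]))  # embedding dimension, e.g. 1536 for text-embedding-ada-002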
aimakerspace/pdfloader.py
ADDED
@@ -0,0 +1,60 @@
+from langchain_community.document_loaders import PyPDFLoader
+
+
+class PDFLoader:
+    def __init__(self, file_path: str):
+        """
+        Initialize the PDFLoader class with the path to the PDF file.
+
+        Args:
+            file_path (str): The path to the PDF file to be loaded.
+        """
+        self.file_path = file_path
+        self.loader = PyPDFLoader(self.file_path)
+        self.pages = None
+
+    def load_and_split(self):
+        """
+        Load and split the PDF file into pages.
+
+        Returns:
+            list: A list of pages after loading and splitting the PDF file.
+        """
+        self.pages = self.loader.load_and_split()
+        return self.pages
+
+    def get_page(self, page_number: int):
+        """
+        Get a specific page from the loaded PDF.
+
+        Args:
+            page_number (int): The page number to retrieve.
+
+        Returns:
+            dict: The content of the specified page.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        if page_number < 1 or page_number > len(self.pages):
+            raise ValueError(
+                f"Page number out of range. Please choose a value between 1 and {len(self.pages)}."
+            )
+
+        return self.pages[page_number - 1]
+
+    def get_total_pages(self):
+        """
+        Get the total number of pages in the PDF.
+
+        Returns:
+            int: The total number of pages.
+        """
+        if self.pages is None:
+            raise ValueError(
+                "The PDF has not been loaded yet. Call load_and_split() first."
+            )
+
+        return len(self.pages)
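A quick sketch of how `PDFLoader` is meant to be driven (the file path is a placeholder; `load_and_split` must run before the page accessors, and each page comes back as a LangChain `Document` rather than the dict the docstring mentions):

    from aimakerspace.pdfloader import PDFLoader

    loader = PDFLoader("example.pdf")   # placeholder path
    pages = loader.load_and_split()     # populates loader.pages
    print(loader.get_total_pages())     # total page count
    first = loader.get_page(1)          # pages are 1-indexed here
    print(first.page_content[:200])     # Document objects expose .page_content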
aimakerspace/semantic_chunking.py
ADDED
@@ -0,0 +1,60 @@
+from langchain_experimental.text_splitter import SemanticChunker
+from typing import List
+
+
+class SemanticChunking:
+    def __init__(self, embedding_model, breakpoint_threshold_type="percentile"):
+        """
+        Initialize the SemanticChunking class with a specified breakpoint threshold type and an embedding model.
+
+        Args:
+            embedding_model: An instance of the EmbeddingModel class to generate embeddings.
+            breakpoint_threshold_type (str): The type of breakpoint threshold to use for chunking.
+                Options include 'percentile', 'standard_deviation', 'interquartile'.
+        """
+        self.text_splitter = SemanticChunker(
+            embedding_model, breakpoint_threshold_type=breakpoint_threshold_type
+        )
+
+    def split_text(self, text: str) -> List:
+        """
+        Split the provided text into semantic chunks.
+
+        Args:
+            text (str): The text to be split into chunks.
+
+        Returns:
+            list: A list of documents (chunks) obtained from the text.
+        """
+        docs = self.text_splitter.create_documents([text])
+        return docs
+
+    def get_chunk(self, docs: List, chunk_index: int) -> str:
+        """
+        Get a specific chunk from the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+            chunk_index (int): The index of the chunk to retrieve.
+
+        Returns:
+            str: The content of the specified chunk.
+        """
+        if chunk_index < 0 or chunk_index >= len(docs):
+            raise ValueError(
+                f"Chunk index out of range. Please choose a value between 0 and {len(docs) - 1}."
+            )
+
+        return docs[chunk_index].page_content
+
+    def get_total_chunks(self, docs: List) -> int:
+        """
+        Get the total number of chunks in the list of documents.
+
+        Args:
+            docs (list): The list of documents (chunks).
+
+        Returns:
+            int: The total number of chunks.
+        """
+        return len(docs)
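Because `SemanticChunker` calls `embed_documents` on whatever embedding object it is given, the patched `EmbeddingModel` from the first file can be passed in directly. A minimal sketch (sample text is illustrative; chunk boundaries depend on the embeddings):

    from aimakerspace.openai_utils.embedding import EmbeddingModel
    from aimakerspace.semantic_chunking import SemanticChunking

    chunker = SemanticChunking(EmbeddingModel(), breakpoint_threshold_type="percentile")
    docs = chunker.split_text("Cats are small felines. They purr. GDP grew 3% last year.")
    for i in range(chunker.get_total_chunks(docs)):
        print(i, chunker.get_chunk(docs, i))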
app.py
CHANGED
@@ -7,9 +7,11 @@ from aimakerspace.openai_utils.prompts import (
     SystemRolePrompt,
     AssistantRolePrompt,
 )
+from aimakerspace.pdfloader import PDFLoader
 from aimakerspace.openai_utils.embedding import EmbeddingModel
 from aimakerspace.vectordatabase import VectorDatabase
 from aimakerspace.openai_utils.chatmodel import ChatOpenAI
+from aimakerspace.semantic_chunking import SemanticChunking
 import chainlit as cl
 
 system_template = """\
@@ -25,6 +27,7 @@ Question:
 """
 user_role_prompt = UserRolePrompt(user_prompt_template)
 
+
 class RetrievalAugmentedQAPipeline:
     def __init__(self, llm: ChatOpenAI(), vector_db_retriever: VectorDatabase) -> None:
         self.llm = llm
@@ -39,29 +42,55 @@ class RetrievalAugmentedQAPipeline:
 
         formatted_system_prompt = system_role_prompt.create_message()
 
-        formatted_user_prompt = user_role_prompt.create_message(question=user_query, context=context_prompt)
+        formatted_user_prompt = user_role_prompt.create_message(
+            question=user_query, context=context_prompt
+        )
 
         async def generate_response():
-            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
+            async for chunk in self.llm.astream(
+                [formatted_system_prompt, formatted_user_prompt]
+            ):
                 yield chunk
 
         return {"response": generate_response(), "context": context_list}
 
+
 text_splitter = CharacterTextSplitter()
+embedding_model = EmbeddingModel()
+chunker = SemanticChunking(embedding_model, breakpoint_threshold_type="percentile")
 
 
 def process_text_file(file: AskFileResponse):
     import tempfile
 
-    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
+    file_extension = os.path.splitext(file.name)[1].lower()
+    if file_extension == ".txt":
+        suffix = ".txt"
+    elif file_extension == ".pdf":
+        suffix = ".pdf"
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", delete=False, suffix=suffix
+    ) as temp_file:
         temp_file_path = temp_file.name
 
     with open(temp_file_path, "wb") as f:
         f.write(file.content)
 
-    text_loader = TextFileLoader(temp_file_path)
-    documents = text_loader.load_documents()
-    texts = text_splitter.split_texts(documents)
+    if suffix == ".txt":
+        file_loader = TextFileLoader(temp_file_path)
+    elif suffix == ".pdf":
+        file_loader = PDFLoader(temp_file_path)
+    else:
+        raise ValueError("Unsupported file type. Please upload a .txt or .pdf file.")
+    documents = file_loader.load_documents()
+    split_pages = []
+    for doc in documents:
+        split_pages += chunker.split_text(doc.page_content)
+    texts = [i.page_content for i in split_pages]
+    # texts = text_splitter.split_texts(documents)
     return texts
 
 
@@ -72,8 +101,8 @@ async def on_chat_start():
     # Wait for the user to upload a file
     while files == None:
         files = await cl.AskFileMessage(
-            content="Please upload a Text File file to begin!",
-            accept=["text/plain"],
+            content="Please upload a Text or PDF File file to begin!",
+            accept=["text/plain", "application/pdf"],
             max_size_mb=2,
             timeout=180,
         ).send()
@@ -93,15 +122,14 @@ async def on_chat_start():
     # Create a dict vector store
     vector_db = VectorDatabase()
     vector_db = await vector_db.abuild_from_list(texts)
-
+
     chat_openai = ChatOpenAI()
 
     # Create a chain
     retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
-        vector_db_retriever=vector_db,
-        llm=chat_openai
+        vector_db_retriever=vector_db, llm=chat_openai
    )
-
+
     # Let the user know that the system is ready
     msg.content = f"Processing `{file.name}` done. You can now ask questions!"
     await msg.update()
@@ -119,4 +147,4 @@ async def main(message):
     async for stream_resp in result["response"]:
         await msg.stream_token(stream_resp)
 
-    await msg.send()
+    await msg.send()
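Net effect: ingestion moves from fixed-size `CharacterTextSplitter` chunks (now left commented out) to semantic chunks, and uploads are routed by extension to `TextFileLoader` or `PDFLoader`. One caveat worth flagging: `PDFLoader` as written exposes `load_and_split()`, not the `load_documents()` that the new PDF branch calls, so one of the two names would need aligning. A rough sketch of the intended PDF flow outside the Chainlit handlers (placeholder path; uses `load_and_split()` per the class definition):

    from aimakerspace.openai_utils.embedding import EmbeddingModel
    from aimakerspace.pdfloader import PDFLoader
    from aimakerspace.semantic_chunking import SemanticChunking

    chunker = SemanticChunking(EmbeddingModel())
    pages = PDFLoader("example.pdf").load_and_split()  # placeholder path
    split_pages = []
    for page in pages:
        split_pages += chunker.split_text(page.page_content)
    texts = [doc.page_content for doc in split_pages]  # what feeds VectorDatabase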
requirements.txt
CHANGED
@@ -1,3 +1,8 @@
 numpy
 chainlit==0.7.700
-openai
+openai
+langchain
+langchain-community
+langchain-experimental
+langchain-openai
+langchain-core