Spaces · Runtime error

Commit 69800ee · 1 parent 653f0aa

added exception handling

Files changed:
- pages/Load_Documents.py  (+72 −22)
- utils/_admin_util.py  (+139 −90)
pages/Load_Documents.py  CHANGED

@@ -1,14 +1,40 @@
 import os
+import openai
 from utils._admin_util import create_embeddings, create_vector_store, read_pdf_data, split_data
 import streamlit as st
 from dotenv import load_dotenv
+import requests
+
+def validate_api_key(api_key):
+    """Test if the API key is valid"""
+    try:
+        # Make a small test request to OpenAI
+        client = openai.OpenAI(api_key=api_key)
+        client.embeddings.create(input="test", model="text-embedding-ada-002")
+        return True
+    except openai.AuthenticationError:
+        st.error("❌ Invalid API key")
+        return False
+    except openai.PermissionDeniedError:
+        st.error("❌ Permission denied. Please check your API key's permissions")
+        return False
+    except Exception as e:
+        st.error(f"❌ API key validation error: {str(e)}")
+        return False
 
 def main():
     load_dotenv()
 
-    # Add API key verification
+    # Add detailed API key verification
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        st.error("❌ OpenAI API key not found! Please ensure it's set in the environment variables.")
+        st.info("To set up your API key:")
+        st.code("1. Go to Hugging Face Space settings\n2. Add OPENAI_API_KEY in Repository Secrets")
+        st.stop()
+
+    # Validate the API key
+    if not validate_api_key(api_key):
         st.stop()
 
     st.set_page_config(page_title="Dump PDFs to QDrant - Vector Store")
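
The verification added above is two-stage: a presence check on OPENAI_API_KEY, then a cheap live embeddings call so that revoked or mis-scoped keys fail before any PDF work starts. A standalone sketch of the same pattern outside Streamlit (assuming the openai v1 client; not part of this commit):

# Standalone sketch of the commit's fail-fast key check (hypothetical script).
import os
import sys
import openai

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    sys.exit("OPENAI_API_KEY is not set")
try:
    # Same cheap probe the commit uses: one tiny embeddings request
    openai.OpenAI(api_key=api_key).embeddings.create(
        input="test", model="text-embedding-ada-002"
    )
except openai.AuthenticationError:
    sys.exit("API key is invalid")
print("API key looks valid")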
@@ -24,31 +50,55 @@ def main():
 
         # Process each PDF file
         for pdf in uploaded_files:
+            try:
+                st.write(f"Processing: {pdf.name}")
+
+                # Extract text from PDF
+                text = read_pdf_data(pdf)
+                st.write(f"📄 Reading {pdf.name} done")
+
+                # Create chunks for this PDF
+                chunks = split_data(text)
+                all_chunks.extend(chunks)
+                st.write(f"📄 Splitting {pdf.name} into chunks done")
+            except Exception as e:
+                st.error(f"❌ Error processing {pdf.name}: {str(e)}")
+                continue
+
+        if not all_chunks:
+            st.error("❌ No valid chunks were created from the PDFs")
+            st.stop()
+
+        # Create embeddings with progress tracking
+        try:
+            st.write("Creating embeddings...")
+            embeddings = create_embeddings()
+            st.write("📄 Creating embeddings instance done")
+        except openai.RateLimitError:
+            st.error("❌ Rate limit exceeded. Please try again later.")
+            st.stop()
+        except Exception as e:
+            st.error(f"❌ Error creating embeddings: {str(e)}")
+            st.stop()
 
         # Create vector store with all chunks
+        try:
+            vector_store = create_vector_store(embeddings, all_chunks)
+            st.session_state.vector_store = vector_store
+        except Exception as e:
+            st.error(f"❌ Error creating vector store: {str(e)}")
+            st.stop()
 
-        st.success(f"Successfully processed {len(uploaded_files)} files and pushed embeddings to Qdrant")
+        st.success(f"✅ Successfully processed {len(uploaded_files)} files and pushed embeddings to Qdrant")
         st.write(f"Total chunks created: {len(all_chunks)}")
+
+    except requests.exceptions.RequestException as e:
+        st.error(f"❌ Network error: {str(e)}")
+        st.error("Please check your internet connection and try again.")
     except Exception as e:
-        st.error(f"An error occurred: {str(e)}")
+        st.error(f"❌ An unexpected error occurred: {str(e)}")
         st.error("Please check your API key and permissions.")
+        st.error("If the problem persists, please contact support.")
 
 if __name__ == '__main__':
     main()
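
The hunk above references uploaded_files, all_chunks, and an enclosing try: that live in the unchanged middle of the file (new lines 41–49, not shown in this diff). A hypothetical, runnable reduction of that surrounding flow, purely so the indentation above reads correctly; the real code may differ:

# Hypothetical reduction of main()'s upload flow (illustration, not from the commit)
import streamlit as st

def main():
    uploaded_files = st.file_uploader("Upload PDF files", type="pdf",
                                      accept_multiple_files=True)
    if uploaded_files:
        all_chunks = []
        try:
            for pdf in uploaded_files:
                st.write(f"Processing: {pdf.name}")  # the hunk above slots in here
        except Exception as e:
            st.error(f"An unexpected error occurred: {str(e)}")

if __name__ == '__main__':
    main()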

utils/_admin_util.py  CHANGED

@@ -1,3 +1,4 @@
+import os
 import tiktoken
 import PyPDF2
 from langchain.text_splitter import RecursiveCharacterTextSplitter

@@ -9,6 +10,8 @@ from langchain_core.runnables import RunnablePassthrough
 from langchain_core.output_parsers import StrOutputParser
 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
+import streamlit as st
+
 
 
 HUMAN_TEMPLATE = """

@@ -21,120 +24,166 @@ QUERY:
 Use the provide context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, or it's not contained in the provided context response with "I don't know"
 """
 
+# Define the system prompt for categorization
+CATEGORY_PROMPT = """You are a ticket categorization system. Categorize the following query into exactly one of these categories:
+- HR Support: For queries about employment, benefits, leaves, workplace policies, etc.
+- IT Support: For queries about software, hardware, network, system access, etc.
+- Transportation Support: For queries about company transport, parking, vehicle maintenance, etc.
+- Other: For queries that do not fit into the above categories.
+Respond with ONLY the category name, nothing else.
+
+Query: {query}
+"""
+
+def check_api_key():
+    """Verify that the API key is set and valid"""
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OpenAI API key not found in environment variables")
+    return api_key
+
 #Read PDF data
 def read_pdf_data(pdf_file):
+    try:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+        if not text.strip():
+            raise ValueError("No text extracted from PDF")
+        return text
+    except Exception as e:
+        raise Exception(f"Error reading PDF: {str(e)}")
 
 def tiktoken_len(text):
+    try:
+        tokens = tiktoken.encoding_for_model("gpt-4").encode(text)
+        return len(tokens)
+    except Exception as e:
+        raise Exception(f"Error in token calculation: {str(e)}")
 
 #Split data into chunks
 def split_data(text):
+    try:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500,    # Increased for better context
+            chunk_overlap=50,  # Added overlap for better continuity
+            length_function=tiktoken_len,
+            separators=["\n\n", "\n", " ", ""]
+        )
+        chunks = text_splitter.split_text(text)
+        if not chunks:
+            raise ValueError("Text splitting produced no chunks")
+        return chunks
+    except Exception as e:
+        raise Exception(f"Error splitting text: {str(e)}")
 
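
Note that chunk_size=500 above is measured by tiktoken_len, i.e. in tokens rather than characters. A quick standalone check of that behaviour (assuming the tiktoken and langchain packages; not part of the commit):

# Standalone sketch: verify chunks stay within a 500-token budget
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text):
    return len(tiktoken.encoding_for_model("gpt-4").encode(text))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50,
    length_function=tiktoken_len, separators=["\n\n", "\n", " ", ""]
)
for chunk in splitter.split_text("lorem ipsum dolor sit amet " * 400):
    assert tiktoken_len(chunk) <= 500  # token count, not character count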
 #Create embeddings instance
 
 def create_embeddings():
+    try:
+        api_key = check_api_key()
+        embedding_model = OpenAIEmbeddings(
+            model="text-embedding-3-small",
+            openai_api_key=api_key,
+            show_progress_bar=True
+        )
+        return embedding_model
+    except Exception as e:
+        raise Exception(f"Error creating embeddings model: {str(e)}")
 
 
 # Create a vector database using Qdrant
 def create_vector_store(embedding_model, chunks):
+    try:
+        embedding_dim = 1536
+        client = QdrantClient(":memory:")  # Consider using persistent storage for production
+
+        # Create collection with error handling
+        try:
+            client.create_collection(
+                collection_name="lcel_doc_v2",
+                vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
+            )
+        except Exception as e:
+            raise Exception(f"Error creating Qdrant collection: {str(e)}")
+
+        vector_store = QdrantVectorStore(
+            client=client,
+            collection_name="lcel_doc_v2",
+            embedding=embedding_model,
+        )
+
+        # Add texts with progress tracking
+        try:
+            _ = vector_store.add_texts(texts=chunks)
+        except Exception as e:
+            raise Exception(f"Error adding texts to vector store: {str(e)}")
+
+        return vector_store
+    except Exception as e:
+        raise Exception(f"Error in vector store creation: {str(e)}")
 
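
create_embeddings and create_vector_store reference OpenAIEmbeddings, QdrantClient, VectorParams, Distance, and QdrantVectorStore, none of which are added by this diff, so the corresponding imports presumably already sit in the unchanged top of the file. A sketch of what they would be (package names assumed: langchain-openai, qdrant-client, langchain-qdrant); note that 1536 matches the default output dimension of text-embedding-3-small:

# Assumed imports from the unchanged part of utils/_admin_util.py
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain_qdrant import QdrantVectorStore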
 # create RAG
 def create_rag(vector_store):
+    try:
+        api_key = check_api_key()
+        openai_chat_model = ChatOpenAI(
+            model="gpt-3.5-turbo",
+            openai_api_key=api_key,
+            temperature=0.7
+        )
+
+        chat_prompt = ChatPromptTemplate.from_messages([
+            ("system", "You are a helpful assistant that answers questions based on the provided context."),
+            ("human", HUMAN_TEMPLATE)
+        ])
+
+        retriever = vector_store.as_retriever(search_kwargs={"k": 3})
+
+        simple_rag = (
+            {"context": retriever, "query": RunnablePassthrough()}
+            | chat_prompt
+            | openai_chat_model
+            | StrOutputParser()
+        )
+
+        return simple_rag
+    except Exception as e:
+        raise Exception(f"Error creating RAG chain: {str(e)}")
 
 # Invoke RAG
 def invoke_rag(vector_store, query):
+    try:
+        rag_chain = create_rag(vector_store)
+        response = rag_chain.invoke(query)
+        return response
+    except Exception as e:
+        raise Exception(f"Error invoking RAG chain: {str(e)}")
 
 
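
Taken together, a minimal end-to-end use of these helpers might look like the following (hypothetical driver, not in the commit; assumes OPENAI_API_KEY is set and a local sample.pdf exists):

# Hypothetical driver for the RAG helpers above
with open("sample.pdf", "rb") as f:
    chunks = split_data(read_pdf_data(f))
store = create_vector_store(create_embeddings(), chunks)
print(invoke_rag(store, "What does the document say about parking?"))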
 def get_ticket_category(query):
-    ])
-
-    # Create the chain
-    chain = prompt | client | StrOutputParser()
-
-    # Get the category
-    category = chain.invoke({"query": query})
-
-    # Clean and validate the response
-    category = category.strip()
-    valid_categories = ["HR Support", "IT Support", "Transportation Support"]
-
-    if category not in valid_categories:
-        return "Other"  # Default category if classification fails
+    try:
+        api_key = check_api_key()
+        client = ChatOpenAI(
+            model="gpt-3.5-turbo",
+            openai_api_key=api_key,
+            temperature=0
+        )
+
+        prompt = ChatPromptTemplate.from_messages([
+            ("system", CATEGORY_PROMPT)
+        ])
+
+        chain = prompt | client | StrOutputParser()
+        category = chain.invoke({"query": query})
+
+        category = category.strip()
+        valid_categories = ["HR Support", "IT Support", "Transportation Support", "Other"]
 
+        return category if category in valid_categories else "Other"
+    except Exception as e:
+        st.error(f"Error in category classification: {str(e)}")
+        return "Other"  # Fallback category
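
A usage sketch for the categorizer (hypothetical queries; the classifications shown are likely outcomes, not guaranteed):

# Hypothetical usage of get_ticket_category
for q in ["How do I reset my VPN token?", "When is payroll processed?"]:
    print(q, "->", get_ticket_category(q))
# Likely output: IT Support, HR Support; anything unexpected falls back to "Other"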