deepali1021 committed on
Commit
69800ee
·
1 Parent(s): 653f0aa

added exception handling

Browse files
Files changed (2) hide show
  1. pages/Load_Documents.py +72 -22
  2. utils/_admin_util.py +139 -90
pages/Load_Documents.py CHANGED
@@ -1,14 +1,40 @@
1
  import os
 
2
  from utils._admin_util import create_embeddings, create_vector_store, read_pdf_data, split_data
3
  import streamlit as st
4
  from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def main():
7
  load_dotenv()
8
 
9
- # Add API key verification at the start
10
- if not os.getenv("OPENAI_API_KEY"):
11
- st.error("OpenAI API key not found! Please ensure it's set in the environment variables.")
 
 
 
 
 
 
 
12
  st.stop()
13
 
14
  st.set_page_config(page_title="Dump PDFs to QDrant - Vector Store")
@@ -24,31 +50,55 @@ def main():
24
 
25
  # Process each PDF file
26
  for pdf in uploaded_files:
27
- st.write(f"Processing: {pdf.name}")
28
-
29
- # Extract text from PDF
30
- text = read_pdf_data(pdf)
31
- st.write(f"πŸ‘‰ Reading {pdf.name} done")
32
-
33
- # Create chunks for this PDF
34
- chunks = split_data(text)
35
- all_chunks.extend(chunks)
36
- st.write(f"πŸ‘‰ Splitting {pdf.name} into chunks done")
37
-
38
- # Create embeddings once for all chunks
39
- st.write("Creating embeddings...")
40
- embeddings = create_embeddings()
41
- st.write("πŸ‘‰ Creating embeddings instance done")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  # Create vector store with all chunks
44
- vector_store = create_vector_store(embeddings, all_chunks)
45
- st.session_state.vector_store = vector_store
 
 
 
 
46
 
47
- st.success(f"Successfully processed {len(uploaded_files)} files and pushed embeddings to Qdrant")
48
  st.write(f"Total chunks created: {len(all_chunks)}")
 
 
 
 
49
  except Exception as e:
50
- st.error(f"An error occurred: {str(e)}")
51
  st.error("Please check your API key and permissions.")
 
52
 
53
  if __name__ == '__main__':
54
  main()
 
1
  import os
2
+ import openai
3
  from utils._admin_util import create_embeddings, create_vector_store, read_pdf_data, split_data
4
  import streamlit as st
5
  from dotenv import load_dotenv
6
+ import requests
7
+
8
def validate_api_key(api_key, model="text-embedding-3-small"):
    """Check that an OpenAI API key works by issuing one tiny embeddings call.

    Args:
        api_key: The OpenAI API key to test.
        model: Embedding model used for the probe request. Defaults to the
            model the ingestion pipeline embeds with, so a key that is
            restricted to that model is not rejected by probing another one.

    Returns:
        True if the probe request succeeded, False otherwise. On failure the
        reason is reported to the user through ``st.error``.
    """
    try:
        client = openai.OpenAI(api_key=api_key)
        client.embeddings.create(input="test", model=model)
    except openai.AuthenticationError:
        st.error("❌ Invalid API key")
        return False
    except openai.PermissionDeniedError:
        st.error("❌ Permission denied. Please check your API key's permissions")
        return False
    except Exception as e:
        # Deliberate catch-all: any other failure (network, rate limit, ...)
        # is reported and treated as "not validated" rather than crashing.
        st.error(f"❌ API key validation error: {str(e)}")
        return False
    return True
24
 
25
  def main():
26
  load_dotenv()
27
 
28
+ # Add detailed API key verification
29
+ api_key = os.getenv("OPENAI_API_KEY")
30
+ if not api_key:
31
+ st.error("❌ OpenAI API key not found! Please ensure it's set in the environment variables.")
32
+ st.info("To set up your API key:")
33
+ st.code("1. Go to Hugging Face Space settings\n2. Add OPENAI_API_KEY in Repository Secrets")
34
+ st.stop()
35
+
36
+ # Validate the API key
37
+ if not validate_api_key(api_key):
38
  st.stop()
39
 
40
  st.set_page_config(page_title="Dump PDFs to QDrant - Vector Store")
 
50
 
51
  # Process each PDF file
52
  for pdf in uploaded_files:
53
+ try:
54
+ st.write(f"Processing: {pdf.name}")
55
+
56
+ # Extract text from PDF
57
+ text = read_pdf_data(pdf)
58
+ st.write(f"πŸ‘‰ Reading {pdf.name} done")
59
+
60
+ # Create chunks for this PDF
61
+ chunks = split_data(text)
62
+ all_chunks.extend(chunks)
63
+ st.write(f"πŸ‘‰ Splitting {pdf.name} into chunks done")
64
+ except Exception as e:
65
+ st.error(f"❌ Error processing {pdf.name}: {str(e)}")
66
+ continue
67
+
68
+ if not all_chunks:
69
+ st.error("❌ No valid chunks were created from the PDFs")
70
+ st.stop()
71
+
72
+ # Create embeddings with progress tracking
73
+ try:
74
+ st.write("Creating embeddings...")
75
+ embeddings = create_embeddings()
76
+ st.write("πŸ‘‰ Creating embeddings instance done")
77
+ except openai.RateLimitError:
78
+ st.error("❌ Rate limit exceeded. Please try again later.")
79
+ st.stop()
80
+ except Exception as e:
81
+ st.error(f"❌ Error creating embeddings: {str(e)}")
82
+ st.stop()
83
 
84
  # Create vector store with all chunks
85
+ try:
86
+ vector_store = create_vector_store(embeddings, all_chunks)
87
+ st.session_state.vector_store = vector_store
88
+ except Exception as e:
89
+ st.error(f"❌ Error creating vector store: {str(e)}")
90
+ st.stop()
91
 
92
+ st.success(f"βœ… Successfully processed {len(uploaded_files)} files and pushed embeddings to Qdrant")
93
  st.write(f"Total chunks created: {len(all_chunks)}")
94
+
95
+ except requests.exceptions.RequestException as e:
96
+ st.error(f"❌ Network error: {str(e)}")
97
+ st.error("Please check your internet connection and try again.")
98
  except Exception as e:
99
+ st.error(f"❌ An unexpected error occurred: {str(e)}")
100
  st.error("Please check your API key and permissions.")
101
+ st.error("If the problem persists, please contact support.")
102
 
103
  if __name__ == '__main__':
104
  main()
utils/_admin_util.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import tiktoken
2
  import PyPDF2
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -9,6 +10,8 @@ from langchain_core.runnables import RunnablePassthrough
9
  from langchain_core.output_parsers import StrOutputParser
10
  from langchain_openai import ChatOpenAI
11
  from langchain_core.prompts import ChatPromptTemplate
 
 
12
 
13
 
14
  HUMAN_TEMPLATE = """
@@ -21,120 +24,166 @@ QUERY:
21
  Use the provide context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, or it's not contained in the provided context response with "I don't know"
22
  """
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  #Read PDF data
25
  def read_pdf_data(pdf_file):
26
- # Create PDF reader object
27
- pdf_reader = PyPDF2.PdfReader(pdf_file)
28
-
29
- # Extract text from each page
30
- text = ""
31
- for page in pdf_reader.pages:
32
- text += page.extract_text()
33
- return text
 
 
34
 
35
  def tiktoken_len(text):
36
- tokens = tiktoken.encoding_for_model("gpt-4").encode(
37
- text,
38
- )
39
- return len(tokens)
 
40
 
41
  #Split data into chunks
42
  def split_data(text):
43
- text_splitter = RecursiveCharacterTextSplitter(
44
- chunk_size = 100,
45
- chunk_overlap = 0,
46
- length_function = tiktoken_len,
47
- )
48
- chunks = text_splitter.split_text(text)
49
- return chunks
 
 
 
 
 
 
50
 
51
  #Create embeddings instance
52
 
53
  def create_embeddings():
54
- embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
55
- return embedding_model
 
 
 
 
 
 
 
 
56
 
57
 
58
  # Create a vector database using Qdrant
59
  def create_vector_store(embedding_model, chunks):
60
- embedding_dim = 1536 # YOUR ANSWER HERE
61
- client = QdrantClient(":memory:")
62
- client.create_collection(
63
- collection_name="lcel_doc_v2",
64
- vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
65
- )
66
- vector_store = QdrantVectorStore(
67
- client=client,
68
- collection_name="lcel_doc_v2",
69
- embedding=embedding_model,
70
- )
71
- _ = vector_store.add_texts(texts=chunks)
72
- return vector_store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # create RAG
75
  def create_rag(vector_store):
76
- # Initialize OpenAI chat model with a valid model name
77
- openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo")
78
-
79
- # Create chat prompt template
80
- chat_prompt = ChatPromptTemplate.from_messages([
81
- ("system", "You are a helpful assistant that answers questions based on the provided context."),
82
- ("human", HUMAN_TEMPLATE)
83
- ])
84
-
85
- # Set up retriever with configurable k
86
- retriever = vector_store.as_retriever(search_kwargs={"k": 3})
87
-
88
- # Create RAG pipeline
89
- simple_rag = (
90
- {"context": retriever, "query": RunnablePassthrough()}
91
- | chat_prompt
92
- | openai_chat_model
93
- | StrOutputParser()
94
- )
95
-
96
- return simple_rag
 
 
 
 
97
 
98
  # Invoke RAG
99
  def invoke_rag(vector_store, query):
100
- rag_chain = create_rag(vector_store)
101
- response = rag_chain.invoke(query)
102
- return response
 
 
 
103
 
104
 
105
  def get_ticket_category(query):
106
- # Define the system prompt for categorization
107
- CATEGORY_PROMPT = """You are a ticket categorization system. Categorize the following query into exactly one of these categories:
108
- - HR Support: For queries about employment, benefits, leaves, workplace policies, etc.
109
- - IT Support: For queries about software, hardware, network, system access, etc.
110
- - Transportation Support: For queries about company transport, parking, vehicle maintenance, etc.
111
- - Other: For queries that do not fit into the above categories.
112
- Respond with ONLY the category name, nothing else.
113
-
114
- Query: {query}
115
- """
116
-
117
- # Create OpenAI client for categorization
118
- client = ChatOpenAI(model="gpt-3.5-turbo")
119
-
120
- # Create the prompt template
121
- prompt = ChatPromptTemplate.from_messages([
122
- ("system", CATEGORY_PROMPT)
123
- ])
124
-
125
- # Create the chain
126
- chain = prompt | client | StrOutputParser()
127
-
128
- # Get the category
129
- category = chain.invoke({"query": query})
130
-
131
- # Clean and validate the response
132
- category = category.strip()
133
- valid_categories = ["HR Support", "IT Support", "Transportation Support"]
134
-
135
- if category not in valid_categories:
136
- return "Other" # Default category if classification fails
137
 
138
- return category
 
 
 
139
 
140
 
 
1
+ import os
2
  import tiktoken
3
  import PyPDF2
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
10
  from langchain_core.output_parsers import StrOutputParser
11
  from langchain_openai import ChatOpenAI
12
  from langchain_core.prompts import ChatPromptTemplate
13
+ import streamlit as st
14
+
15
 
16
 
17
  HUMAN_TEMPLATE = """
 
24
  Use the provide context to answer the provided user query. Only use the provided context to answer the query. If you do not know the answer, or it's not contained in the provided context response with "I don't know"
25
  """
26
 
27
+ # Define the system prompt for categorization
28
+ CATEGORY_PROMPT = """You are a ticket categorization system. Categorize the following query into exactly one of these categories:
29
+ - HR Support: For queries about employment, benefits, leaves, workplace policies, etc.
30
+ - IT Support: For queries about software, hardware, network, system access, etc.
31
+ - Transportation Support: For queries about company transport, parking, vehicle maintenance, etc.
32
+ - Other: For queries that do not fit into the above categories.
33
+ Respond with ONLY the category name, nothing else.
34
+
35
+ Query: {query}
36
+ """
37
+
38
def check_api_key():
    """Return the OpenAI API key configured in the environment.

    Only the presence of the key is checked here; it is not validated
    against the OpenAI API.

    Returns:
        The value of ``OPENAI_API_KEY`` with surrounding whitespace removed.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset, empty, or whitespace-only.
    """
    # Strip so that a secret pasted with a trailing newline/space is neither
    # accepted as "present" when blank nor forwarded with stray whitespace.
    api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
    if not api_key:
        raise ValueError("OpenAI API key not found in environment variables")
    return api_key
44
+
45
  #Read PDF data
46
def read_pdf_data(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the caller passes a
            Streamlit uploaded file.

    Returns:
        The concatenated text of all pages.

    Raises:
        Exception: If the PDF cannot be read or no text could be extracted.
            The underlying error is attached as ``__cause__``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe if a page yields no text (None or
        # empty), e.g. a scanned page without a text layer.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        if not text.strip():
            raise ValueError("No text extracted from PDF")
        return text
    except Exception as e:
        raise Exception(f"Error reading PDF: {str(e)}") from e
57
 
58
def tiktoken_len(text):
    """Return the number of tokens in *text* under the gpt-4 tiktoken encoding.

    Used as the length function of the text splitter, so chunk sizes are
    measured in tokens rather than characters.

    Raises:
        Exception: If the encoding cannot be loaded or the text cannot be
            encoded. The underlying error is attached as ``__cause__``.
    """
    try:
        encoding = tiktoken.encoding_for_model("gpt-4")
        return len(encoding.encode(text))
    except Exception as e:
        raise Exception(f"Error in token calculation: {str(e)}") from e
64
 
65
  #Split data into chunks
66
def split_data(text, chunk_size=500, chunk_overlap=50):
    """Split *text* into overlapping chunks measured in tokens.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, as measured by ``tiktoken_len``.
        chunk_overlap: Length shared between consecutive chunks, in the same
            unit, so context is not lost at chunk boundaries.

    Returns:
        A non-empty list of chunk strings.

    Raises:
        Exception: If splitting fails or produces no chunks. The underlying
            error is attached as ``__cause__``.
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=tiktoken_len,
            # Try paragraph, line, then word boundaries before splitting
            # mid-word.
            separators=["\n\n", "\n", " ", ""],
        )
        chunks = text_splitter.split_text(text)
        if not chunks:
            raise ValueError("Text splitting produced no chunks")
        return chunks
    except Exception as e:
        raise Exception(f"Error splitting text: {str(e)}") from e
80
 
81
  #Create embeddings instance
82
 
83
def create_embeddings():
    """Build the OpenAI embeddings client used to embed document chunks.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-3-small`` model.

    Raises:
        Exception: If the API key is missing or the client cannot be
            constructed. The underlying error is attached as ``__cause__``.
    """
    try:
        api_key = check_api_key()
        return OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=api_key,
            show_progress_bar=True,
        )
    except Exception as e:
        raise Exception(f"Error creating embeddings model: {str(e)}") from e
94
 
95
 
96
  # Create a vector database using Qdrant
97
def create_vector_store(embedding_model, chunks):
    """Create an in-memory Qdrant collection and load *chunks* into it.

    Args:
        embedding_model: Embeddings object used by the vector store to embed
            the texts.
        chunks: Text chunks to embed and store.

    Returns:
        A ``QdrantVectorStore`` backed by the freshly created collection.

    Raises:
        Exception: If the collection cannot be created or the texts cannot be
            added. The message names the failing stage and the underlying
            error is attached as ``__cause__``.
    """
    # Single source of truth for the name shared by collection and store.
    collection_name = "lcel_doc_v2"
    # Must equal the vector size produced by `embedding_model`.
    # NOTE(review): 1536 matches text-embedding-3-small's default - confirm if
    # the embedding model is ever changed.
    embedding_dim = 1536
    try:
        # In-memory storage: contents are lost when the process exits.
        # Consider using persistent storage for production.
        client = QdrantClient(":memory:")

        try:
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
            )
        except Exception as e:
            raise Exception(f"Error creating Qdrant collection: {str(e)}") from e

        vector_store = QdrantVectorStore(
            client=client,
            collection_name=collection_name,
            embedding=embedding_model,
        )

        try:
            vector_store.add_texts(texts=chunks)
        except Exception as e:
            raise Exception(f"Error adding texts to vector store: {str(e)}") from e

        return vector_store
    except Exception as e:
        raise Exception(f"Error in vector store creation: {str(e)}") from e
126
 
127
  # create RAG
128
def create_rag(vector_store):
    """Build the retrieval-augmented generation chain over *vector_store*.

    The chain retrieves the 3 most similar chunks for the incoming query,
    fills the chat prompt with them as ``context`` alongside the ``query``,
    calls the chat model and returns its reply as a plain string.

    Args:
        vector_store: Vector store exposing ``as_retriever``.

    Returns:
        A runnable chain; invoke it with the user query string.

    Raises:
        Exception: If the API key is missing or the chain cannot be built.
            The underlying error is attached as ``__cause__``.
    """
    try:
        api_key = check_api_key()
        openai_chat_model = ChatOpenAI(
            model="gpt-3.5-turbo",
            openai_api_key=api_key,
            temperature=0.7,
        )

        chat_prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a helpful assistant that answers questions based on the provided context."),
            ("human", HUMAN_TEMPLATE),
        ])

        retriever = vector_store.as_retriever(search_kwargs={"k": 3})

        # The raw query is fed both to the retriever (-> context) and,
        # unchanged, to the prompt's `query` slot.
        return (
            {"context": retriever, "query": RunnablePassthrough()}
            | chat_prompt
            | openai_chat_model
            | StrOutputParser()
        )
    except Exception as e:
        raise Exception(f"Error creating RAG chain: {str(e)}") from e
154
 
155
  # Invoke RAG
156
def invoke_rag(vector_store, query):
    """Answer *query* using a RAG chain built over *vector_store*.

    Args:
        vector_store: Vector store holding the embedded document chunks.
        query: The user's question.

    Returns:
        The chain's response for the query.

    Raises:
        Exception: If building or invoking the chain fails. The underlying
            error is attached as ``__cause__``.
    """
    try:
        rag_chain = create_rag(vector_store)
        return rag_chain.invoke(query)
    except Exception as e:
        raise Exception(f"Error invoking RAG chain: {str(e)}") from e
163
 
164
 
165
def get_ticket_category(query):
    """Classify a support query into one of the known ticket categories.

    Args:
        query: The user's support request text.

    Returns:
        One of "HR Support", "IT Support", "Transportation Support" or
        "Other". "Other" is also returned when the model's reply matches no
        known category or when classification fails; in the failure case the
        error is shown via ``st.error``.
    """
    valid_categories = ("HR Support", "IT Support", "Transportation Support", "Other")
    try:
        api_key = check_api_key()
        client = ChatOpenAI(
            model="gpt-3.5-turbo",
            openai_api_key=api_key,
            # Temperature 0 keeps the classification as repeatable as possible.
            temperature=0,
        )

        prompt = ChatPromptTemplate.from_messages([
            ("system", CATEGORY_PROMPT)
        ])

        chain = prompt | client | StrOutputParser()
        reply = chain.invoke({"query": query})

        # Models often decorate the label (quotes, trailing period, different
        # casing); normalise before matching so near-misses such as
        # "IT Support." are not silently demoted to "Other".
        normalized = reply.strip().strip("\"'.").strip().casefold()
        for category in valid_categories:
            if category.casefold() == normalized:
                return category
        return "Other"
    except Exception as e:
        # Best-effort by design: report the problem and fall back.
        st.error(f"Error in category classification: {str(e)}")
        return "Other"  # Fallback category
188
 
189