Fiqa committed on
Commit
de6b3b3
·
verified ·
1 Parent(s): 6d0a4ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -26
app.py CHANGED
@@ -1,8 +1,8 @@
1
  import streamlit as st
2
  import PyPDF2
3
- from langchain.llms import HuggingFaceHub
4
  import pptx
5
  import os
 
6
  from langchain.vectorstores.cassandra import Cassandra
7
  from langchain.indexes.vectorstore import VectorStoreIndexWrapper
8
  from langchain.embeddings import OpenAIEmbeddings
@@ -10,56 +10,75 @@ import cassio
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from huggingface_hub import login
12
 
13
-
14
-
15
-
16
-
17
-
18
-
19
- # Secure API keys (replace with environment variables in deployment)
20
  ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
21
  ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
22
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
23
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
24
- login(token=HUGGINGFACE_API_KEY)
 
 
 
 
 
 
 
 
 
25
 
26
  # Initialize Astra DB connection
27
  cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
28
 
29
  # Initialize LLM & Embeddings
30
- hf_llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature": 0, "max_length": 64})
31
- embedding =OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
 
 
 
 
 
 
 
32
 
33
  # Initialize vector store
34
  astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo")
35
 
 
36
  def extract_text_from_pdf(uploaded_file):
37
  """Extract text from a PDF file."""
38
  text = ""
39
- pdf_reader = PyPDF2.PdfReader(uploaded_file)
40
- for page in pdf_reader.pages:
41
- page_text = page.extract_text()
42
- if page_text: # Avoid NoneType error
43
- text += page_text + "\n"
 
 
 
44
  return text
45
 
 
46
  def extract_text_from_ppt(uploaded_file):
47
  """Extract text from a PowerPoint file."""
48
  text = ""
49
- presentation = pptx.Presentation(uploaded_file)
50
- for slide in presentation.slides:
51
- for shape in slide.shapes:
52
- if hasattr(shape, "text"):
53
- text += shape.text + "\n"
 
 
 
54
  return text
55
 
 
56
  def main():
57
  st.title("Chat with Documents")
58
 
59
  uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
60
  extract_button = st.button("Extract Text")
61
-
62
  extracted_text = ""
 
63
  if extract_button and uploaded_file is not None:
64
  if uploaded_file.name.endswith(".pdf"):
65
  extracted_text = extract_text_from_pdf(uploaded_file)
@@ -70,18 +89,20 @@ def main():
70
  text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
71
  texts = text_splitter.split_text(extracted_text)
72
  astra_vector_store.add_texts(texts)
 
73
 
74
  # Ensure the vector store index is initialized properly
75
  astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)
76
 
77
  query = st.text_input("Enter your query")
78
  submit_query = st.button("Submit Query")
79
- if submit_query:
80
 
81
-
82
- value = astra_vector_index.query(query, llm=hf_llm)
 
 
 
83
 
84
- st.write(f"Response: {value}")
85
 
86
  if __name__ == "__main__":
87
  main()
 
1
  import streamlit as st
2
  import PyPDF2
 
3
  import pptx
4
  import os
5
+ from langchain.llms import HuggingFaceHub
6
  from langchain.vectorstores.cassandra import Cassandra
7
  from langchain.indexes.vectorstore import VectorStoreIndexWrapper
8
  from langchain.embeddings import OpenAIEmbeddings
 
10
  from langchain.text_splitter import CharacterTextSplitter
11
  from huggingface_hub import login
12
 
13
+ # Secure API keys (ensure they are set)
 
 
 
 
 
 
14
  ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
15
  ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
16
  HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
17
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
18
+
19
+ if not ASTRA_DB_APPLICATION_TOKEN or not ASTRA_DB_ID:
20
+ st.error("Astra DB credentials are missing. Set the environment variables.")
21
+ st.stop()
22
+ if not HUGGINGFACE_API_KEY:
23
+ st.error("Hugging Face API key is missing. Set the HUGGINGFACE_API_KEY environment variable.")
24
+ st.stop()
25
+ if not OPENAI_API_KEY:
26
+ st.error("OpenAI API key is missing. Set the OPENAI_API_KEY environment variable.")
27
+ st.stop()
28
 
29
  # Initialize Astra DB connection
30
  cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
31
 
32
  # Initialize LLM & Embeddings
33
+ login(token=HUGGINGFACE_API_KEY)
34
+
35
+ hf_llm = HuggingFaceHub(
36
+ repo_id="google/flan-t5-large",
37
+ model_kwargs={"temperature": 0, "max_length": 64},
38
+ huggingfacehub_api_token=HUGGINGFACE_API_KEY
39
+ )
40
+
41
+ embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
42
 
43
  # Initialize vector store
44
  astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo")
45
 
46
+
47
  def extract_text_from_pdf(uploaded_file):
48
  """Extract text from a PDF file."""
49
  text = ""
50
+ try:
51
+ pdf_reader = PyPDF2.PdfReader(uploaded_file)
52
+ for page in pdf_reader.pages:
53
+ page_text = page.extract_text()
54
+ if page_text: # Avoid NoneType error
55
+ text += page_text + "\n"
56
+ except Exception as e:
57
+ st.error(f"Error reading PDF: {e}")
58
  return text
59
 
60
+
61
  def extract_text_from_ppt(uploaded_file):
62
  """Extract text from a PowerPoint file."""
63
  text = ""
64
+ try:
65
+ presentation = pptx.Presentation(uploaded_file)
66
+ for slide in presentation.slides:
67
+ for shape in slide.shapes:
68
+ if hasattr(shape, "text"):
69
+ text += shape.text + "\n"
70
+ except Exception as e:
71
+ st.error(f"Error reading PPT: {e}")
72
  return text
73
 
74
+
75
  def main():
76
  st.title("Chat with Documents")
77
 
78
  uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
79
  extract_button = st.button("Extract Text")
 
80
  extracted_text = ""
81
+
82
  if extract_button and uploaded_file is not None:
83
  if uploaded_file.name.endswith(".pdf"):
84
  extracted_text = extract_text_from_pdf(uploaded_file)
 
89
  text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
90
  texts = text_splitter.split_text(extracted_text)
91
  astra_vector_store.add_texts(texts)
92
+ st.success("Text extracted and stored successfully!")
93
 
94
  # Ensure the vector store index is initialized properly
95
  astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)
96
 
97
  query = st.text_input("Enter your query")
98
  submit_query = st.button("Submit Query")
 
99
 
100
+ if submit_query and query:
101
+ retriever = astra_vector_index.as_retriever()
102
+ docs = retriever.get_relevant_documents(query)
103
+ response = hf_llm(docs)
104
+ st.write(f"Response: {response}")
105
 
 
106
 
107
  if __name__ == "__main__":
108
  main()