Fiqa committed on
Commit
628527c
·
verified ·
1 Parent(s): f7b0260

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +83 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import PyPDF2
3
+ from langchain.llms import HuggingFaceHub
4
+ import pptx
5
+ import os
6
+ from langchain.vectorstores.cassandra import Cassandra
7
+ from langchain.indexes.vectorstore import VectorStoreIndexWrapper
8
+ from langchain.embeddings import OpenAIEmbeddings
9
+ import cassio
10
+ from langchain.text_splitter import CharacterTextSplitter
11
+
12
+
13
+
14
+
15
# API credentials are read from environment variables so that no secret is
# stored alongside the code. Any variable that is not set yields None here.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize Astra DB connection
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Initialize LLM & Embeddings.
# The Hugging Face key is passed explicitly: it is read from
# HUGGINGFACE_API_KEY above, which is not the variable name LangChain checks
# on its own (HUGGINGFACEHUB_API_TOKEN), so without this argument the key
# would never reach the client. When the key is None, LangChain still falls
# back to its own environment variable, so existing setups keep working.
hf_llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    model_kwargs={"temperature": 0, "max_length": 64},
    huggingfacehub_api_token=HUGGINGFACE_API_KEY,
)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize vector store (table "qa_mini_demo", embedded with the model above)
astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo")
31
+
32
def extract_text_from_pdf(uploaded_file):
    """Return the text of all pages of a PDF, each page followed by a newline.

    Pages for which the reader yields no text (``None`` or an empty string)
    contribute nothing to the result.
    """
    reader = PyPDF2.PdfReader(uploaded_file)
    page_contents = (page.extract_text() for page in reader.pages)
    # Falsy results are dropped so that a None never gets concatenated.
    return "".join(content + "\n" for content in page_contents if content)
41
+
42
def extract_text_from_ppt(uploaded_file):
    """Return the text of every text-bearing shape in a PowerPoint file.

    Slides are visited in order; each shape exposing a ``text`` attribute
    contributes its text followed by a newline. Shapes without that
    attribute are skipped.
    """
    deck = pptx.Presentation(uploaded_file)
    pieces = []
    for slide in deck.slides:
        pieces.extend(
            shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
        )
    return "".join(pieces)
51
+
52
def main():
    """Render the Streamlit page: ingest an uploaded document, then answer queries.

    Flow:
      1. The user uploads a PDF or PPTX file and presses "Extract Text".
      2. The extracted text is split into overlapping chunks and added to the
         module-level ``astra_vector_store``.
      3. The user enters a query and presses "Submit Query"; the query is
         answered through a ``VectorStoreIndexWrapper`` using ``hf_llm``.
    """
    st.title("Chat with Documents")

    uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
    extract_button = st.button("Extract Text")

    extracted_text = ""
    if extract_button and uploaded_file is not None:
        # Compare the extension case-insensitively so that a file named
        # e.g. "REPORT.PDF" is processed instead of being silently ignored.
        file_name = uploaded_file.name.lower()
        if file_name.endswith(".pdf"):
            extracted_text = extract_text_from_pdf(uploaded_file)
        elif file_name.endswith(".pptx"):
            extracted_text = extract_text_from_ppt(uploaded_file)

        if not extracted_text:
            # Tell the user why nothing was indexed (e.g. image-only document).
            st.warning("No text could be extracted from the uploaded file.")

    if extracted_text:
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=800,
            chunk_overlap=200,
            length_function=len,
        )
        texts = text_splitter.split_text(extracted_text)
        # Whitespace-only input can produce no chunks; skip the store call then.
        if texts:
            astra_vector_store.add_texts(texts)

    # Built on every run so the query branch below always has an index,
    # whether or not a document was extracted during this run.
    astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

    query = st.text_input("Enter your query")
    submit_query = st.button("Submit Query")
    if submit_query:
        if query.strip():
            value = astra_vector_index.query(query, llm=hf_llm)
            st.write(f"Response: {value}")
        else:
            # Do not send an empty query to the embedding model / LLM.
            st.warning("Please enter a query.")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ python-pptx
4
+ langchain
5
+ cassio
6
+ cassandra-driver
7
+ openai
8
+ huggingface_hub