"""Streamlit app: upload a PDF/PPTX, store its text in Astra DB, and query it.

The script reads its credentials from environment variables, connects to
Astra DB through ``cassio``, embeds uploaded document text with OpenAI
embeddings, and answers free-text queries through a LangChain vector-store
index backed by an OpenAI LLM.
"""

import os

import cassio
import pptx
import PyPDF2
import streamlit as st
from huggingface_hub import login
from langchain.embeddings import OpenAIEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra

# Secure API keys (ensure they are set)
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Validate every credential BEFORE constructing any client. Building the
# OpenAI LLM ahead of these checks can raise on a missing key, so the user
# would never see the friendly message below.
if not ASTRA_DB_APPLICATION_TOKEN or not ASTRA_DB_ID:
    st.error("Astra DB credentials are missing. Set the environment variables.")
    st.stop()

if not HUGGINGFACE_API_KEY:
    st.error("Hugging Face API key is missing. Set the HUGGINGFACE_API_KEY environment variable.")
    st.stop()

if not OPENAI_API_KEY:
    st.error("OpenAI API key is missing. Set the OPENAI_API_KEY environment variable.")
    st.stop()

# Initialize Astra DB connection
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Initialize LLM & Embeddings
login(token=HUGGINGFACE_API_KEY)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize vector store
astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo")


def extract_text_from_pdf(uploaded_file) -> str:
    """Extract text from a PDF file.

    Args:
        uploaded_file: A file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of every page that yielded text, each followed by a
        newline. If reading fails part-way, an error is shown via
        ``st.error`` and the text gathered so far is returned (possibly
        the empty string).
    """
    page_texts = []
    try:
        pdf_reader = PyPDF2.PdfReader(uploaded_file)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:  # Skip pages with no extractable text (None or "")
                page_texts.append(page_text)
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
    return "".join(f"{page_text}\n" for page_text in page_texts)


def extract_text_from_ppt(uploaded_file) -> str:
    """Extract text from a PowerPoint file.

    Args:
        uploaded_file: A file-like object accepted by ``pptx.Presentation``.

    Returns:
        The ``text`` of every shape that has such an attribute, each
        followed by a newline. If reading fails part-way, an error is shown
        via ``st.error`` and the text gathered so far is returned (possibly
        the empty string).
    """
    shape_texts = []
    try:
        presentation = pptx.Presentation(uploaded_file)
        for slide in presentation.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    shape_texts.append(shape.text)
    except Exception as e:
        st.error(f"Error reading PPT: {e}")
    return "".join(f"{shape_text}\n" for shape_text in shape_texts)


def _extract_and_store(uploaded_file) -> None:
    """Extract text from the uploaded file and add its chunks to the store."""
    # Compare case-insensitively so names such as "REPORT.PDF" are handled
    # rather than silently ignored.
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".pdf"):
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_name.endswith(".pptx"):
        extracted_text = extract_text_from_ppt(uploaded_file)
    else:
        extracted_text = ""

    if not extracted_text.strip():
        # Covers unsupported names, unreadable files, and documents with no
        # text; previously this case produced no feedback at all.
        st.warning("No text could be extracted from the uploaded file.")
        return

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=200,
        length_function=len,
    )
    texts = text_splitter.split_text(extracted_text)
    if not texts:
        st.warning("No text could be extracted from the uploaded file.")
        return

    try:
        astra_vector_store.add_texts(texts)
    except Exception as e:
        # Same reporting style as the extractors: show the error in the UI.
        st.error(f"Error storing text: {e}")
        return
    st.success("Text extracted and stored successfully!")


def main() -> None:
    """Render the upload/extract/query UI and handle button actions."""
    st.title("Chat with Documents")

    uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
    extract_button = st.button("Extract Text")

    if extract_button:
        if uploaded_file is None:
            st.warning("Upload a PDF or PPTX file before extracting text.")
        else:
            _extract_and_store(uploaded_file)

    # Ensure the vector store index is initialized properly
    astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

    query = st.text_input("Enter your query")
    submit_query = st.button("Submit Query")

    if submit_query and query:
        try:
            response = astra_vector_index.query(query, llm=llm)
        except Exception as e:
            st.error(f"Error running query: {e}")
        else:
            st.write(f"Response: {response}")


if __name__ == "__main__":
    main()