"""Streamlit app for chatting with uploaded PDF/PPTX documents.

Text is extracted from the uploaded file, split into chunks, embedded with
OpenAI embeddings and stored in an Astra DB (Cassandra) vector store; user
queries are then answered against that store through an OpenAI LLM.
"""
import streamlit as st
import PyPDF2
import pptx
import os
from langchain.llms import OpenAI
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import OpenAIEmbeddings
import cassio
from langchain.text_splitter import CharacterTextSplitter
from huggingface_hub import login

# Credentials are read from the environment so no secret is hard-coded here.
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Validate every credential BEFORE constructing any client. Building the
# OpenAI LLM first (as this script previously did) lets a missing key surface
# as a constructor error instead of the readable messages below.
if not ASTRA_DB_APPLICATION_TOKEN or not ASTRA_DB_ID:
    st.error("Astra DB credentials are missing. Set the environment variables.")
    st.stop()
if not HUGGINGFACE_API_KEY:
    st.error("Hugging Face API key is missing. Set the HUGGINGFACE_API_KEY environment variable.")
    st.stop()
if not OPENAI_API_KEY:
    st.error("OpenAI API key is missing. Set the OPENAI_API_KEY environment variable.")
    st.stop()

# Initialize Astra DB connection
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# Authenticate with the Hugging Face Hub
login(token=HUGGINGFACE_API_KEY)

# Initialize LLM & Embeddings (safe now: OPENAI_API_KEY is known to be set)
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialize vector store
astra_vector_store = Cassandra(embedding=embedding, table_name="qa_mini_demo")


def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, one newline after each page.

    Pages for which no text can be extracted are skipped. If reading fails,
    the exception is shown with ``st.error`` and whatever text was gathered
    before the failure is returned (an empty string if nothing was read).
    """
    collected = []
    try:
        reader = PyPDF2.PdfReader(uploaded_file)
        for pdf_page in reader.pages:
            content = pdf_page.extract_text()
            if not content:
                # Nothing extractable on this page (None or empty) - skip it.
                continue
            collected.append(content + "\n")
    except Exception as exc:
        st.error(f"Error reading PDF: {exc}")
    return "".join(collected)


def extract_text_from_ppt(uploaded_file):
    """Return the text of every text-bearing shape in a PowerPoint file.

    Slides are walked in order and each shape exposing a ``text`` attribute
    contributes its text followed by a newline. If reading fails, the
    exception is shown with ``st.error`` and whatever text was gathered
    before the failure is returned (an empty string if nothing was read).
    """
    pieces = []
    try:
        deck = pptx.Presentation(uploaded_file)
        for slide in deck.slides:
            for shape in slide.shapes:
                if not hasattr(shape, "text"):
                    # Shapes without a ``text`` attribute carry no text here.
                    continue
                pieces.append(shape.text + "\n")
    except Exception as exc:
        st.error(f"Error reading PPT: {exc}")
    return "".join(pieces)


def main():
    """Render the Streamlit UI: upload a document, index it, then query it.

    Flow:
      1. The user uploads a PDF or PPTX file and clicks "Extract Text".
      2. The extracted text is split into overlapping chunks and added to the
         module-level ``astra_vector_store``.
      3. The user enters a query, which is answered by querying the vector
         store through the module-level ``llm``.

    The file type is decided from the file name's extension,
    case-insensitively, so names such as ``REPORT.PDF`` are handled. The user
    is warned when the extract button is pressed with no file uploaded or
    when the file yields no text, instead of the click silently doing nothing.
    """
    st.title("Chat with Documents")

    uploaded_file = st.file_uploader("Upload a PDF or PPT file", type=["pdf", "pptx"])
    extract_button = st.button("Extract Text")

    if extract_button:
        if uploaded_file is None:
            st.warning("Please upload a PDF or PPTX file before extracting text.")
        else:
            extracted_text = ""
            # Compare on the lower-cased name so upper-case extensions match.
            file_name = uploaded_file.name.lower()
            if file_name.endswith(".pdf"):
                extracted_text = extract_text_from_pdf(uploaded_file)
            elif file_name.endswith(".pptx"):
                extracted_text = extract_text_from_ppt(uploaded_file)

            # Whitespace-only output carries nothing worth indexing.
            if extracted_text.strip():
                text_splitter = CharacterTextSplitter(
                    separator="\n",
                    chunk_size=800,
                    chunk_overlap=200,
                    length_function=len,
                )
                texts = text_splitter.split_text(extracted_text)
                astra_vector_store.add_texts(texts)
                st.success("Text extracted and stored successfully!")
            else:
                st.warning("No text could be extracted from the uploaded file.")

    # Wrap the vector store so it can be queried with an LLM.
    astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

    query = st.text_input("Enter your query")
    submit_query = st.button("Submit Query")

    if submit_query and query:
        response = astra_vector_index.query(query, llm=llm)
        st.write(f"Response: {response}")


# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()