File size: 4,874 Bytes
f513b53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import streamlit as st
import os
import logging
from io import BytesIO
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import HuggingFaceHub
from transformers import pipeline  # For fallback if Hub fails

# Root logger configuration — INFO level so processing steps show up in the Space logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fail fast when the Hub token secret is absent; the app cannot work without it.
if os.environ.get("HUGGINGFACEHUB_API_TOKEN") is None:
    st.error("HUGGINGFACEHUB_API_TOKEN not set in secrets. Add it in Space settings.")
    st.stop()

try:
    def process_pdf(uploaded_file):
        """Extract text from an uploaded PDF, chunk it, and build a FAISS index.

        Args:
            uploaded_file: Streamlit UploadedFile holding the raw PDF bytes.

        Returns:
            A FAISS vector store over the chunked text, or None on failure
            (errors are reported to the UI via st.error instead of raised,
            so the caller treats None as "PDF not ready").
        """
        try:
            logger.info("Starting PDF processing")
            pdf_reader = PdfReader(BytesIO(uploaded_file.getvalue()))
            # Collect per-page text and join once — avoids quadratic str += in a loop.
            # Pages with no extractable text (e.g. scanned images) yield None and are skipped.
            page_texts = [page.extract_text() for page in pdf_reader.pages]
            text = "\n".join(t for t in page_texts if t)

            if not text:
                raise ValueError("No text extracted from PDF.")

            # Chunk text; 200-char overlap preserves context across chunk boundaries.
            text_splitter = CharacterTextSplitter(separator="\n", chunk_size=800, chunk_overlap=200, length_function=len)
            chunks = text_splitter.split_text(text)

            # Lightweight embedding model that runs acceptably on CPU-only Spaces.
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})

            # Build the searchable index over the chunks.
            vector_store = FAISS.from_texts(chunks, embedding=embeddings)
            logger.info("PDF processed successfully")
            return vector_store
        except Exception as e:
            # Best-effort boundary: log, surface in the UI, and signal failure with None.
            logger.error(f"PDF processing error: {str(e)}")
            st.error(f"Error processing PDF: {str(e)}")
            return None

    def answer_question(vector_store, query):
        """Answer *query* by retrieving context from *vector_store* and generating with flan-t5.

        Args:
            vector_store: FAISS index produced by process_pdf.
            query: The user's natural-language question.

        Returns:
            The generated answer string, or a fixed fallback message on error.
        """
        try:
            logger.info(f"Answering query: {query}")
            # FIX: the original constructed the pipeline on every call, reloading
            # google/flan-t5-base per question. Cache it in session state so the
            # model is loaded once per session and reused across questions.
            if "qa_pipeline" not in st.session_state:
                st.session_state.qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
            qa_pipeline = st.session_state.qa_pipeline

            # Retrieve the 3 most similar chunks as grounding context.
            docs = vector_store.similarity_search(query, k=3)
            context = "\n".join(doc.page_content for doc in docs)

            # Simple stuffed prompt: context + question, asking for a concise answer.
            prompt = f"Use this context to answer concisely: {context}\nQuestion: {query}\nAnswer:"
            response = qa_pipeline(prompt, max_length=256, num_return_sequences=1)[0]['generated_text']

            logger.info("Answer generated")
            return response.strip()
        except Exception as e:
            # Never crash the chat loop — log, tell the user, return a sentinel answer.
            logger.error(f"Answer generation error: {str(e)}")
            st.error(f"Error answering: {str(e)}")
            return "Unable to generate answer."

    # ---- Streamlit UI with chat history ----
    # NOTE: Streamlit re-runs this script top-to-bottom on every interaction,
    # so anything that must survive a rerun lives in st.session_state.
    st.title("Smart PDF Q&A")
    st.write("Upload a PDF and ask questions! Chat history is preserved.")

    # Initialize session state on the first run of a session.
    if "messages" not in st.session_state:
        st.session_state.messages = []  # chat log: list of {"role": ..., "content": ...} dicts
    if "vector_store" not in st.session_state:
        st.session_state.vector_store = None  # FAISS index; None until a PDF is processed

    # PDF upload and process — indexing runs only on an explicit button press.
    uploaded_file = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_file:
        if st.button("Process PDF"):
            with st.spinner("Processing..."):
                vector_store = process_pdf(uploaded_file)
                if vector_store:  # process_pdf returns None on failure (error already shown)
                    st.session_state.vector_store = vector_store
                    st.success("PDF ready! Ask away.")
                    st.session_state.messages = []  # Reset chat on new PDF

    # Replay the stored chat history — widgets don't persist across reruns.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Question input — only offered once a PDF has been indexed.
    if st.session_state.vector_store:
        if prompt := st.chat_input("Ask a question:"):
            # Record and echo the user message before generating the reply.
            st.session_state.messages.append({"role": "user", "content": prompt})
            with st.chat_message("user"):
                st.markdown(prompt)
            
            # Generate the answer, render it, then persist it to the history.
            with st.chat_message("assistant"):
                with st.spinner("Thinking..."):
                    answer = answer_question(st.session_state.vector_store, prompt)
                st.markdown(answer)
            st.session_state.messages.append({"role": "assistant", "content": answer})

except Exception as e:
    # Top-level catch-all boundary: log and surface any startup/render failure in the UI.
    logger.error(f"App initialization failed: {str(e)}")
    st.error(f"Initialization error: {str(e)}. Check logs or try factory reset.")