aaporosh committed on
Commit ce4cfdb · verified · 1 Parent(s): df63323

Update app.py

Files changed (1)
  1. app.py +165 -84
app.py CHANGED
@@ -1,105 +1,186 @@
  import streamlit as st
  import os
  from io import BytesIO
- import numpy as np
- from PyPDF2 import PdfReader
- from langchain.chains import RetrievalQA
  from langchain.text_splitter import CharacterTextSplitter
- from langchain_community.embeddings import HuggingFaceEmbeddings
  from langchain_community.vectorstores import FAISS
- from langchain.llms import HuggingFaceHub
- from langchain.prompts import PromptTemplate
- from langchain.chains.question_answering import load_qa_chain

- # Check for Hugging Face API token
- if "HUGGINGFACEHUB_API_TOKEN" not in os.environ:
-     st.error("HUGGINGFACEHUB_API_TOKEN not set. Please add it in Space settings under secrets.")
-     st.stop()

- try:
-     # Function to process PDF and build vector store
-     def process_input(uploaded_file):
-         try:
-             pdf_reader = PdfReader(uploaded_file)
-             text = ""
-             for page in pdf_reader.pages:
                  extracted = page.extract_text()
                  if extracted:
-                     text += extracted
-                 else:
-                     st.warning("Some pages could not be extracted.")
-
              if not text:
-                 raise ValueError("No text extracted from PDF.")
-
-             # Split text into chunks
-             text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100, length_function=len)
-             chunks = text_splitter.split_text(text)
-
-             # Create embeddings
-             embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", model_kwargs={'device': 'cpu'})
-
-             # Create vector store
-             vector_store = FAISS.from_texts(chunks, embedding=embeddings)
-             return vector_store
-         except Exception as e:
-             st.error(f"Error processing PDF: {str(e)}")
-             return None

-     # Function to answer questions
-     def answer_question(vector_store, query):
-         try:
-             llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.1", model_kwargs={"temperature": 0.7, "max_length": 512})
-
-             # Retrieve relevant documents
-             retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
-
-             # Prompt template
-             prompt_template = """Use the provided context to answer the question concisely:
-
-             Context: {context}
-
-             Question: {question}
-
-             Answer:"""
-             PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
-
-             # QA chain
-             chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
-             docs = retriever.get_relevant_documents(query)
-             response = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
-             return response["output_text"].split("Answer:")[-1].strip()
-         except Exception as e:
-             st.error(f"Error answering question: {str(e)}")
-             return "Unable to generate answer."

-     # Streamlit UI
-     def main():
-         st.title("Smart PDF Q&A")

-         # PDF upload
-         uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

-         if uploaded_file:
              if st.button("Process PDF"):
-                 with st.spinner("Processing..."):
-                     vector_store = process_input(BytesIO(uploaded_file.getvalue()))
-                     if vector_store:
-                         st.session_state.vector_store = vector_store
-                         st.success("PDF processed! Ask questions below.")
                      else:
                          st.error("Failed to process PDF.")
-
-         # Question input
-         if "vector_store" in st.session_state:
-             query = st.text_input("Ask a question about the PDF:")
-             if st.button("Submit") and query:
                  with st.spinner("Generating answer..."):
-                     answer = answer_question(st.session_state.vector_store, query)
-                     st.write("**Answer:**", answer)

-     if __name__ == "__main__":
-         main()

  except Exception as e:
-     st.error(f"Application failed to initialize: {str(e)}")
 
 
  import streamlit as st
+ import logging
  import os
  from io import BytesIO
+ import pdfplumber
  from langchain.text_splitter import CharacterTextSplitter
  from langchain_community.vectorstores import FAISS
+ from sentence_transformers import SentenceTransformer
+ from transformers import pipeline

+ # Setup logging for Spaces
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)

+ # Lazy load models
+ @st.cache_resource(ttl=3600)
+ def load_embeddings_model():
+     logger.info("Loading embeddings model")
+     try:
+         return SentenceTransformer("all-MiniLM-L6-v2")
+     except Exception as e:
+         logger.error(f"Embeddings load error: {str(e)}")
+         st.error(f"Embedding model error: {str(e)}")
+         return None
+
+ @st.cache_resource(ttl=3600)
+ def load_qa_pipeline():
+     logger.info("Loading QA pipeline")
+     try:
+         return pipeline("text2text-generation", model="google/flan-t5-base", max_length=300)
+     except Exception as e:
+         logger.error(f"QA model load error: {str(e)}")
+         st.error(f"QA model error: {str(e)}")
+         return None
+
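The two loaders above are wrapped in `st.cache_resource`, so each model is built once per server process and reused across Streamlit's script reruns until the one-hour TTL expires. A minimal sketch of the pattern in isolation; `SlowModel` is illustrative, not part of this app:

```python
import time

import streamlit as st

class SlowModel:
    """Stand-in for an expensive model load (illustrative only)."""
    def __init__(self):
        time.sleep(2)  # simulate a slow download/initialization

@st.cache_resource(ttl=3600)
def load_model():
    # The body runs once; reruns within the TTL get the cached object back.
    return SlowModel()

model = load_model()  # slow on the first run, effectively instant afterwards
st.write("Model ready:", type(model).__name__)
```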
+ # Process PDF
+ def process_pdf(uploaded_file):
+     logger.info("Processing PDF")
+     try:
+         text = ""
+         with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
+             for page in pdf.pages:
                  extracted = page.extract_text()
                  if extracted:
+                     text += extracted + "\n"
+
+         if not text:
+             # Optional OCR (uncomment if needed, requires pdf2image, pytesseract)
+             # from pdf2image import convert_from_bytes
+             # import pytesseract
+             # images = convert_from_bytes(uploaded_file.getvalue())
+             # text = "".join(pytesseract.image_to_string(img) for img in images)
              if not text:
+                 raise ValueError("No text extracted from PDF")
+
+         text_splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=150)
+         chunks = text_splitter.split_text(text)
+
+         embeddings_model = load_embeddings_model()
+         if not embeddings_model:
+             return None, text
+
+         embeddings = [embeddings_model.encode(chunk) for chunk in chunks]
+         vector_store = FAISS.from_embeddings(zip(chunks, embeddings), embeddings_model.encode)
+         logger.info("PDF processed successfully")
+         return vector_store, text
+     except Exception as e:
+         logger.error(f"PDF processing error: {str(e)}")
+         st.error(f"PDF error: {str(e)}")
+         return None, ""
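`process_pdf` computes the chunk vectors itself and passes `(text, vector)` pairs to `FAISS.from_embeddings`, along with the bare `encode` callable the store uses to embed queries. A standalone sketch of that pattern; note that recent `langchain_community` releases prefer a LangChain `Embeddings` object over a bare callable and may emit a deprecation warning for it:

```python
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
chunks = [
    "FAISS is a library for efficient similarity search over dense vectors.",
    "Streamlit turns Python scripts into interactive web apps.",
]

# Precompute one vector per chunk, then hand (text, vector) pairs to the store.
vectors = [model.encode(chunk) for chunk in chunks]
store = FAISS.from_embeddings(zip(chunks, vectors), model.encode)

# At query time the store calls model.encode on the query string.
docs = store.similarity_search("What does FAISS do?", k=1)
print(docs[0].page_content)
```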
 
+ # Summarize PDF
+ def summarize_pdf(text):
+     logger.info("Generating summary")
+     try:
+         qa_pipeline = load_qa_pipeline()
+         if not qa_pipeline:
+             return "Summary model unavailable."
+
+         # Split text for summarization if too long
+         text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
+         chunks = text_splitter.split_text(text)
+         summaries = []
+
+         for chunk in chunks[:3]:  # Limit to first 3 chunks for brevity
+             prompt = f"Summarize this text in 60-80 words, highlighting key points:\n{chunk}"
+             summary = qa_pipeline(prompt, max_length=100)[0]['generated_text']
+             summaries.append(summary.strip())
+
+         combined_summary = " ".join(summaries)
+         if len(combined_summary.split()) > 200:
+             combined_summary = " ".join(combined_summary.split()[:200])
+         logger.info("Summary generated")
+         return combined_summary
+     except Exception as e:
+         logger.error(f"Summary error: {str(e)}")
+         return f"Error summarizing: {str(e)}"
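`summarize_pdf` prompts the same `flan-t5-base` pipeline once per chunk and joins the partial summaries afterwards. A minimal sketch of a single such call, assuming the model downloads successfully:

```python
from transformers import pipeline

# text2text-generation runs flan-t5 as a prompted sequence-to-sequence model.
summarizer = pipeline("text2text-generation", model="google/flan-t5-base")

chunk = (
    "Streamlit reruns the whole script on every interaction, so expensive "
    "objects such as models are normally cached with st.cache_resource."
)
prompt = f"Summarize this text in 60-80 words, highlighting key points:\n{chunk}"

# The pipeline returns a list of dicts keyed by 'generated_text'.
summary = summarizer(prompt, max_length=100)[0]["generated_text"]
print(summary.strip())
```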
 
+ # Answer question
+ def answer_question(vector_store, query):
+     logger.info(f"Processing query: {query}")
+     try:
+         if not vector_store:
+             return "Please upload a PDF first."

+         qa_pipeline = load_qa_pipeline()
+         if not qa_pipeline:
+             return "QA model unavailable."

+         docs = vector_store.similarity_search(query, k=3)
+         context = "\n".join(doc.page_content for doc in docs)
+         prompt = f"Context: {context}\nQuestion: {query}\nAnswer concisely:"
+         response = qa_pipeline(prompt)[0]['generated_text']
+         logger.info("Answer generated")
+         return response.strip()
+     except Exception as e:
+         logger.error(f"Query error: {str(e)}")
+         return f"Error answering: {str(e)}"
+
+ # Streamlit UI
+ try:
+     st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄")
+     st.title("Smart PDF Q&A")
+     st.markdown("""
+     Upload a PDF to ask questions or get a summary (up to 200 words). Chat history is preserved.
+     <style>
+     .stChatMessage { border-radius: 10px; padding: 10px; margin: 5px; }
+     .stChatMessage.user { background-color: #e6f3ff; }
+     .stChatMessage.assistant { background-color: #f0f0f0; }
+     </style>
+     """, unsafe_allow_html=True)
+
+     # Initialize session state
+     if "messages" not in st.session_state:
+         st.session_state.messages = []
+     if "vector_store" not in st.session_state:
+         st.session_state.vector_store = None
+     if "pdf_text" not in st.session_state:
+         st.session_state.pdf_text = ""
+
+     # PDF upload
+     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+     if uploaded_file:
+         col1, col2 = st.columns([1, 1])
+         with col1:
              if st.button("Process PDF"):
+                 with st.spinner("Processing PDF..."):
+                     st.session_state.vector_store, st.session_state.pdf_text = process_pdf(uploaded_file)
+                     if st.session_state.vector_store:
+                         st.success("PDF processed! Ask questions or summarize.")
+                         st.session_state.messages = []
                      else:
                          st.error("Failed to process PDF.")
+         with col2:
+             if st.button("Summarize PDF") and st.session_state.pdf_text:
+                 with st.spinner("Generating summary..."):
+                     summary = summarize_pdf(st.session_state.pdf_text)
+                     st.session_state.messages.append({"role": "assistant", "content": f"**Summary**: {summary}"})
+                     st.markdown(f"**Summary**: {summary}")
+
+     # Chat interface
+     if st.session_state.vector_store:
+         prompt = st.chat_input("Ask a question about the PDF:")
+         if prompt:
+             st.session_state.messages.append({"role": "user", "content": prompt})
+             with st.chat_message("user"):
+                 st.markdown(prompt)
+             with st.chat_message("assistant"):
                  with st.spinner("Generating answer..."):
+                     answer = answer_question(st.session_state.vector_store, prompt)
+                     st.markdown(answer)
+             st.session_state.messages.append({"role": "assistant", "content": answer})
+
+     # Display chat history
+     for message in st.session_state.messages:
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])

+     # Download chat history
+     if st.session_state.messages:
+         chat_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in st.session_state.messages)
+         st.download_button("Download Chat History", chat_text, "chat_history.txt")

  except Exception as e:
+     logger.error(f"App initialization failed: {str(e)}")
+     st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")