Spaces:

aaporosh
/

Smart_PDF_QA

Sleeping

App Files Community

aaporosh committed 26 days ago

Commit

60746b2

verified ·

1 Parent(s): 7a45022

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -162

app.py CHANGED Viewed

@@ -8,13 +8,12 @@ from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 import re
-from collections import defaultdict
-# Setup logging for Hugging Face Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Lazy load models with Hugging Face compatibility in mind
 @st.cache_resource(ttl=1800)
 def load_embeddings_model():
     logger.info("Loading embeddings model")
@@ -29,8 +28,7 @@ def load_embeddings_model():
 def load_qa_pipeline():
     logger.info("Loading QA pipeline")
     try:
-        # Using a small but effective model for Spaces resource limits
-        return pipeline("text2text-generation", model="google/flan-t5-base", max_length=256)
     except Exception as e:
         logger.error(f"QA model load error: {str(e)}")
         st.error(f"QA model error: {str(e)}")
@@ -40,89 +38,67 @@ def load_qa_pipeline():
 def load_summary_pipeline():
     logger.info("Loading summary pipeline")
     try:
-        return pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", max_length=180)
     except Exception as e:
         logger.error(f"Summary model load error: {str(e)}")
         st.error(f"Summary model error: {str(e)}")
         return None
-# Enhanced code extraction: Group by lines, preserve indentation and spaces
-def extract_code_from_page(page):
-    mono_chars = [c for c in page.chars if 'fontname' in c and 'mono' in c['fontname'].lower()]
-    if not mono_chars:
-        return ""
-    # Group characters by y-coordinate (lines), rounded for precision
-    lines = defaultdict(list)
-    for c in mono_chars:
-        y_key = round(c['y1'], 2)  # Use top coordinate
-        lines[y_key].append(c)
-    code_lines = []
-    # Sort lines top to bottom (PDF y decreases downward)
-    for y in sorted(lines.keys(), reverse=True):
-        line_chars = sorted(lines[y], key=lambda c: c['x0'])
-        line_text = ''
-        prev_x1 = None
-        # Calculate average char width for spacing detection
-        if line_chars:
-            avg_width = sum(c['width'] for c in line_chars) / len(line_chars)
-        else:
-            avg_width = 1
-        for c in line_chars:
-            if prev_x1 is not None:
-                gap = c['x0'] - prev_x1
-                if gap > avg_width * 0.3:  # Threshold for adding spaces
-                    spaces = int(gap / avg_width)
-                    line_text += ' ' * spaces
-            line_text += c['text']
-            prev_x1 = c['x1']
-        code_lines.append(line_text.rstrip())  # Trim trailing spaces but keep leading for indentation
-    return '\n'.join(code_lines)
-# Process PDF with improved extraction
 def process_pdf(uploaded_file):
-    logger.info("Processing PDF")
     try:
-        full_text = ""
-        code_text = ""
         with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
-            for page in pdf.pages[:30]:  # Increased limit for better coverage, but mindful of resources
-                # Extract text with layout preservation
-                extracted_text = page.extract_text(layout=True, x_tolerance=2, y_tolerance=2)
-                if extracted_text:
-                    full_text += extracted_text + "\n\n"
-                # Extract code blocks separately
-                code_block = extract_code_from_page(page)
-                if code_block:
-                    code_text += code_block + "\n\n"
-        if not full_text:
-            raise ValueError("No text extracted from PDF. Consider enabling OCR for scanned documents.")
-        # Improved splitting: Larger chunks for context, more overlap
-        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=200)
-        text_chunks = text_splitter.split_text(full_text)[:80]  # Balanced for performance
-        code_chunks = text_splitter.split_text(code_text)[:40] if code_text else []
         embeddings_model = load_embeddings_model()
         if not embeddings_model:
-            return None, None, full_text, code_text
-        # Use from_texts for simplicity and compatibility
-        text_vector_store = FAISS.from_texts(text_chunks, embedding=embeddings_model) if text_chunks else None
-        code_vector_store = FAISS.from_texts(code_chunks, embedding=embeddings_model) if code_chunks else None
-        logger.info("PDF processed successfully")
-        return text_vector_store, code_vector_store, full_text, code_text
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         st.error(f"PDF error: {str(e)}")
         return None, None, "", ""
-# Improved summarization: More chunks, better concatenation
 def summarize_pdf(text):
     logger.info("Generating summary")
     try:
@@ -130,116 +106,76 @@ def summarize_pdf(text):
         if not summary_pipeline:
             return "Summary model unavailable."
-        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=800, chunk_overlap=100)
-        chunks = text_splitter.split_text(text)[:4]  # Balanced for quality and speed
-        summaries = [summary_pipeline(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text'].strip() for chunk in chunks]
-        combined_summary = " ".join(summaries)
-        if len(combined_summary.split()) > 180:
-            combined_summary = summary_pipeline(combined_summary, max_length=180, min_length=100, do_sample=False)[0]['summary_text']
         logger.info("Summary generated")
-        return f"Sure, here's a concise summary of the PDF:\n\n{combined_summary}"
     except Exception as e:
         logger.error(f"Summary error: {str(e)}")
         return f"Oops, something went wrong summarizing: {str(e)}"
-# Improved Q&A: Better context, code handling
 def answer_question(text_vector_store, code_vector_store, query):
     logger.info(f"Processing query: {query}")
     try:
         if not text_vector_store and not code_vector_store:
-            return "Please upload and process a PDF first!"
         qa_pipeline = load_qa_pipeline()
         if not qa_pipeline:
             return "Sorry, the QA model is unavailable right now."
-        is_code_query = any(keyword in query.lower() for keyword in ["code", "script", "function", "programming", "give me code", "show code", "extract code"])
-        vector_store = code_vector_store if is_code_query and code_vector_store else text_vector_store
         if not vector_store:
             return "No relevant content found for your query."
-        docs = vector_store.similarity_search(query, k=4)  # Increased for better context
-        context = "\n\n".join(doc.page_content for doc in docs)
-        prompt = f"Based on this context: {context}\n\nQuestion: {query}\n\nProvide a friendly, concise answer like a helpful assistant:"
-        response = qa_pipeline(prompt)[0]['generated_text'].strip()
-        if is_code_query:
-            response = f"Here's the relevant code extracted from the PDF:\n```python
         logger.info("Answer generated")
-        return f"Got it! Here's the answer:\n\n{response}"
     except Exception as e:
         logger.error(f"Query error: {str(e)}")
         return f"Sorry, something went wrong: {str(e)}"
-# Streamlit UI with improved design for Hugging Face Spaces
 try:
-    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄", layout="wide", initial_sidebar_state="expanded")
-    # Enhanced CSS for modern, dark-theme friendly design (matching the screenshot's dark mode)
     st.markdown("""
         <style>
-        /* General styling */
-        .stApp { background-color: #1e1e1e; color: #ffffff; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
-        .stSidebar { background-color: #2a2a2a; }
-        .stButton > button { background-color: #4a90e2; color: white; border: none; border-radius: 6px; padding: 8px 16px; font-weight: bold; }
-        .stButton > button:hover { background-color: #357abd; }
-        .stFileUploader { background-color: #333333; border: 1px solid #444444; border-radius: 6px; padding: 10px; }
-        .stSpinner > div { color: #4a90e2; }
-        .stSuccess, .stError { border-radius: 6px; padding: 10px; }
-        /* Chat messages */
-        .stChatMessage { border-radius: 12px; padding: 12px; margin: 8px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.2); }
-        .stChatMessage.user { background-color: #3a3a3a; color: #ffffff; }
-        .stChatMessage.assistant { background-color: #2a2a2a; color: #ffffff; }
-        /* Code blocks */
-        pre { background-color: #252525; color: #d4d4d4; padding: 12px; border-radius: 6px; overflow: auto; font-family: 'Consolas', 'Monaco', monospace; }
-        /* Title and markdown */
-        h1 { color: #ffffff; }
-        .stMarkdown { color: #d4d4d4; }
         </style>
     """, unsafe_allow_html=True)
-    # Sidebar for controls
-    with st.sidebar:
-        st.title("📄 PDF Controls")
-        st.markdown("Upload your PDF (up to 200MB) and process it.")
-        uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Drag and drop or browse to upload.")
-        if uploaded_file:
-            if st.button("Process PDF", key="process_btn"):
-                with st.spinner("Processing PDF... This may take a moment."):
-                    st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text = process_pdf(uploaded_file)
-                    if st.session_state.text_vector_store or st.session_state.code_vector_store:
-                        st.success("PDF processed successfully! You can now ask questions or summarize.")
-                        st.session_state.messages = []
-                    else:
-                        st.error("Failed to process PDF. Please try another file.")
-        if "pdf_text" in st.session_state and st.session_state.pdf_text:
-            if st.button("Summarize PDF", key="summarize_btn"):
-                with st.spinner("Generating summary..."):
-                    summary = summarize_pdf(st.session_state.pdf_text)
-                    st.session_state.messages.append({"role": "assistant", "content": summary})
-        st.markdown("---")
-        if st.session_state.get("messages"):
-            chat_text = "\n\n".join(f"**{m['role'].capitalize()}:** {m['content']}" for m in st.session_state.messages)
-            st.download_button("Download Chat History", chat_text, "chat_history.txt", use_container_width=True)
-    # Main content
-    st.title("Smart PDF Q&A")
-    st.markdown("""
-    Upload a PDF using the sidebar to ask questions, get summaries (~180 words), or extract code.
-    For code, try queries like "give me code for [topic]". Responses are designed to be quick, accurate, and user-friendly!
-    """)
-    # Session state initialization
     if "messages" not in st.session_state:
         st.session_state.messages = []
     if "text_vector_store" not in st.session_state:
@@ -251,26 +187,57 @@ try:
     if "code_text" not in st.session_state:
         st.session_state.code_text = ""
-    # Chat interface
-    chat_container = st.container()
-    with chat_container:
-        for message in st.session_state.messages:
-            with st.chat_message(message["role"]):
-                st.markdown(message["content"], unsafe_allow_html=True)
     if st.session_state.text_vector_store or st.session_state.code_vector_store:
-        prompt = st.chat_input("Ask a question (e.g., 'What's the main idea?' or 'Give me code for algorithm')")
         if prompt:
             st.session_state.messages.append({"role": "user", "content": prompt})
-            with chat_container.chat_message("user"):
                 st.markdown(prompt)
-            with chat_container.chat_message("assistant"):
-                with st.spinner("Thinking..."):
                     answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
                 st.markdown(answer, unsafe_allow_html=True)
             st.session_state.messages.append({"role": "assistant", "content": answer})
-            st.rerun()  # Rerun to update chat display
 except Exception as e:
     logger.error(f"App initialization failed: {str(e)}")
-    st.error(f"App failed to start: {str(e)}. Please check the logs or contact support.")

 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 import re
+# Setup logging for Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Lazy load models
 @st.cache_resource(ttl=1800)
 def load_embeddings_model():
     logger.info("Loading embeddings model")
 def load_qa_pipeline():
     logger.info("Loading QA pipeline")
     try:
+        return pipeline("text2text-generation", model="google/flan-t5-small", max_length=300)
     except Exception as e:
         logger.error(f"QA model load error: {str(e)}")
         st.error(f"QA model error: {str(e)}")
 def load_summary_pipeline():
     """Create the summarization pipeline used for PDF summaries.

     Returns the ``transformers`` pipeline object, or ``None`` when the model
     cannot be loaded; in that case the failure is logged and also shown in
     the Streamlit UI.
     """
     logger.info("Loading summary pipeline")
     try:
         summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6", max_length=150)
     except Exception as err:
         # Stringify once; the same text goes to the log and to the page.
         reason = str(err)
         logger.error(f"Summary model load error: {reason}")
         st.error(f"Summary model error: {reason}")
         return None
     return summarizer
+# Process PDF with enhanced extraction
 # Runs of consecutive lines that start with two or more spaces/tabs followed
 # by visible text. Leading indentation is kept in the match.
 _INDENTED_BLOCK = re.compile(r'^(?:[ \t]{2,}\S[^\n]*(?:\n|\Z))+', re.MULTILINE)


 def _monospace_lines(page):
     """Rebuild visual lines from the page characters set in a "mono" font.

     Characters are grouped by their rounded ``top`` coordinate and ordered
     left to right, giving one string per line instead of one entry per
     character. Blank lines are dropped.
     """
     rows = {}
     for char in page.chars:
         if 'mono' in (char.get('fontname') or '').lower():
             rows.setdefault(round(char['top'], 1), []).append(char)
     lines = []
     for top in sorted(rows):
         ordered = sorted(rows[top], key=lambda c: c['x0'])
         line = "".join(c['text'] for c in ordered).rstrip()
         if line.strip():
             lines.append(line)
     return lines


 def _large_font_lines(page, min_size=12):
     """Return the words larger than ``min_size``, joined per visual line.

     ``extract_words`` only reports ``size`` when asked through
     ``extra_attrs``; without it the size filter can never match.
     """
     rows = {}
     for word in page.extract_words(extra_attrs=["size"]):
         if word.get('size', 0) > min_size:
             rows.setdefault(round(word['top'], 1), []).append(word['text'])
     return [" ".join(rows[top]) for top in sorted(rows)]


 def _build_vector_store(chunks, embeddings_model):
     """Return a FAISS index over ``chunks``, or ``None`` if there are none."""
     if not chunks:
         return None
     vectors = [embeddings_model.encode(chunk) for chunk in chunks]
     return FAISS.from_embeddings(zip(chunks, vectors), embeddings_model.encode)


 def process_pdf(uploaded_file):
     """Extract text and code from an uploaded PDF and index both in FAISS.

     Only the first 20 pages are read. For each page the plain text, the
     monospace-font lines, indented text runs, table rows and large-font
     (heading-like) lines are collected.

     Args:
         uploaded_file: Object exposing ``getvalue()`` that returns the PDF
             bytes (the Streamlit upload), or ``None`` if nothing was uploaded.

     Returns:
         ``(text_vector_store, code_vector_store, text, code_text)``.
         Both stores are ``None`` when the embeddings model is unavailable
         (the extracted strings are still returned). On any failure the
         result is ``(None, None, "", "")`` and the error is logged and
         shown in the UI.
     """
     logger.info("Processing PDF with enhanced extraction")
     if uploaded_file is None:
         # The UI can call this before a file is chosen; say so plainly
         # instead of surfacing an AttributeError message.
         logger.error("PDF processing error: no file uploaded")
         st.error("PDF error: please upload a PDF file first")
         return None, None, "", ""
     try:
         text_parts = []
         code_blocks = []
         with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
             for page in pdf.pages[:20]:
                 extracted = page.extract_text(layout=False)

                 mono_lines = _monospace_lines(page)
                 if mono_lines:
                     code_blocks.append("\n".join(mono_lines))

                 if extracted:
                     # Blank line after each page so the "\n\n" splitter
                     # below has somewhere to split.
                     text_parts.append(extracted + "\n\n")
                     code_blocks.extend(
                         match.group().rstrip()
                         for match in _INDENTED_BLOCK.finditer(extracted)
                     )

                 for table in page.extract_tables() or []:
                     rows = [
                         " | ".join("" if cell is None else str(cell) for cell in row)
                         for row in table if row
                     ]
                     text_parts.append("\n".join(rows) + "\n")

                 for heading in _large_font_lines(page):
                     text_parts.append(f"\n{heading}\n")

         text = "".join(text_parts)
         # Separate blocks with a blank line: the splitter's separator is "\n\n".
         code_text = "\n\n".join(code_blocks).strip()
         if not text.strip():
             raise ValueError("No text extracted from PDF")

         text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=500, chunk_overlap=100, keep_separator=True)
         text_chunks = text_splitter.split_text(text)[:50]
         code_chunks = text_splitter.split_text(code_text)[:25] if code_text else []

         embeddings_model = load_embeddings_model()
         if not embeddings_model:
             return None, None, text, code_text

         text_vector_store = _build_vector_store(text_chunks, embeddings_model)
         code_vector_store = _build_vector_store(code_chunks, embeddings_model)
         logger.info("PDF processed successfully with enhanced extraction")
         return text_vector_store, code_vector_store, text, code_text
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         st.error(f"PDF error: {str(e)}")
         return None, None, "", ""
+# Summarize PDF
 def summarize_pdf(text):
     logger.info("Generating summary")
     try:
         if not summary_pipeline:
             return "Summary model unavailable."
+        text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=500, chunk_overlap=50)
+        chunks = text_splitter.split_text(text)[:2]
+        summaries = []
+        for chunk in chunks:
+            summary = summary_pipeline(chunk[:500], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
+            summaries.append(summary.strip())
+        combined_summary = " ".join(summaries)
+        if len(combined_summary.split()) > 150:
+            combined_summary = " ".join(combined_summary.split()[:150])
         logger.info("Summary generated")
+        return f"Sure, here's a concise summary of the PDF:\n{combined_summary}"
     except Exception as e:
         logger.error(f"Summary error: {str(e)}")
         return f"Oops, something went wrong summarizing: {str(e)}"
+# Answer question with improved response
 # Whole-word match only, so "description", "transcript", "decode" or
 # "barcode" are not mistaken for a request to show code.
 _CODE_QUERY = re.compile(r'\b(?:codes?|scripts?|functions?|programming)\b')


 def answer_question(text_vector_store, code_vector_store, query):
     """Answer a chat query from the processed PDF.

     A query that mentions code (see ``_CODE_QUERY``) returns the extracted
     code stored in ``st.session_state.code_text`` as a fenced block, provided
     a code index exists. Any other query retrieves the five most similar
     text chunks and passes them, with the question, to the QA pipeline.

     Args:
         text_vector_store: Index over the PDF text, or ``None``.
         code_vector_store: Index over the extracted code, or ``None``.
         query: The user's question.

     Returns:
         The reply text to show in the chat; failures are reported as a
         message rather than raised.
     """
     logger.info(f"Processing query: {query}")
     try:
         if not text_vector_store and not code_vector_store:
             return "Please upload a PDF first!"

         # Handle code requests before loading the QA model: they do not
         # need it, so they should still work when it is unavailable.
         if code_vector_store and _CODE_QUERY.search(query.lower()):
             return f"Here's the code from the PDF:\n```python\n{st.session_state.code_text}\n```"

         if not text_vector_store:
             return "No relevant content found for your query."

         qa_pipeline = load_qa_pipeline()
         if not qa_pipeline:
             return "Sorry, the QA model is unavailable right now."

         docs = text_vector_store.similarity_search(query, k=5)
         context = "\n".join(doc.page_content for doc in docs)
         prompt = f"Context: {context}\nQuestion: {query}\nProvide a detailed, accurate answer based on the context, prioritizing relevant information. Respond as a helpful assistant:"
         response = qa_pipeline(prompt)[0]['generated_text']
         logger.info("Answer generated")
         return f"Got it! Here's a detailed answer:\n{response.strip()}"
     except Exception as e:
         logger.error(f"Query error: {str(e)}")
         return f"Sorry, something went wrong: {str(e)}"
+# Streamlit UI
 try:
+    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄", layout="wide")
     st.markdown("""
         <style>
+        .main { max-width: 900px; margin: 0 auto; padding: 20px; }
+        .sidebar { background-color: #f8f9fa; padding: 10px; border-radius: 5px; }
+        .chat-container { border: 1px solid #ddd; border-radius: 10px; padding: 10px; height: 60vh; overflow-y: auto; margin-top: 20px; }
+        .stChatMessage { border-radius: 10px; padding: 10px; margin: 5px; max-width: 70%; }
+        .user { background-color: #e6f3ff; align-self: flex-end; }
+        .assistant { background-color: #f0f0f0; }
+        .dark .user { background-color: #2a2a72; color: #fff; }
+        .dark .assistant { background-color: #2e2e2e; color: #fff; }
+        .stButton>button { background-color: #4CAF50; color: white; border: none; padding: 8px 16px; border-radius: 5px; }
+        .stButton>button:hover { background-color: #45a049; }
+        pre { background-color: #f8f8f8; padding: 10px; border-radius: 5px; overflow-x: auto; }
+        .header { background: linear-gradient(90deg, #4CAF50, #81C784); color: white; padding: 10px; border-radius: 5px; text-align: center; }
         </style>
     """, unsafe_allow_html=True)
+    st.markdown('<div class="header"><h1>Smart PDF Q&A</h1></div>', unsafe_allow_html=True)
+    st.markdown("Upload a PDF to ask questions, summarize (~150 words), or extract code with 'give me code'. Fast and friendly responses!")
+    # Initialize session state
     if "messages" not in st.session_state:
         st.session_state.messages = []
     if "text_vector_store" not in st.session_state:
     if "code_text" not in st.session_state:
         st.session_state.code_text = ""
+    # Sidebar with toggle
+    with st.sidebar:
+        st.markdown('<div class="sidebar">', unsafe_allow_html=True)
+        theme = st.radio("Theme", ["Light", "Dark"], index=0)
+        st.markdown('</div>', unsafe_allow_html=True)
+    # PDF upload and processing
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        if st.button("Process PDF"):
+            with st.spinner("Processing PDF..."):
+                st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text = process_pdf(uploaded_file)
+                if st.session_state.text_vector_store or st.session_state.code_vector_store:
+                    st.success("PDF processed! Ask away or summarize.")
+                    st.session_state.messages = []
+                else:
+                    st.error("Failed to process PDF.")
+    with col2:
+        if st.button("Summarize PDF") and st.session_state.pdf_text:
+            with st.spinner("Summarizing..."):
+                summary = summarize_pdf(st.session_state.pdf_text)
+                st.session_state.messages.append({"role": "assistant", "content": summary})
+                st.markdown(summary, unsafe_allow_html=True)
+    # Chat interface
+    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
     if st.session_state.text_vector_store or st.session_state.code_vector_store:
+        prompt = st.chat_input("Ask a question (e.g., 'Give me code' or 'What’s the main idea?'):")
         if prompt:
             st.session_state.messages.append({"role": "user", "content": prompt})
+            with st.chat_message("user"):
                 st.markdown(prompt)
+            with st.chat_message("assistant"):
+                with st.spinner('<div class="spinner">⏳</div>'):
                     answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
                 st.markdown(answer, unsafe_allow_html=True)
             st.session_state.messages.append({"role": "assistant", "content": answer})
+    # Display chat history
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"], unsafe_allow_html=True)
+    st.markdown('</div>', unsafe_allow_html=True)
+    # Download chat history
+    if st.session_state.messages:
+        chat_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in st.session_state.messages)
+        st.download_button("Download Chat History", chat_text, "chat_history.txt")
 except Exception as e:
     logger.error(f"App initialization failed: {str(e)}")
+    st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")