aaporosh committed
Commit fecb449 · verified · 1 Parent(s): d9893e1

Update app.py

Files changed (1)
  1. app.py +62 -29
app.py CHANGED
@@ -1,60 +1,93 @@
 import streamlit as st
 import pdfplumber
-from transformers import pipeline
 import re
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.chat_models import ChatOpenAI
+from langchain.chains import ConversationalRetrievalChain
+from transformers import pipeline
 
-# Load models once for speed
-qa_model = pipeline("question-answering", model="google/flan-t5-large", tokenizer="google/flan-t5-large")
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+# -------------------- PAGE CONFIG --------------------
+st.set_page_config(page_title="Smart PDF Chatbot", layout="wide")
 
-st.set_page_config(page_title="Smart PDF Chatbot & Summarizer", layout="wide")
-st.title("📄 Smart PDF Chatbot & Summarizer")
+# -------------------- MODELS --------------------
+@st.cache_resource
+def load_models():
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    return embeddings, summarizer
 
-# Sidebar settings
-st.sidebar.header("⚙️ Settings")
-max_length = st.sidebar.slider("Summary Length", 50, 500, 250)
+embeddings, summarizer = load_models()
 
-# Upload PDF
-uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+# -------------------- TITLE --------------------
+st.title("📄 Smart PDF Chatbot & Summarizer")
+
+# -------------------- UPLOAD PDF --------------------
+uploaded_file = st.file_uploader("📤 Upload your PDF file", type=["pdf"])
 
 if uploaded_file:
+    # Extract text from PDF
     with pdfplumber.open(uploaded_file) as pdf:
         text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
 
     if not text.strip():
-        st.error("Couldn't extract text from this PDF.")
+        st.error("⚠️ Could not extract text from this PDF.")
     else:
+        # Split into chunks for better retrieval
+        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+        chunks = splitter.split_text(text)
+
+        # Build vector store for retrieval
+        vector_store = FAISS.from_texts(chunks, embedding=embeddings)
+        retriever = vector_store.as_retriever()
+
+        # Create conversational chain with memory
+        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
+        qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
+
+        # Tabs for Chat, Summary, and Code
         tabs = st.tabs(["💬 Chat with PDF", "📝 Summarize PDF", "💻 Extract Code"])
 
-        # Chat tab
+        # -------------------- CHAT TAB --------------------
         with tabs[0]:
             st.subheader("Ask Questions About Your PDF")
-            question = st.text_input("Enter your question:")
-            if st.button("Ask", key="qa") and question:
-                try:
-                    result = qa_model(question=question, context=text)
-                    st.success(result['answer'])
-                except Exception as e:
-                    st.error(f"Error: {e}")
+            if "chat_history" not in st.session_state:
+                st.session_state.chat_history = []
+
+            user_input = st.text_input("Enter your question:", key="chat_input")
+            if st.button("Send"):
+                result = qa_chain({"question": user_input, "chat_history": st.session_state.chat_history})
+                st.session_state.chat_history.append((user_input, result["answer"]))
+
+            for q, a in st.session_state.chat_history:
+                st.markdown(f"**You:** {q}")
+                st.markdown(f"**Bot:** {a}")
 
-        # Summarization tab
+        # -------------------- SUMMARY TAB --------------------
         with tabs[1]:
-            st.subheader("PDF Summary")
+            st.subheader("📘 PDF Summary")
             if st.button("Generate Summary", key="sum"):
                 try:
-                    summary = summarizer(text, max_length=max_length, min_length=30, do_sample=False)
-                    st.info(summary[0]['summary_text'])
+                    # Summarize in chunks for long PDFs
+                    summaries = []
+                    for i in range(0, len(chunks), 3):
+                        chunk_text = " ".join(chunks[i:i+3])
+                        summary = summarizer(chunk_text, max_length=150, min_length=30, do_sample=False)
+                        summaries.append(summary[0]['summary_text'])
+                    final_summary = " ".join(summaries)
+                    st.info(final_summary)
                 except Exception as e:
-                    st.error(f"Error: {e}")
+                    st.error(f"Summarization error: {e}")
 
-        # Code extraction tab
+        # -------------------- CODE EXTRACTION TAB --------------------
         with tabs[2]:
-            st.subheader("Extracted Programming Code")
-            code_blocks = re.findall(r'```[a-zA-Z]*([\s\S]*?)```', text)
+            st.subheader("🧑‍💻 Extracted Code Blocks")
+            code_blocks = re.findall(r"```[a-zA-Z]*([\s\S]*?)```", text)
             if code_blocks:
                 for idx, code in enumerate(code_blocks, 1):
                     st.code(code, language="python")
             else:
                 st.warning("No code blocks found in this PDF.")
 else:
-    st.info("👆 Please upload a PDF to start.")
+    st.info("👆 Please upload a PDF to get started.")
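For reference, below is a minimal sketch of the retrieval flow this commit introduces, run outside Streamlit on a plain string. It reuses the same LangChain classes as the new app.py; it assumes the OPENAI_API_KEY environment variable is set for ChatOpenAI, that faiss-cpu and sentence-transformers are installed, and a LangChain release where these import paths still resolve (newer releases move them to langchain_community / langchain_openai). The sample_text string and the question are illustrative placeholders, not part of the commit.

# Minimal sketch (not part of the commit): the same chain as app.py, on a plain string.
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

assert os.environ.get("OPENAI_API_KEY"), "ChatOpenAI needs an OpenAI API key"

sample_text = "FAISS is a library for efficient similarity search over dense vectors."  # placeholder for extracted PDF text

# Split, embed, and index the text, then expose the index as a retriever.
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(sample_text)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
retriever = FAISS.from_texts(chunks, embedding=embeddings).as_retriever()

# Conversational retrieval: the running (question, answer) history is passed in on every call.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    retriever=retriever,
)
chat_history = []
result = qa_chain({"question": "What is FAISS used for?", "chat_history": chat_history})
chat_history.append(("What is FAISS used for?", result["answer"]))
print(result["answer"])

Because no memory object is attached to the chain, follow-up questions depend entirely on the chat_history list, which app.py keeps in st.session_state between Streamlit reruns.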