Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Running

App Files Community

DrishtiSharma committed 10 days ago

Commit

23248f2

verified ·

1 Parent(s): c6ec0a2

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -17

app.py CHANGED Viewed

@@ -54,23 +54,54 @@ if pdf_source == "Upload a PDF file":
         st.session_state.vector_created = False
 elif pdf_source == "Enter a PDF URL":
-    pdf_url = st.text_input("Enter PDF URL:", value = "https://arxiv.org/pdf/2406.06998")
-    if pdf_url and not st.session_state.pdf_path:
-        with st.spinner("Downloading PDF..."):
-            try:
-                response = requests.get(pdf_url)
-                if response.status_code == 200:
-                    st.session_state.pdf_path = "temp.pdf"
-                    with open(st.session_state.pdf_path, "wb") as f:
-                        f.write(response.content)
-                    st.session_state.pdf_loaded = False
-                    st.session_state.chunked = False
-                    st.session_state.vector_created = False
-                    st.success("✅ PDF Downloaded Successfully!")
-                else:
-                    st.error("❌ Failed to download PDF. Check the URL.")
-            except Exception as e:
-                st.error(f"❌ Error downloading PDF: {e}")
 # Step 2: Load & Process PDF (Only Once)
 if st.session_state.pdf_path and not st.session_state.pdf_loaded:

         st.session_state.vector_created = False
 elif pdf_source == "Enter a PDF URL":
+pdf_url = st.text_input("Enter PDF URL:", key="pdf_url", on_change=lambda: st.session_state.update({"process_pdf": True}))
+if st.session_state.get("process_pdf") and pdf_url:  # ✅ Triggered only when Enter is pressed
+    with st.spinner("Downloading PDF..."):
+        try:
+            # Download PDF
+            response = requests.get(pdf_url)
+            if response.status_code == 200:
+                st.session_state.pdf_path = "temp.pdf"
+                with open(st.session_state.pdf_path, "wb") as f:
+                    f.write(response.content)
+                st.success("✅ PDF Downloaded Successfully!")
+            else:
+                st.error("❌ Failed to download PDF. Check the URL.")
+                st.stop()
+            # Step 2: Load PDF
+            st.spinner("Loading PDF...")
+            loader = PDFPlumberLoader(st.session_state.pdf_path)
+            docs = loader.load()
+            st.session_state.documents = docs
+            st.session_state.pdf_loaded = True
+            st.success(f"✅ **PDF Loaded!** Total Pages: {len(docs)}")
+            # Step 3: Chunking the document
+            st.spinner("Chunking the document...")
+            model_name = "nomic-ai/modernbert-embed-base"
+            embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': 'cpu'})
+            text_splitter = SemanticChunker(embedding_model)
+            if st.session_state.documents:
+                documents = text_splitter.split_documents(st.session_state.documents)
+                st.session_state.documents = documents
+                st.session_state.chunked = True
+                # Save chunks for persistence
+                CHUNKS_FILE = "/tmp/chunks.pkl"
+                with open(CHUNKS_FILE, "wb") as f:
+                    pickle.dump(documents, f)
+                st.success(f"✅ **Document Chunked!** Total Chunks: {len(documents)}")
+            # Reset trigger to prevent looping
+            st.session_state.process_pdf = False
+        except Exception as e:
+            st.error(f"❌ Error: {e}")
 # Step 2: Load & Process PDF (Only Once)
 if st.session_state.pdf_path and not st.session_state.pdf_loaded: