DrishtiSharma committed (verified)
Commit 7f19084 · 1 Parent(s): c576780

Update lab/title_issue_attempt1.py

Files changed (1)
  1. lab/title_issue_attempt1.py +87 -42
lab/title_issue_attempt1.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 import os
+import json
 import requests
 import pdfplumber
 import chromadb
@@ -14,8 +15,7 @@ from langchain_groq import ChatGroq
 from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
 
 # ----------------- Streamlit UI Setup -----------------
-st.set_page_config(page_title="Blah", layout="centered")
-st.title("Blah-1")
+st.set_page_config(page_title="Blah-1", layout="centered")
 
 # ----------------- API Keys -----------------
 os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
@@ -30,7 +30,6 @@ rag_llm.verbose = True
 # Clear ChromaDB cache to fix tenant issue
 chromadb.api.client.SharedSystemClient.clear_system_cache()
 
-st.title("Blah")
 
 # ----------------- ChromaDB Persistent Directory -----------------
 CHROMA_DB_DIR = "/mnt/data/chroma_db"
@@ -48,34 +47,80 @@ if "processed_chunks" not in st.session_state:
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 
-# ----------------- Improved Metadata Extraction -----------------
+
+# ----------------- Text Cleaning Functions -----------------
+def clean_extracted_text(text):
+    """
+    Cleans extracted PDF text by removing excessive line breaks, fixing spacing issues, and resolving OCR artifacts.
+    """
+    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
+    text = re.sub(r'\s{2,}', ' ', text)  # Remove extra spaces
+    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # Fix hyphenated words split by a newline
+    return text.strip()
+
+def extract_title_manually(text):
+    """
+    Attempts to find the title by checking the first few lines.
+    - Titles are usually long enough (more than 5 words).
+    - Ignores common header text like "Abstract", "Introduction".
+    """
+    lines = text.split("\n")
+    ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
+
+    for line in lines[:5]:  # Check only the first 5 lines
+        clean_line = line.strip()
+        if len(clean_line.split()) > 5 and not any(word.lower() in clean_line.lower() for word in ignore_keywords):
+            return clean_line  # Return first valid title
+    return "Unknown"
+
+# ----------------- Metadata Extraction -----------------
+# ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
-    """Extracts title, author, emails, and affiliations from PDF."""
+    """Extracts metadata using simple heuristics without LLM."""
+
     with pdfplumber.open(pdf_path) as pdf:
-        metadata = pdf.metadata or {}
-
-        # Extract title
-        title = metadata.get("Title", "").strip()
-        if not title and pdf.pages:
-            text = pdf.pages[0].extract_text()
-            title_match = re.search(r"(?i)title[:\-]?\s*(.*)", text or "")
-            title = title_match.group(1) if title_match else text.split("\n")[0] if text else "Untitled Document"
-
-        # Extract author
-        author = metadata.get("Author", "").strip()
-        if not author and pdf.pages:
-            author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", pdf.pages[0].extract_text() or "")
-            author = author_match.group(1).strip() if author_match else "Unknown Author"
-
-        # Extract emails
-        emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", pdf.pages[0].extract_text() or "")
-        email_str = ", ".join(emails) if emails else "No emails found"
+        if not pdf.pages:
+            return {
+                "Title": "Unknown",
+                "Author": "Unknown",
+                "Emails": "No emails found",
+                "Affiliations": "No affiliations found"
+            }
+
+        # Extract text from the first page
+        first_page_text = pdf.pages[0].extract_text() or "No text found."
+        cleaned_text = clean_extracted_text(first_page_text)
+
+        # Extract Title
+        pre_extracted_title = extract_title_manually(cleaned_text)
+
+        # Extract Authors (Names typically appear before affiliations)
+        author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
+        authors = "Unknown"
+        for line in cleaned_text.split("\n"):
+            match = author_pattern.search(line)
+            if match:
+                authors = match.group(0)
+                break
+
+        # Extract Emails
+        email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+        emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
+
+        # Extract Affiliations (usually below author names)
+        affiliations = "Unknown"
+        for i, line in enumerate(cleaned_text.split("\n")):
+            if "@" in line:  # Email appears before affiliations
+                affiliations = cleaned_text.split("\n")[i + 1] if i + 1 < len(cleaned_text.split("\n")) else "Unknown"
+                break
+
+        return {
+            "Title": pre_extracted_title,
+            "Author": authors,
+            "Emails": emails,
+            "Affiliations": affiliations
+        }
 
-        # Extract affiliations
-        affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", pdf.pages[0].extract_text() or "")
-        affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
-
-        return title, author, email_str, affiliation_str
 
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
@@ -118,22 +163,25 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
     st.json(docs[0].metadata)
 
     # Extract metadata
-    title, author, email_str, affiliation_str = extract_metadata(st.session_state.pdf_path)
-
-    # Display extracted metadata
-    st.subheader("📄 Extracted Document Metadata")
-    st.write(f"**Title:** {title}")
-    st.write(f"**Author:** {author}")
-    st.write(f"**Emails:** {email_str}")
-    st.write(f"**Affiliations:** {affiliation_str}")
+    metadata = extract_metadata(st.session_state.pdf_path)
+
+    # Display extracted-metadata
+    if isinstance(metadata, dict):
+        st.subheader("📄 Extracted Document Metadata")
+        st.write(f"**Title:** {metadata.get('Title', 'Unknown')}")
+        st.write(f"**Author:** {metadata.get('Author', 'Unknown')}")
+        st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
+        st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
+    else:
+        st.error("Metadata extraction failed.")
 
     # Embedding Model
     model_name = "nomic-ai/modernbert-embed-base"
     embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
 
     # Convert metadata into a retrievable chunk
-    metadata_text = f"Title: {title}\nAuthor: {author}\nEmails: {email_str}\nAffiliations: {affiliation_str}"
-    metadata_doc = {"page_content": metadata_text, "metadata": {"source": "metadata"}}
+    metadata_doc = {"page_content": metadata, "metadata": {"source": "metadata"}}
+
 
     # Prevent unnecessary re-chunking
     if not st.session_state.chunked:
@@ -191,9 +239,6 @@ if query:
     st.markdown("### Extracted Relevant Contexts")
     st.json(contexts["relevant_contexts"])
 
-    st.markdown("### RAG Final Response")
-    st.write(final_response["final_response"])
-
    st.subheader("context_relevancy_evaluation_chain Statement")
     st.json(final_response["relevancy_response"])
 
@@ -204,4 +249,4 @@ if query:
     st.json(final_response["relevant_contexts"])
 
     st.subheader("RAG Response Statement")
-    st.json(final_response["final_response"])
+    st.json(final_response["final_response"])
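
The two helpers introduced by this commit are pure string utilities, so they can be sanity-checked outside Streamlit. A minimal sketch follows: the helper bodies and the email regex are copied from the new version of the file, while the sample first-page text and the printed results are hypothetical illustrations, not part of the commit.

import re

def clean_extracted_text(text):
    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
    text = re.sub(r'\s{2,}', ' ', text)  # Remove extra spaces
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # Fix hyphenated words split by a newline
    return text.strip()

def extract_title_manually(text):
    lines = text.split("\n")
    ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
    for line in lines[:5]:  # Check only the first 5 lines
        clean_line = line.strip()
        if len(clean_line.split()) > 5 and not any(word.lower() in clean_line.lower() for word in ignore_keywords):
            return clean_line  # Return first valid title
    return "Unknown"

# Hypothetical first page, with a hyphenated line break as pdfplumber might return it
sample = ("A Study of Retrieval-Augmented Generation for Scien-\n"
          "tific PDFs\n"
          "Jane Doe, John Smith\n"
          "Department of Computer Science, Example University\n"
          "jane.doe@example.edu\n"
          "Abstract\n")

cleaned = clean_extracted_text(sample)
print(extract_title_manually(cleaned))
# -> A Study of Retrieval-Augmented Generation for Scientific PDFs

email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
print(", ".join(email_pattern.findall(cleaned)) or "No emails found")
# -> jane.doe@example.edu

The hyphen fix runs after whitespace collapsing, so "Scien-\ntific" is re-joined into "Scientific" and the title line passes the length-and-keyword filter in extract_title_manually.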