DrishtiSharma committed
Commit 7208469 · verified · 1 Parent(s): a4e4b71

Update lab/title_issue.py

Files changed (1):
  lab/title_issue.py  +53 -32
lab/title_issue.py CHANGED
@@ -1,12 +1,15 @@
 import streamlit as st
 import os
 import requests
+import pdfplumber
 import chromadb
+import re
 from langchain.document_loaders import PDFPlumberLoader
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_experimental.text_splitter import SemanticChunker
 from langchain_chroma import Chroma
-from langchain.chains import LLMChain, SequentialChain
+from langchain_core.documents import Document
+from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain_groq import ChatGroq
 from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
@@ -18,8 +21,9 @@ st.title("Blah-1")
 # ----------------- API Keys -----------------
 os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
 
-# ----------------- Clear ChromaDB Cache -----------------
-chromadb.api.client.SharedSystemClient.clear_system_cache()
+# ----------------- ChromaDB Persistent Directory -----------------
+CHROMA_DB_DIR = "/mnt/data/chroma_db"
+os.makedirs(CHROMA_DB_DIR, exist_ok=True)
 
 # ----------------- Initialize Session State -----------------
 if "pdf_loaded" not in st.session_state:
@@ -33,47 +37,49 @@
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 
-# ----------------- Load Models -----------------
-llm_judge = ChatGroq(model="deepseek-r1-distill-llama-70b")
-rag_llm = ChatGroq(model="mixtral-8x7b-32768")
-
-# Enable verbose logging for debugging
-llm_judge.verbose = True
-rag_llm.verbose = True
+# ----------------- Improved Metadata Extraction -----------------
+def extract_metadata(pdf_path):
+    """Extract title, author, emails, and affiliations from a PDF."""
+    with pdfplumber.open(pdf_path) as pdf:
+        metadata = pdf.metadata or {}
+        # Parse the first page once and reuse it for every fallback below.
+        first_text = (pdf.pages[0].extract_text() or "") if pdf.pages else ""
+
+        # Title: embedded metadata first, then a "Title:" line, then the first line of text.
+        title = metadata.get("Title", "").strip()
+        if not title:
+            title_match = re.search(r"(?i)title[:\-]?\s*(.*)", first_text)
+            title = title_match.group(1).strip() if title_match else (first_text.split("\n")[0] if first_text else "Untitled Document")
+
+        # Author: embedded metadata first, then a "by ..." pattern on the first page.
+        author = metadata.get("Author", "").strip()
+        if not author:
+            author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", first_text)
+            author = author_match.group(1).strip() if author_match else "Unknown Author"
+
+        # Emails anywhere on the first page.
+        emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_text)
+        email_str = ", ".join(emails) if emails else "No emails found"
+
+        # Affiliation keywords on the first page.
+        affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_text)
+        affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
+
+        return title, author, email_str, affiliation_str
 
 # ----------------- PDF Selection -----------------
-#st.subheader("PDF Selection")
 pdf_source = st.radio("Choose a PDF source:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
 
 if pdf_source == "Upload a PDF file":
     uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
     if uploaded_file:
-        st.session_state.pdf_path = "temp.pdf"
+        st.session_state.pdf_path = "/mnt/data/temp.pdf"
         with open(st.session_state.pdf_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
         st.session_state.pdf_loaded = False
         st.session_state.chunked = False
         st.session_state.vector_created = False
 
-elif pdf_source == "Enter a PDF URL":
-    pdf_url = st.text_input("Enter PDF URL:")
-    if pdf_url and not st.session_state.pdf_loaded:
-        with st.spinner("🔄 Downloading PDF..."):
-            try:
-                response = requests.get(pdf_url)
-                if response.status_code == 200:
-                    st.session_state.pdf_path = "temp.pdf"
-                    with open(st.session_state.pdf_path, "wb") as f:
-                        f.write(response.content)
-                    st.session_state.pdf_loaded = False
-                    st.session_state.chunked = False
-                    st.session_state.vector_created = False
-                    st.success("✅ PDF Downloaded Successfully!")
-                else:
-                    st.error("❌ Failed to download PDF. Check the URL.")
-            except Exception as e:
-                st.error(f"Error downloading PDF: {e}")
-
 # ----------------- Process PDF -----------------
 if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
     with st.spinner("🔄 Processing document... Please wait."):
@@ -81,14 +87,29 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
         docs = loader.load()
         st.json(docs[0].metadata)
 
+        # Extract metadata
+        title, author, email_str, affiliation_str = extract_metadata(st.session_state.pdf_path)
+
+        # Display extracted metadata
+        st.subheader("📄 Extracted Document Metadata")
+        st.write(f"**Title:** {title}")
+        st.write(f"**Author:** {author}")
+        st.write(f"**Emails:** {email_str}")
+        st.write(f"**Affiliations:** {affiliation_str}")
+
         # Embedding Model
         model_name = "nomic-ai/modernbert-embed-base"
-        embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs = {'normalize_embeddings': False})
+        embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
+
+        # Convert metadata into a retrievable chunk (a Document, so the vector store can embed it)
+        metadata_text = f"Title: {title}\nAuthor: {author}\nEmails: {email_str}\nAffiliations: {affiliation_str}"
+        metadata_doc = Document(page_content=metadata_text, metadata={"source": "metadata"})
 
         # Prevent unnecessary re-chunking
        if not st.session_state.chunked:
            text_splitter = SemanticChunker(embedding_model)
            document_chunks = text_splitter.split_documents(docs)
+            document_chunks.insert(0, metadata_doc)  # Insert metadata as a retrievable document
            st.session_state.processed_chunks = document_chunks
            st.session_state.chunked = True
 
@@ -99,6 +120,7 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
 if not st.session_state.vector_created and st.session_state.processed_chunks:
     with st.spinner("🔄 Initializing Vector Store..."):
         st.session_state.vector_store = Chroma(
+            persist_directory=CHROMA_DB_DIR,  # <-- ensures the index persists across restarts
             collection_name="deepseek_collection",
             collection_metadata={"hnsw:space": "cosine"},
             embedding_function=embedding_model
@@ -107,6 +129,7 @@ if not st.session_state.vector_created and st.session_state.processed_chunks:
         st.session_state.vector_created = True
         st.success("✅ Vector store initialized successfully!")
 
+
 # ----------------- Query Input -----------------
 query = st.text_input("🔍 Ask a question about the document:")
 
@@ -151,4 +174,4 @@ if query:
         st.json(final_response["relevant_contexts"])
 
     st.subheader("RAG Response Statement")
-    st.json(final_response["final_response"])
+    st.json(final_response["final_response"])
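A quick way to sanity-check the new extract_metadata heuristics outside Streamlit is a standalone sketch of the same pdfplumber logic; the "sample.pdf" path is hypothetical, and any local PDF works:

# Standalone smoke test mirroring extract_metadata's fallbacks (sketch).
import re
import pdfplumber

pdf_path = "sample.pdf"  # hypothetical local test file

with pdfplumber.open(pdf_path) as pdf:
    meta = pdf.metadata or {}
    # First-page text drives all of the fallback heuristics.
    first_text = (pdf.pages[0].extract_text() or "") if pdf.pages else ""

title = meta.get("Title", "").strip() or (first_text.split("\n")[0] if first_text else "Untitled Document")
emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_text)

print("Title:", title)
print("Emails:", ", ".join(emails) if emails else "No emails found")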
 
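The reasoning behind document_chunks.insert(0, metadata_doc) is that questions about a paper's title or author rarely match body chunks semantically, so the metadata gets its own retrievable chunk. A minimal sketch of that pattern against the same Chroma settings as the app; the document texts and query below are illustrative, not taken from the commit:

# Sketch: store document metadata as its own retrievable chunk.
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/modernbert-embed-base",
    model_kwargs={"device": "cpu"},
)

# Mirrors what the app builds from extract_metadata's output (content is illustrative).
metadata_doc = Document(
    page_content="Title: Example Paper\nAuthor: Jane Doe\nEmails: jane@example.org",
    metadata={"source": "metadata"},
)
body_doc = Document(page_content="Section 1. We study ...", metadata={"source": "body"})

store = Chroma(
    collection_name="deepseek_collection",
    collection_metadata={"hnsw:space": "cosine"},
    embedding_function=embedding_model,
    persist_directory="/mnt/data/chroma_db",  # same persistent directory as the app
)
store.add_documents([metadata_doc, body_doc])

# A title/author question should now surface the metadata chunk first.
print(store.similarity_search("Who is the author of this paper?", k=1)[0].page_content)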