DrishtiSharma committed (verified)
Commit 7f19084 · 1 Parent(s): c576780

Update lab/title_issue_attempt1.py

Files changed (1)
  1. lab/title_issue_attempt1.py +87 -42
lab/title_issue_attempt1.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 import os
+import json
 import requests
 import pdfplumber
 import chromadb
@@ -14,8 +15,7 @@ from langchain_groq import ChatGroq
 from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth
 
 # ----------------- Streamlit UI Setup -----------------
-st.set_page_config(page_title="Blah", layout="centered")
-st.title("Blah-1")
+st.set_page_config(page_title="Blah-1", layout="centered")
 
 # ----------------- API Keys -----------------
 os.environ["GROQ_API_KEY"] = st.secrets.get("GROQ_API_KEY", "")
@@ -30,7 +30,6 @@ rag_llm.verbose = True
 # Clear ChromaDB cache to fix tenant issue
 chromadb.api.client.SharedSystemClient.clear_system_cache()
 
-st.title("Blah")
 
 # ----------------- ChromaDB Persistent Directory -----------------
 CHROMA_DB_DIR = "/mnt/data/chroma_db"
@@ -48,34 +47,80 @@ if "processed_chunks" not in st.session_state:
 if "vector_store" not in st.session_state:
     st.session_state.vector_store = None
 
-# ----------------- Improved Metadata Extraction -----------------
+
+# ----------------- Text Cleaning Functions -----------------
+def clean_extracted_text(text):
+    """
+    Cleans extracted PDF text by removing excessive line breaks, fixing spacing issues, and resolving OCR artifacts.
+    """
+    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
+    text = re.sub(r'\s{2,}', ' ', text)  # Remove extra spaces
+    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # Fix hyphenated words split by a newline
+    return text.strip()
+
+def extract_title_manually(text):
+    """
+    Attempts to find the title by checking the first few lines.
+    - Titles are usually long enough (more than 5 words).
+    - Ignores common header text like "Abstract", "Introduction".
+    """
+    lines = text.split("\n")
+    ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
+
+    for line in lines[:5]:  # Check only the first 5 lines
+        clean_line = line.strip()
+        if len(clean_line.split()) > 5 and not any(word.lower() in clean_line.lower() for word in ignore_keywords):
+            return clean_line  # Return first valid title
+    return "Unknown"
+
+# ----------------- Metadata Extraction -----------------
+# ----------------- Metadata Extraction -----------------
 def extract_metadata(pdf_path):
-    """Extracts title, author, emails, and affiliations from PDF."""
+    """Extracts metadata using simple heuristics without LLM."""
+
     with pdfplumber.open(pdf_path) as pdf:
-        metadata = pdf.metadata or {}
-
-        # Extract title
-        title = metadata.get("Title", "").strip()
-        if not title and pdf.pages:
-            text = pdf.pages[0].extract_text()
-            title_match = re.search(r"(?i)title[:\-]?\s*(.*)", text or "")
-            title = title_match.group(1) if title_match else text.split("\n")[0] if text else "Untitled Document"
-
-        # Extract author
-        author = metadata.get("Author", "").strip()
-        if not author and pdf.pages:
-            author_match = re.search(r"(?i)by\s+([A-Za-z\s,]+)", pdf.pages[0].extract_text() or "")
-            author = author_match.group(1).strip() if author_match else "Unknown Author"
-
-        # Extract emails
-        emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", pdf.pages[0].extract_text() or "")
-        email_str = ", ".join(emails) if emails else "No emails found"
+        if not pdf.pages:
+            return {
+                "Title": "Unknown",
+                "Author": "Unknown",
+                "Emails": "No emails found",
+                "Affiliations": "No affiliations found"
+            }
+
+        # Extract text from the first page
+        first_page_text = pdf.pages[0].extract_text() or "No text found."
+        cleaned_text = clean_extracted_text(first_page_text)
+
+        # Extract Title
+        pre_extracted_title = extract_title_manually(cleaned_text)
+
+        # Extract Authors (Names typically appear before affiliations)
+        author_pattern = re.compile(r"([\w\-\s]+,\s?)+[\w\-\s]+")
+        authors = "Unknown"
+        for line in cleaned_text.split("\n"):
+            match = author_pattern.search(line)
+            if match:
+                authors = match.group(0)
+                break
+
+        # Extract Emails
+        email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
+        emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
+
+        # Extract Affiliations (usually below author names)
+        affiliations = "Unknown"
+        for i, line in enumerate(cleaned_text.split("\n")):
+            if "@" in line:  # Email appears before affiliations
+                affiliations = cleaned_text.split("\n")[i + 1] if i + 1 < len(cleaned_text.split("\n")) else "Unknown"
+                break
+
+        return {
+            "Title": pre_extracted_title,
+            "Author": authors,
+            "Emails": emails,
+            "Affiliations": affiliations
+        }
 
-        # Extract affiliations
-        affiliations = re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", pdf.pages[0].extract_text() or "")
-        affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
-
-        return title, author, email_str, affiliation_str
 
 # ----------------- Step 1: Choose PDF Source -----------------
 pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
@@ -118,22 +163,25 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
     st.json(docs[0].metadata)
 
     # Extract metadata
-    title, author, email_str, affiliation_str = extract_metadata(st.session_state.pdf_path)
-
-    # Display extracted metadata
-    st.subheader("📄 Extracted Document Metadata")
-    st.write(f"**Title:** {title}")
-    st.write(f"**Author:** {author}")
-    st.write(f"**Emails:** {email_str}")
-    st.write(f"**Affiliations:** {affiliation_str}")
+    metadata = extract_metadata(st.session_state.pdf_path)
+
+    # Display extracted-metadata
+    if isinstance(metadata, dict):
+        st.subheader("📄 Extracted Document Metadata")
+        st.write(f"**Title:** {metadata.get('Title', 'Unknown')}")
+        st.write(f"**Author:** {metadata.get('Author', 'Unknown')}")
+        st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
+        st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
+    else:
+        st.error("Metadata extraction failed.")
 
     # Embedding Model
     model_name = "nomic-ai/modernbert-embed-base"
     embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
 
     # Convert metadata into a retrievable chunk
-    metadata_text = f"Title: {title}\nAuthor: {author}\nEmails: {email_str}\nAffiliations: {affiliation_str}"
-    metadata_doc = {"page_content": metadata_text, "metadata": {"source": "metadata"}}
+    metadata_doc = {"page_content": metadata, "metadata": {"source": "metadata"}}
+
 
     # Prevent unnecessary re-chunking
     if not st.session_state.chunked:
@@ -191,9 +239,6 @@ if query:
     st.markdown("### Extracted Relevant Contexts")
     st.json(contexts["relevant_contexts"])
 
-    st.markdown("### RAG Final Response")
-    st.write(final_response["final_response"])
-
    st.subheader("context_relevancy_evaluation_chain Statement")
     st.json(final_response["relevancy_response"])
 
@@ -204,4 +249,4 @@ if query:
     st.json(final_response["relevant_contexts"])
 
     st.subheader("RAG Response Statement")
-    st.json(final_response["final_response"])
+    st.json(final_response["final_response"])
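
The two helpers introduced by this commit are pure string utilities, so they can be sanity-checked outside Streamlit. A minimal sketch follows: the helper bodies and the email regex are copied from the new version of the file, while the sample first-page text and the printed results are hypothetical illustrations, not part of the commit.

import re

def clean_extracted_text(text):
    text = re.sub(r'\n+', '\n', text)  # Remove excessive newlines
    text = re.sub(r'\s{2,}', ' ', text)  # Remove extra spaces
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # Fix hyphenated words split by a newline
    return text.strip()

def extract_title_manually(text):
    lines = text.split("\n")
    ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
    for line in lines[:5]:  # Check only the first 5 lines
        clean_line = line.strip()
        if len(clean_line.split()) > 5 and not any(word.lower() in clean_line.lower() for word in ignore_keywords):
            return clean_line  # Return first valid title
    return "Unknown"

# Hypothetical first page, with a hyphenated line break as pdfplumber might return it
sample = ("A Study of Retrieval-Augmented Generation for Scien-\n"
          "tific PDFs\n"
          "Jane Doe, John Smith\n"
          "Department of Computer Science, Example University\n"
          "jane.doe@example.edu\n"
          "Abstract\n")

cleaned = clean_extracted_text(sample)
print(extract_title_manually(cleaned))
# -> A Study of Retrieval-Augmented Generation for Scientific PDFs

email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
print(", ".join(email_pattern.findall(cleaned)) or "No emails found")
# -> jane.doe@example.edu

The hyphen fix runs after whitespace collapsing, so "Scien-\ntific" is re-joined into "Scientific" and the title line passes the length-and-keyword filter in extract_title_manually.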