Spaces:
Running
Running
File size: 9,476 Bytes
3eed450 e9c51d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import os
import re
from PyPDF2 import PdfReader, PdfWriter
import streamlit as st
from config import keywords_dict, stop_keywords, anti_keywords
def find_cover(uploaded_file):
    """
    Extract the first page of a PDF and save it to a temporary file.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).

    Returns:
        str | None: Path to the temporary one-page PDF in the current
        working directory, or None if no file was uploaded or an error
        occurred (an st.warning/st.error is shown in those cases).
    """
    section_title = "cover"
    # Guard clause: nothing to do without an uploaded file.
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None
    try:
        # Read the PDF and copy only its first page into a fresh writer.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])
        # NOTE: the original wrapped this in os.path.join() with a single
        # argument, which is a no-op — the file lands in the CWD either way.
        temp_cover_page_path = f"temp_{section_title}_1.pdf"
        with open(temp_cover_page_path, "wb") as f:
            pdf_writer.write(f)
        # Return the path to the temporary file
        return temp_cover_page_path
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None
def find_underwriter(uploaded_file):
    """
    Search a PDF for pages matching the 'underwriter' keyword sets and
    extract the first matching page to a temporary file.

    Keyword sets are tried in priority order: every page is checked against
    the first set before the second set is considered (same semantics as
    the original nested loop).

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).

    Returns:
        str | None: Path to the temporary single-page PDF, or None if no
        keywords are configured, no page matched, no file was uploaded, or
        an error occurred.
    """
    section_name = "underwriter"
    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return None
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        # Extract each page's text exactly once up front. The original
        # re-ran page.extract_text() (the expensive step) for every
        # keyword set on every page — O(sets * pages) extractions.
        candidates = [
            (page_num, page, page.extract_text())
            for page_num, page in enumerate(
                pdf_reader.pages[start_page:], start=start_page + 1
            )
        ]
        # Loop through the keyword sets in priority order.
        for keyword_set in keyword_sets:
            for page_num, page, text in candidates:
                # Check if any keyword in the set is found on the page
                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    # Save the matched page to a temporary file
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)
                    # single-arg os.path.join was a no-op; file lands in CWD
                    temp_page_path = f"temp_{section_name}_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)
                    # Return the path of the extracted page
                    return temp_page_path
        st.warning(f"No pages contain the specified keywords for {section_name}.")
        return None
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None
def find_financial(uploaded_file, section_name):
    """
    Extracts and displays sections of a PDF based on keyword matches.

    Scans the second half of the PDF for a page matching one of the
    section's keyword sets (ALL keywords in a set must appear), then keeps
    extracting consecutive pages (up to 3) until a stop-keyword set
    matches. A page that also matches an anti-keyword set is rolled back
    (removed from the output) before saving.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").

    Returns:
        bool: True if processing completed without interruptions; False if stopped or an error occurred.
        NOTE(review): despite the docstring above, a str path to the saved
        temp PDF is also returned on a successful extraction — callers
        appear to rely on the truthiness of the result; confirm upstream.
    """
    if uploaded_file:
        try:
            pdf_reader = PdfReader(uploaded_file)
            total_pages = len(pdf_reader.pages)
            # Step 1: Start from the second half of the PDF
            start_page = total_pages // 2
            pages = pdf_reader.pages[start_page:]
            # Keyword configuration for this section (empty lists if absent).
            section_keywords = keywords_dict.get(section_name, [])
            section_stop_keywords = stop_keywords.get(section_name, [])
            section_anti_keywords = anti_keywords.get(section_name, [])
            pdf_writer = PdfWriter()  # Writer for the extracted pages
            extraction_started = False  # Flag to check if extraction has started
            extraction_start_page = None  # Track the starting page number
            pages_extracted = 0  # Counter for extracted pages
            # page_num is 1-based within the whole document, not the slice.
            for page_num, page in enumerate(pages, start=start_page + 1):
                text = page.extract_text()
                # Step 2: Find the keywords within the keywords_dict
                if not extraction_started:
                    for keyword_set in section_keywords:
                        # A set matches only when ALL of its keywords appear.
                        if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                            pdf_writer.add_page(page)
                            pages_extracted += 1
                            extraction_start_page = page_num  # Set the starting page number
                            # Check for stop keywords on the same page
                            if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
                                   for stop_set in section_stop_keywords):
                                # Check for anti-keywords before stopping
                                if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
                                       for anti_set in section_anti_keywords):
                                    pdf_writer.pages.pop()  # Remove the last added page
                                    pages_extracted -= 1
                                # Save and display the extracted pages (if any)
                                if len(pdf_writer.pages) > 0:
                                    temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
                                    with open(temp_section_path, "wb") as f:
                                        pdf_writer.write(f)
                                    return temp_section_path
                                else:
                                    st.warning(f"No pages matched the criteria for {section_name}.")
                                    # Stop extraction immediately and signal to stop all processing
                                    return False
                            else:
                                # Continue extraction
                                extraction_started = True
                            # First matching keyword set wins; stop trying others.
                            break
                elif extraction_started:
                    # Check if we've reached the 3-page limit
                    if pages_extracted >= 3:
                        if len(pdf_writer.pages) > 0:
                            # page_num - 1: the current page was NOT added.
                            temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num-1}.pdf")
                            with open(temp_section_path, "wb") as f:
                                pdf_writer.write(f)
                            return temp_section_path
                        return False
                    # Step 3: Add the page to the output
                    pdf_writer.add_page(page)
                    pages_extracted += 1
                    # Step 4: Check for stop keywords
                    if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
                           for stop_set in section_stop_keywords):
                        # Step 5: After stopping, check for anti-keywords
                        if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
                               for anti_set in section_anti_keywords):
                            pdf_writer.pages.pop()  # Remove the last added page
                            pages_extracted -= 1
                        # Save and display the extracted pages (if any)
                        if len(pdf_writer.pages) > 0:
                            temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
                            with open(temp_section_path, "wb") as f:
                                pdf_writer.write(f)
                            return temp_section_path
                        else:
                            st.warning(f"No pages matched the criteria for {section_name}.")
                            # Stop extraction and signal to stop all processing
                            return False
            # If extraction finished without hitting stop keywords, save and display the pages
            if len(pdf_writer.pages) > 0:
                temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
                with open(temp_section_path, "wb") as f:
                    pdf_writer.write(f)
                return temp_section_path
            else:
                st.warning(f"No pages matched the criteria for {section_name}.")
            # Indicate that processing can continue
            return True
        except Exception as e:
            st.error(f"An error occurred while processing the PDF: {e}")
            # Stop processing due to an error
            return False
    else:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False