Spaces:
Running
Running
File size: 9,476 Bytes
3eed450 e9c51d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
import os
import re
from PyPDF2 import PdfReader, PdfWriter
import streamlit as st
from config import keywords_dict, stop_keywords, anti_keywords
def find_cover(uploaded_file):
    """
    Extract the first page of a PDF and save it to a temporary file.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).

    Returns:
        str | None: Path to the temporary one-page PDF in the current
        working directory, or None if no file was uploaded or an error
        occurred (an st.warning/st.error is shown in those cases).
    """
    section_title = "cover"
    # Guard clause: nothing to do without an uploaded file.
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None
    try:
        # Read the PDF and copy only its first page into a fresh writer.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])
        # NOTE: the original wrapped this in os.path.join() with a single
        # argument, which is a no-op — the file lands in the CWD either way.
        temp_cover_page_path = f"temp_{section_title}_1.pdf"
        with open(temp_cover_page_path, "wb") as f:
            pdf_writer.write(f)
        # Return the path to the temporary file
        return temp_cover_page_path
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None
def find_underwriter(uploaded_file):
    """
    Search a PDF for pages matching the 'underwriter' keyword sets and
    extract the first matching page to a temporary file.

    Keyword sets are tried in priority order: every page is checked against
    the first set before the second set is considered (same semantics as
    the original nested loop).

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).

    Returns:
        str | None: Path to the temporary single-page PDF, or None if no
        keywords are configured, no page matched, no file was uploaded, or
        an error occurred.
    """
    section_name = "underwriter"
    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return None
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        # Extract each page's text exactly once up front. The original
        # re-ran page.extract_text() (the expensive step) for every
        # keyword set on every page — O(sets * pages) extractions.
        candidates = [
            (page_num, page, page.extract_text())
            for page_num, page in enumerate(
                pdf_reader.pages[start_page:], start=start_page + 1
            )
        ]
        # Loop through the keyword sets in priority order.
        for keyword_set in keyword_sets:
            for page_num, page, text in candidates:
                # Check if any keyword in the set is found on the page
                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    # Save the matched page to a temporary file
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)
                    # single-arg os.path.join was a no-op; file lands in CWD
                    temp_page_path = f"temp_{section_name}_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)
                    # Return the path of the extracted page
                    return temp_page_path
        st.warning(f"No pages contain the specified keywords for {section_name}.")
        return None
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        return None
def find_financial(uploaded_file, section_name):
    """
    Extracts and displays sections of a PDF based on keyword matches.

    Scans the second half of the PDF for a page matching one of the
    section's keyword sets (ALL keywords in a set must appear), then keeps
    extracting consecutive pages (up to 3) until a stop-keyword set
    matches. A page that also matches an anti-keyword set is rolled back
    (removed from the output) before saving.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").

    Returns:
        bool: True if processing completed without interruptions; False if stopped or an error occurred.
        NOTE(review): despite the docstring above, a str path to the saved
        temp PDF is also returned on a successful extraction — callers
        appear to rely on the truthiness of the result; confirm upstream.
    """
    if uploaded_file:
        try:
            pdf_reader = PdfReader(uploaded_file)
            total_pages = len(pdf_reader.pages)
            # Step 1: Start from the second half of the PDF
            start_page = total_pages // 2
            pages = pdf_reader.pages[start_page:]
            # Keyword configuration for this section (empty lists if absent).
            section_keywords = keywords_dict.get(section_name, [])
            section_stop_keywords = stop_keywords.get(section_name, [])
            section_anti_keywords = anti_keywords.get(section_name, [])
            pdf_writer = PdfWriter()  # Writer for the extracted pages
            extraction_started = False  # Flag to check if extraction has started
            extraction_start_page = None  # Track the starting page number
            pages_extracted = 0  # Counter for extracted pages
            # page_num is 1-based within the whole document, not the slice.
            for page_num, page in enumerate(pages, start=start_page + 1):
                text = page.extract_text()
                # Step 2: Find the keywords within the keywords_dict
                if not extraction_started:
                    for keyword_set in section_keywords:
                        # A set matches only when ALL of its keywords appear.
                        if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                            pdf_writer.add_page(page)
                            pages_extracted += 1
                            extraction_start_page = page_num  # Set the starting page number
                            # Check for stop keywords on the same page
                            if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
                                   for stop_set in section_stop_keywords):
                                # Check for anti-keywords before stopping
                                if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
                                       for anti_set in section_anti_keywords):
                                    pdf_writer.pages.pop()  # Remove the last added page
                                    pages_extracted -= 1
                                # Save and display the extracted pages (if any)
                                if len(pdf_writer.pages) > 0:
                                    temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
                                    with open(temp_section_path, "wb") as f:
                                        pdf_writer.write(f)
                                    return temp_section_path
                                else:
                                    st.warning(f"No pages matched the criteria for {section_name}.")
                                    # Stop extraction immediately and signal to stop all processing
                                    return False
                            else:
                                # Continue extraction
                                extraction_started = True
                            # First matching keyword set wins; stop trying others.
                            break
                elif extraction_started:
                    # Check if we've reached the 3-page limit
                    if pages_extracted >= 3:
                        if len(pdf_writer.pages) > 0:
                            # page_num - 1: the current page was NOT added.
                            temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num-1}.pdf")
                            with open(temp_section_path, "wb") as f:
                                pdf_writer.write(f)
                            return temp_section_path
                        return False
                    # Step 3: Add the page to the output
                    pdf_writer.add_page(page)
                    pages_extracted += 1
                    # Step 4: Check for stop keywords
                    if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
                           for stop_set in section_stop_keywords):
                        # Step 5: After stopping, check for anti-keywords
                        if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
                               for anti_set in section_anti_keywords):
                            pdf_writer.pages.pop()  # Remove the last added page
                            pages_extracted -= 1
                        # Save and display the extracted pages (if any)
                        if len(pdf_writer.pages) > 0:
                            temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
                            with open(temp_section_path, "wb") as f:
                                pdf_writer.write(f)
                            return temp_section_path
                        else:
                            st.warning(f"No pages matched the criteria for {section_name}.")
                            # Stop extraction and signal to stop all processing
                            return False
            # If extraction finished without hitting stop keywords, save and display the pages
            if len(pdf_writer.pages) > 0:
                temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
                with open(temp_section_path, "wb") as f:
                    pdf_writer.write(f)
                return temp_section_path
            else:
                st.warning(f"No pages matched the criteria for {section_name}.")
            # Indicate that processing can continue
            return True
        except Exception as e:
            st.error(f"An error occurred while processing the PDF: {e}")
            # Stop processing due to an error
            return False
    else:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False