File size: 9,476 Bytes
3eed450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e9c51d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import os
import re
from PyPDF2 import PdfReader, PdfWriter
import streamlit as st
from config import keywords_dict, stop_keywords, anti_keywords

def find_cover(uploaded_file):
    """
    Extract the first page of a PDF and save it to a temporary file.

    Parameters:
        uploaded_file: The uploaded PDF file; it is passed directly to
            PdfReader. A falsy value is treated as "nothing uploaded yet".

    Returns:
        str | None: Path to the temporary file containing the first page of
        the PDF, or None when no file was supplied or the PDF could not be
        processed (a Streamlit warning/error is shown in those cases).
    """
    section_title = "cover"

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None

    try:
        # Read the PDF and take its first page (raises on an empty document,
        # which is reported through the handler below).
        pdf_reader = PdfReader(uploaded_file)
        first_page = pdf_reader.pages[0]

        # Stage the page in the writer BEFORE opening the output file, so a
        # failure here does not leave an empty temp file on disk.
        pdf_writer = PdfWriter()
        pdf_writer.add_page(first_page)

        temp_cover_page_path = f"temp_{section_title}_1.pdf"
        with open(temp_cover_page_path, "wb") as f:
            pdf_writer.write(f)

        return temp_cover_page_path
    except Exception as e:
        # UI boundary: surface any PDF problem to the user instead of crashing.
        st.error(f"An error occurred while processing the PDF: {e}")
        return None


def find_underwriter(uploaded_file):
    """
    Find the first page matching the 'underwriter' keywords and save it to a temporary file.

    The first third of the document is skipped. Keyword sets from
    ``keywords_dict["underwriter"]`` are tried in order; for each set the
    remaining pages are scanned front to back, and the first page on which
    ANY pattern of the set matches (case-insensitive regex search) is
    written out as a single-page PDF.

    Parameters:
        uploaded_file: The uploaded PDF file; it is passed directly to
            PdfReader. A falsy value is treated as "nothing uploaded yet".

    Returns:
        str | None: Path to the temporary file containing the matched page,
        or None when no keywords are configured, no file was supplied, no
        page matched, or the PDF could not be processed (a Streamlit
        warning/error is shown in those cases).
    """
    section_name = "underwriter"

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return None

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return None

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        pages = pdf_reader.pages[start_page:]

        # Text extraction is the expensive step; cache it per page so that
        # later keyword sets reuse the text instead of re-extracting it.
        # The cache is filled lazily to keep the early return cheap.
        page_texts = {}

        # Earlier keyword sets take priority over earlier pages.
        for keyword_set in keyword_sets:
            for page_num, page in enumerate(pages, start=start_page + 1):
                if page_num not in page_texts:
                    # Guard against a page with no extractable text.
                    page_texts[page_num] = page.extract_text() or ""
                text = page_texts[page_num]

                # Check if any keyword in the set is found on the page
                if not any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    continue

                # Save the matched page to a temporary file
                pdf_writer = PdfWriter()
                pdf_writer.add_page(page)

                temp_page_path = f"temp_{section_name}_{page_num}.pdf"
                with open(temp_page_path, "wb") as f:
                    pdf_writer.write(f)

                return temp_page_path

        st.warning(f"No pages contain the specified keywords for {section_name}.")
        return None
    except Exception as e:
        # UI boundary: surface any PDF problem to the user instead of crashing.
        st.error(f"An error occurred while processing the PDF: {e}")
        return None

# Maximum number of consecutive pages extracted for one financial section.
_MAX_SECTION_PAGES = 3


def _matches_any_keyword_set(text, keyword_sets):
    """Return True if ALL patterns of at least one set in *keyword_sets* are found in *text*."""
    return any(
        all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set)
        for keyword_set in keyword_sets
    )


def _save_section_pages(section_name, first_page_num, pages):
    """Write consecutive *pages* (starting at 1-based *first_page_num*) to a temp PDF and return its path."""
    last_page_num = first_page_num + len(pages) - 1

    pdf_writer = PdfWriter()
    for page in pages:
        pdf_writer.add_page(page)

    temp_section_path = f"temp_{section_name}_{first_page_num}-{last_page_num}.pdf"
    with open(temp_section_path, "wb") as f:
        pdf_writer.write(f)
    return temp_section_path


def find_financial(uploaded_file, section_name):
    """
    Extract a financial section of a PDF based on keyword matches.

    Only the second half of the document is scanned. Extraction starts on the
    first page where all patterns of some start keyword set (from
    ``keywords_dict``) match, and then collects consecutive pages until one of:

    * a page matches a stop keyword set (from ``stop_keywords``) -- that page
      is kept unless it also matches an anti-keyword set (from
      ``anti_keywords``), in which case it is dropped;
    * three pages have been collected and another page follows;
    * the document ends.

    All keywords are treated as case-insensitive regular expressions.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").

    Returns:
        str | bool:
            * str -- path of the temporary PDF holding the extracted pages,
              named ``temp_<section_name>_<first>-<last>.pdf`` with 1-based
              page numbers;
            * True -- the document was scanned to the end and no page matched
              (a warning is shown);
            * False -- no file was uploaded, an error occurred, or the stop
              page was reached but no page was left to save.
    """
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Step 1: Start from the second half of the PDF
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        # Pages are collected in a plain list and only written out at save
        # time: the writer's page collection cannot be relied on to support
        # removing a page once it has been added.
        selected_pages = []
        extraction_start_page = None  # 1-based number of the first kept page

        for page_num, page in enumerate(pages, start=start_page + 1):
            # Guard against a page with no extractable text.
            text = page.extract_text() or ""

            if extraction_start_page is None:
                # Step 2: Look for the page that opens the section
                if not _matches_any_keyword_set(text, section_keywords):
                    continue
                extraction_start_page = page_num
            elif len(selected_pages) >= _MAX_SECTION_PAGES:
                # Page limit reached: save what we have, without this page
                return _save_section_pages(section_name, extraction_start_page, selected_pages)

            # Step 3: Add the page to the output
            selected_pages.append(page)

            # Step 4: Check for stop keywords (also on the opening page itself)
            if _matches_any_keyword_set(text, section_stop_keywords):
                # Step 5: A stop page that also matches anti-keywords is not
                # part of the section, so drop it again
                if _matches_any_keyword_set(text, section_anti_keywords):
                    selected_pages.pop()

                if selected_pages:
                    return _save_section_pages(section_name, extraction_start_page, selected_pages)

                st.warning(f"No pages matched the criteria for {section_name}.")
                # Stop extraction and signal to stop all processing
                return False

        # Reached the end of the document without hitting stop keywords
        if selected_pages:
            return _save_section_pages(section_name, extraction_start_page, selected_pages)

        st.warning(f"No pages matched the criteria for {section_name}.")
        # Indicate that processing can continue
        return True

    except Exception as e:
        # UI boundary: surface any PDF problem to the user instead of crashing.
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False