kritsadaK committed on
Commit
07804ad
·
verified ·
1 Parent(s): d2242ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -229
app.py CHANGED
@@ -1,234 +1,15 @@
1
- import os
2
  import streamlit as st
3
- import spacy
4
- import pandas as pd
5
- import re
6
- from transformers import pipeline
7
 
8
- # Ensure Streamlit is properly initialized
9
- st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
10
 
11
- # Disable GPU if not needed
12
- os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
13
 
14
- # Initialize session state variables **before using them**
15
- if "pdf_text" not in st.session_state:
16
- st.session_state["pdf_text"] = "" # Default to an empty string
17
- if "pdf_tables" not in st.session_state:
18
- st.session_state["pdf_tables"] = [] # Default to an empty list
19
- if "nlp" not in st.session_state:
20
- st.session_state["nlp"] = None
21
- if "summarizer" not in st.session_state:
22
- st.session_state["summarizer"] = None
23
 
24
- # Load the spaCy model once at the start
25
- try:
26
- st.session_state["nlp"] = spacy.load("en_core_web_sm")
27
- st.write("spaCy model loaded successfully.")
28
- except OSError:
29
- st.session_state["nlp"] = None
30
- st.write("Failed to load spaCy model.")
31
-
32
- # Load the summarization model from Hugging Face Model Hub
33
- try:
34
- online_model_path = "kritsadaK/bart-financial-summarization"
35
- st.session_state["summarizer"] = pipeline(
36
- "summarization",
37
- model=online_model_path,
38
- tokenizer=online_model_path
39
- )
40
- st.write("Online summarization model loaded successfully.")
41
- except Exception:
42
- st.session_state["summarizer"] = None
43
- st.write("Failed to load online summarization model.")
44
-
45
- # Now it's safe to access session state variables
46
- if st.session_state["pdf_text"]:
47
- st.text_area("Extracted Text", st.session_state["pdf_text"], height=400)
48
- else:
49
- st.warning("No text extracted yet. Upload a PDF to start.")
50
-
51
- # Define regex patterns to extract structured data
52
- patterns = {
53
- "Fund Name": r"^(.*?) Fund", # Extracts the name before "Fund"
54
- "CUSIP": r"CUSIP\s+(\d+)",
55
- "Inception Date": r"Inception Date\s+([\w\s\d]+)",
56
- "Benchmark": r"Benchmark\s+([\w\s\d]+)",
57
- "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
58
- "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
59
- "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
60
- "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
61
- "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
62
- "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)"
63
- }
64
-
65
- # Set the title and layout
66
- st.title("FinBrief: Financial Document Insights")
67
- st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
68
-
69
- # Custom styling
70
- st.markdown(
71
- """
72
- <style>
73
- .sidebar .sidebar-content {
74
- background-color: #f7f7f7;
75
- color: #333;
76
- }
77
- .css-1d391kg {
78
- background-color: #f0f4f8;
79
- }
80
- .stButton>button {
81
- background-color: #4CAF50;
82
- color: white;
83
- padding: 10px 20px;
84
- border-radius: 5px;
85
- font-size: 16px;
86
- }
87
- .stTextArea textarea {
88
- border: 2px solid #4CAF50;
89
- border-radius: 5px;
90
- padding: 10px;
91
- }
92
- </style>
93
- """,
94
- unsafe_allow_html=True,
95
- )
96
-
97
- # Function to extract text and tables using pdfplumber
98
- def extract_text_tables_pdfplumber(pdf_file):
99
- import io
100
- import pdfplumber
101
-
102
- print("\n🔹 PDFPlumber: Extracting text and tables...")
103
- with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
104
- all_text = ""
105
- all_tables = []
106
-
107
- for page in pdf.pages:
108
- page_text = page.extract_text()
109
- if page_text:
110
- all_text += page_text + "\n"
111
-
112
- # Extract tables
113
- tables = page.extract_tables()
114
- all_tables.extend(tables) # Store all tables
115
-
116
- if all_text.strip():
117
- print(all_text[:1000]) # Print first 1000 characters for verification
118
- return all_text, all_tables
119
- else:
120
- print("No text extracted. The PDF might be image-based.")
121
- return None, None
122
-
123
- # Step 0: Upload PDF
124
- st.sidebar.header("Upload Your Financial Document")
125
- uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
126
-
127
- if uploaded_file is not None:
128
- st.sidebar.write(f"You uploaded: {uploaded_file.name}")
129
-
130
- # Extract text and tables
131
- pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)
132
-
133
- if pdf_text is not None:
134
- # Store results in session state
135
- st.session_state["pdf_text"] = pdf_text
136
- st.session_state["pdf_tables"] = pdf_tables # Save tables separately
137
-
138
- st.sidebar.success("PDF uploaded and text extracted!")
139
- else:
140
- st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
141
- st.error("No text extracted from the uploaded PDF.")
142
-
143
- # Step 1: Display Extracted Text
144
- st.subheader("Extracted Text")
145
- if st.session_state["pdf_text"]:
146
- st.text_area("Document Text", st.session_state["pdf_text"], height=400)
147
- else:
148
- st.warning("No text extracted yet. Upload a PDF to start.")
149
-
150
- # Step 2: Display Extracted Tables
151
- st.subheader("Extracted Tables")
152
- if st.session_state["pdf_tables"]: # Check if tables exist
153
- for idx, table in enumerate(st.session_state["pdf_tables"]):
154
- st.write(f"Table {idx+1}")
155
- st.write(pd.DataFrame(table)) # Display tables as DataFrames
156
- else:
157
- st.info("No tables extracted.")
158
-
159
- # Retrieve variables from session state
160
- nlp = st.session_state["nlp"]
161
- summarizer = st.session_state["summarizer"]
162
- pdf_text = st.session_state["pdf_text"]
163
- pdf_tables = st.session_state["pdf_tables"]
164
-
165
- # Ensure that the models are loaded
166
- if nlp is None or summarizer is None:
167
- st.error("Models are not properly loaded. Please check your model paths and installation.")
168
- else:
169
- # Step 3: Named Entity Recognition (NER)
170
- st.subheader("NER Analysis")
171
-
172
- # Display full extracted text, not just first 1000 characters
173
- example_text = st.text_area(
174
- "Enter or paste text for analysis",
175
- height=400,
176
- value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
177
- )
178
-
179
- if st.button("Analyze"):
180
- # Ensure full extracted text is used for analysis
181
- text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()
182
-
183
- if text_for_analysis:
184
- with st.spinner("Analyzing text..."):
185
- # Extract structured financial data using regex (Now using full text)
186
- extracted_data = {
187
- key: (match.group(1) if match else "N/A")
188
- for key, pattern in patterns.items()
189
- if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
190
- }
191
-
192
- # Use spaCy to extract additional financial terms (Now using full text)
193
- doc = nlp(text_for_analysis)
194
- financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
195
-
196
- # Store extracted data in a structured dictionary
197
- structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
198
-
199
- # Display results
200
- st.write("Entities Found:")
201
- st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
202
-
203
- st.write("Structured Data Extracted:")
204
- st.write(pd.DataFrame([structured_data]))
205
-
206
- else:
207
- st.error("Please provide some text for analysis.")
208
-
209
- # Step 4: Summarization
210
- st.subheader("Summarization")
211
-
212
- # Display full extracted text, not just first 1000 characters
213
- input_text = st.text_area(
214
- "Enter text to summarize",
215
- height=400,
216
- value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
217
- )
218
-
219
- if st.button("Summarize"):
220
- # Ensure full extracted text is used for summarization
221
- text_to_summarize = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else input_text.strip()
222
-
223
- if text_to_summarize:
224
- with st.spinner("Generating summary..."):
225
- summary = summarizer(
226
- text_to_summarize,
227
- max_length=min(len(text_to_summarize.split()), 1024),
228
- min_length=100,
229
- do_sample=False
230
- )
231
- st.write("Summary:")
232
- st.success(summary[0]["summary_text"])
233
- else:
234
- st.error("Please provide text to summarize.")
 
 
1
  import streamlit as st
 
 
 
 
2
 
3
+ # Set the title of the app
4
+ st.title("My Simple Streamlit App")
5
 
6
+ # Add a text input
7
+ user_input = st.text_input("Enter some text:")
8
 
9
+ # Display user input
10
+ if user_input:
11
+ st.write(f"You entered: {user_input}")
 
 
 
 
 
 
12
 
13
+ # Add a button
14
+ if st.button("Click Me!"):
15
+ st.write("Button clicked!")