KKowenn committed
Commit fb39280 · verified · 1 Parent(s): 7f6a448

Upload 4 files

Files changed (4)
  1. README.md +9 -7
  2. app.py +427 -0
  3. requirements.txt +11 -0
  4. space.yaml +7 -0
README.md CHANGED
@@ -1,13 +1,15 @@
 ---
-title: Finbreif3
-emoji: 🏃
-colorFrom: indigo
-colorTo: green
+title: FinBrief
+emoji: 💵
+colorFrom: green
+colorTo: gray
 sdk: streamlit
-sdk_version: 1.43.2
 app_file: app.py
 pinned: false
-short_description: debugging
+license: mit
+short_description: Financial PDF Document Summarization web-App
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Install Rust
+RUN apt-get update && apt-get install -y cargo
app.py ADDED
@@ -0,0 +1,427 @@
+import streamlit as st
+import spacy
+import pandas as pd
+import re
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+import subprocess
+import os
+os.environ["TRANSFORMERS_CACHE"] = "/home/user/.cache/huggingface"
+os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
+os.environ["TORCH_HOME"] = "/home/user/.cache/torch"
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+
+import torch
+import nltk
+from nltk.tokenize import sent_tokenize
+import traceback
+
+# Set Streamlit page config
+st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
+
+try:
+    nlp = spacy.load("en_core_web_sm")
+    st.write("spaCy model loaded successfully!")
+    print("spaCy model loaded successfully!")
+except OSError:
+    st.write("Failed to load spaCy model. Attempting to install...")
+    print("Failed to load spaCy model. Attempting to install...")
+    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+    try:
+        nlp = spacy.load("en_core_web_sm")
+        st.write("spaCy model installed and loaded successfully!")
+        print("spaCy model installed and loaded successfully!")
+    except Exception as e:
+        st.write(f"Still failed to load spaCy model: {e}")
+        print(f"Still failed to load spaCy model: {e}")
+        nlp = None  # Mark spaCy as failed
+
+model_name = "kritsadaK/bart-financial-summarization"
+
+try:
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
+    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+    st.write("Hugging Face summarization model loaded successfully!")
+    print("Hugging Face summarization model loaded successfully!")
+except Exception as e:
+    st.write(f"Failed to load Hugging Face summarization model: {e}")
+    print(f"Failed to load Hugging Face summarization model: {e}")
+    summarizer = None  # Mark Hugging Face model as failed
+
+# Store models in Streamlit session state
+st.session_state["nlp"] = nlp
+st.session_state["summarizer"] = summarizer
+
+# UI: Show clear error messages if models failed
+if nlp is None:
+    st.error("The spaCy model failed to load. Ensure it is installed.")
+if summarizer is None:
+    st.error("The summarization model failed to load. Check the model path or internet connection.")
+
+st.title("FinBrief: Financial Document Insights")
+st.write("Upload a financial document for analysis.")
+
+
+# Initialize session state
+if "nlp" not in st.session_state:
+    st.session_state["nlp"] = nlp
+if "summarizer" not in st.session_state:
+    st.session_state["summarizer"] = summarizer
+
+# Set up NLTK data directory
+nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
+if not os.path.exists(nltk_data_dir):
+    os.makedirs(nltk_data_dir)
+nltk.data.path.append(nltk_data_dir)
+
+def download_nltk_punkt():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        st.write("NLTK 'punkt' tokenizer is already installed.")
+        print("NLTK 'punkt' tokenizer is already installed.")
+    except LookupError:
+        st.write("NLTK 'punkt' tokenizer not found. Attempting to download...")
+        print("NLTK 'punkt' tokenizer not found. Attempting to download...")
+        try:
+            nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
+            nltk.data.find('tokenizers/punkt')
+            st.write("NLTK 'punkt' tokenizer downloaded successfully.")
+            print("NLTK 'punkt' tokenizer downloaded successfully.")
+        except Exception as e:
+            st.error(f"NLTK 'punkt' tokenizer download failed: {e}")
+            print(f"NLTK 'punkt' tokenizer download failed: {e}")
+
+# Call the function at the beginning of the script
+download_nltk_punkt()
+
+# Debugging: Check session state initialization
+print(f"Session State - NLP: {st.session_state['nlp'] is not None}, Summarizer: {st.session_state['summarizer'] is not None}")
+
+# # Load the summarization model locally
+# try:
+#     local_model_path = "./local_models/bart-financial"
+#     summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
+#     st.write("Local summarization model loaded successfully!")
+# except Exception as e:
+#     summarizer = None  # Handle case where model is missing
+#     st.write("Failed to load local summarization model.")
+
+
+# Define regex patterns to extract structured data
+patterns = {
+    "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
+    "CUSIP": r"CUSIP\s+(\d+)",
+    "Inception Date": r"Inception Date\s+([\w\s\d]+)",
+    "Benchmark": r"Benchmark\s+([\w\s\d]+)",
+    "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
+    "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
+    "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
+    "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
+    "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
+    "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)"
+}
+
+# Set the title and layout
+st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
+
+# Custom styling (this remains unchanged)
+st.markdown(
+    """
+    <style>
+    .sidebar .sidebar-content {
+        background-color: #f7f7f7;
+        color: #333;
+    }
+    .css-1d391kg {
+        background-color: #f0f4f8;
+    }
+    .stButton>button {
+        background-color: #4CAF50;
+        color: white;
+        padding: 10px 20px;
+        border-radius: 5px;
+        font-size: 16px;
+    }
+    .stTextArea textarea {
+        border: 2px solid #4CAF50;
+        border-radius: 5px;
+        padding: 10px;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+
+# Function to extract text and tables using pdfplumber
+def extract_text_tables_pdfplumber(pdf_file):
+    import io
+    import pdfplumber
+
+    print("\nPDFPlumber: Extracting text and tables...")
+    with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
+        all_text = ""
+        all_tables = []
+
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                all_text += page_text + "\n"
+
+            # Extract tables
+            tables = page.extract_tables()
+            all_tables.extend(tables)  # Store all tables
+
+        if all_text.strip():
+            print(all_text[:1000])  # Print first 1000 characters for verification
+            return all_text, all_tables
+        else:
+            print("No text extracted. The PDF might be image-based.")
+            return None, None
+
+def split_text_into_chunks(text, tokenizer, max_tokens=1024):
+    sentences = nltk.sent_tokenize(text)
+    chunks = []
+    current_chunk = ''
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
+        sentence_length = len(sentence_tokens)
+
+        # If adding the next sentence exceeds the max_tokens limit
+        if current_length + sentence_length > max_tokens:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            # Start a new chunk
+            current_chunk = sentence
+            current_length = sentence_length
+        else:
+            current_chunk += ' ' + sentence
+            current_length += sentence_length
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
+
+def remove_duplicate_sentences(text):
+    sentences = nltk.sent_tokenize(text)
+    unique_sentences = []
+    seen_sentences = set()
+
+    for sentence in sentences:
+        # Normalize the sentence to ignore case and punctuation for comparison
+        normalized_sentence = sentence.strip().lower()
+        if normalized_sentence not in seen_sentences:
+            seen_sentences.add(normalized_sentence)
+            unique_sentences.append(sentence)
+
+    return ' '.join(unique_sentences)
+
+# Ensure session state is initialized
+if "pdf_text" not in st.session_state:
+    st.session_state["pdf_text"] = ""
+if "pdf_tables" not in st.session_state:
+    st.session_state["pdf_tables"] = []  # Initialize as an empty list
+
+# Step 0: Upload PDF
+st.sidebar.header("Upload Your Financial Document")
+uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+
+if uploaded_file is not None:
+    st.sidebar.write(f"You uploaded: {uploaded_file.name}")
+
+    # Extract text and tables
+    pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)
+
+    if pdf_text is not None:
+        # Store results in session state
+        st.session_state["pdf_text"] = pdf_text
+        st.session_state["pdf_tables"] = pdf_tables  # Save tables separately
+
+        st.sidebar.success("PDF uploaded and text extracted!")
+    else:
+        st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
+        st.error("No text extracted from the uploaded PDF.")
+
+# Step 1: Display Extracted Text
+st.subheader("Extracted Text")
+if st.session_state["pdf_text"]:
+    st.text_area("Document Text", st.session_state["pdf_text"], height=400)
+else:
+    st.warning("No text extracted yet. Upload a PDF to start.")
+
+
+# Step 2: Display Extracted Tables (Fixed Error)
+st.subheader("Extracted Tables")
+if st.session_state["pdf_tables"]:  # Check if tables exist
+    for idx, table in enumerate(st.session_state["pdf_tables"]):
+        st.write(f"Table {idx+1}")
+        st.write(pd.DataFrame(table))  # Display tables as DataFrames
+else:
+    st.info("No tables extracted.")
+
+# Retrieve variables from session state
+nlp = st.session_state["nlp"]
+summarizer = st.session_state["summarizer"]
+pdf_text = st.session_state["pdf_text"]
+pdf_tables = st.session_state["pdf_tables"]
+
+# Ensure that the models are loaded
+if nlp is None or summarizer is None:
+    st.error("Models are not properly loaded. Please check your model paths and installation.")
+else:
+    # Step 3: Named Entity Recognition (NER)
+    st.subheader("NER Analysis")
+
+    # Display full extracted text, not just first 1000 characters
+    example_text = st.text_area(
+        "Enter or paste text for analysis",
+        height=400,
+        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
+    )
+
+    if st.button("Analyze"):
+        # Ensure full extracted text is used for analysis
+        text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()
+
+        if text_for_analysis:
+            with st.spinner("Analyzing text..."):
+                # Extract structured financial data using regex (Now using full text)
+                extracted_data = {
+                    key: (match.group(1) if match else "N/A")
+                    for key, pattern in patterns.items()
+                    if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
+                }
+
+                # Use spaCy to extract additional financial terms (Now using full text)
+                doc = nlp(text_for_analysis)
+                financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
+
+                # Store extracted data in a structured dictionary
+                structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
+
+                # Display results
+                st.write("Entities Found:")
+                st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
+
+                st.write("Structured Data Extracted:")
+                st.write(pd.DataFrame([structured_data]))
+
+        else:
+            st.error("Please provide some text for analysis.")
+
+    # Step 4: Summarization
+    st.subheader("Summarization")
+    st.write("Generate concise summaries of financial documents.")
+
+    # Text summarization input
+    input_text = st.text_area(
+        "Enter text to summarize",
+        height=200,
+        value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
+    )
+
+    if st.button("Summarize"):
+        text_to_summarize = input_text.strip()
+        if text_to_summarize:
+            try:
+                # Display original text length
+                input_length = len(text_to_summarize.split())
+                st.write(f"Original text length: {input_length} words")
+
+                # Define the maximum number of tokens the model can handle
+                max_input_tokens = 1024  # BART's maximum input length
+
+                # Function to split text into chunks based on tokens (modified to avoid overlaps)
+                def split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens):
+                    sentences = nltk.sent_tokenize(text)
+                    chunks = []
+                    current_chunk = ''
+                    current_length = 0
+
+                    for sentence in sentences:
+                        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
+                        sentence_length = len(sentence_tokens)
+
+                        # If adding the sentence exceeds max_tokens, start a new chunk
+                        if current_length + sentence_length > max_tokens:
+                            if current_chunk:
+                                chunks.append(current_chunk.strip())
+                            current_chunk = sentence
+                            current_length = sentence_length
+                        else:
+                            current_chunk += ' ' + sentence
+                            current_length += sentence_length
+
+                    if current_chunk:
+                        chunks.append(current_chunk.strip())
+
+                    return chunks
+
+                # Function to remove duplicate sentences
+                def remove_duplicate_sentences(text):
+                    sentences = nltk.sent_tokenize(text)
+                    unique_sentences = []
+                    seen_sentences = set()
+
+                    for sentence in sentences:
+                        normalized_sentence = sentence.strip().lower()
+                        if normalized_sentence not in seen_sentences:
+                            seen_sentences.add(normalized_sentence)
+                            unique_sentences.append(sentence)
+
+                    return ' '.join(unique_sentences)
+
+                # Split the text into manageable chunks
+                chunks = split_text_into_chunks(text_to_summarize, tokenizer)
+                st.write(f"Text has been split into {len(chunks)} chunks.")
+
+                # Summarize each chunk
+                summaries = []
+                for i, chunk in enumerate(chunks):
+                    st.write(f"Summarizing chunk {i+1}/{len(chunks)}...")
+                    # Adjust summary length parameters as needed
+                    chunk_length = len(chunk.split())
+                    max_summary_length = min(150, chunk_length // 2)
+                    min_summary_length = max(50, max_summary_length // 2)
+
+                    try:
+                        summary_output = summarizer(
+                            chunk,
+                            max_length=max_summary_length,
+                            min_length=min_summary_length,
+                            do_sample=False,
+                            truncation=True
+                        )
+                        chunk_summary = summary_output[0]['summary_text'].strip()
+
+                        if not chunk_summary:
+                            st.warning(f"The summary for chunk {i+1} is empty.")
+                        else:
+                            summaries.append(chunk_summary)
+                            # Optionally display the summary of the current chunk
+                            # st.write(f"Summary of chunk {i+1}:")
+                            # st.write(chunk_summary)
+                            # st.write("---")
+
+                    except Exception as e:
+                        st.error(f"Summarization failed for chunk {i+1}: {e}")
+                        st.text(traceback.format_exc())
+                        continue
+
+                if summaries:
+                    # Combine summaries
+                    combined_summary = ' '.join(summaries)
+                    # Remove duplicate sentences
+                    final_summary = remove_duplicate_sentences(combined_summary)
+                    st.write("Final Summary:")
+                    st.success(final_summary)
+                else:
+                    st.error("No summaries were generated.")
+
+            except Exception as e:
+                st.error("An error occurred during summarization.")
+                st.text(traceback.format_exc())
+        else:
+            st.error("Please provide text to summarize.")
requirements.txt ADDED
@@ -0,0 +1,11 @@
+streamlit==1.37.1
+spacy==3.8.4
+pandas==2.2.2
+numpy==1.26.4
+transformers==4.48.1
+tokenizers==0.21.0
+pdfplumber==0.11.5
+flax==0.8.3
+huggingface-hub==0.29.1
+torch
+nltk==3.8.1
space.yaml ADDED
@@ -0,0 +1,7 @@
+---
+title: FinBrief
+python_version: 3.8.19
+sdk: streamlit
+app_file: app.py
+pinned: false
+license: mit