KKowenn committed on
Commit c84b881 · verified · 1 Parent(s): b0e3283

Update app.py

Files changed (1)
  1. app.py +226 -140
app.py CHANGED
@@ -14,6 +14,7 @@ import torch
 import nltk
 from nltk.tokenize import sent_tokenize
 import traceback
+from collections import Counter
 
 # Set Streamlit page config
 st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
@@ -61,7 +62,6 @@ if summarizer is None:
 st.title("FinBrief: Financial Document Insights")
 st.write("Upload a financial document for analysis.")
 
-
 # Initialize session state
 if "nlp" not in st.session_state:
     st.session_state["nlp"] = nlp
@@ -97,16 +97,6 @@ download_nltk_punkt()
 # Debugging: Check session state initialization
 print(f"Session State - NLP: {st.session_state['nlp'] is not None}, Summarizer: {st.session_state['summarizer'] is not None}")
 
-# # Load the summarization model locally
-# try:
-#     local_model_path = "./local_models/bart-financial"
-#     summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
-#     st.write("Local summarization model loaded successfully!")
-# except Exception as e:
-#     summarizer = None  # Handle case where model is missing
-#     st.write("Failed to load local summarization model.")
-
-
 # Define regex patterns to extract structured data
 patterns = {
     "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
@@ -177,7 +167,74 @@ def extract_text_tables_pdfplumber(pdf_file):
     else:
         print("No text extracted. The PDF might be image-based.")
         return None, None
-
+
+# NEW: Function to evaluate chunk relevance
+def evaluate_chunk_relevance(chunk, keywords=None):
+    """
+    Evaluate the relevance of a text chunk based on various factors.
+    Returns a score representing the chunk's relevance.
+    """
+    if not keywords:
+        # Default financial keywords
+        keywords = ["fund", "portfolio", "performance", "return", "asset", "investment",
+                    "expense", "risk", "benchmark", "allocation", "strategy", "market",
+                    "growth", "income", "dividend", "yield", "capital", "equity", "bond",
+                    "summary", "overview", "highlight", "key", "important", "significant"]
+
+    score = 0
+
+    # Factor 1: Length of the chunk (longer chunks often contain more information)
+    word_count = len(chunk.split())
+    score += min(word_count / 100, 5)  # Cap at 5 points
+
+    # Factor 2: Keyword presence
+    # Count keywords in lowercase text
+    lower_chunk = chunk.lower()
+    keyword_count = sum(1 for keyword in keywords if keyword.lower() in lower_chunk)
+    keyword_density = keyword_count / max(1, word_count) * 100
+    score += min(keyword_density * 2, 10)  # Cap at 10 points
+
+    # Factor 3: Presence of numbers (financial documents often contain important numbers)
+    number_count = len(re.findall(r'\d+\.?\d*%?', chunk))
+    score += min(number_count / 5, 5)  # Cap at 5 points
+
+    # Factor 4: Structured information (lists, tables, etc.)
+    bullet_count = len(re.findall(r'•|\*|-|–|[0-9]+\.', chunk))
+    score += min(bullet_count, 5)  # Cap at 5 points
+
+    # Factor 5: Presence of section headers
+    header_patterns = [
+        r'^[A-Z][A-Za-z\s]+:',  # Title followed by colon
+        r'^[A-Z][A-Z\s]+',      # ALL CAPS text
+        r'^\d+\.\s+[A-Z]'       # Numbered section
+    ]
+    header_count = sum(1 for pattern in header_patterns if re.search(pattern, chunk, re.MULTILINE))
+    score += min(header_count * 2, 5)  # Cap at 5 points
+
+    return score
+
+# NEW: Function to rank and select the best chunks
+def rank_and_select_chunks(chunks, max_chunks=5, keywords=None):
+    """
+    Rank chunks by relevance and return the top chunks.
+    """
+    # Evaluate each chunk
+    chunk_scores = [(chunk, evaluate_chunk_relevance(chunk, keywords)) for chunk in chunks]
+
+    # Sort chunks by score (highest first)
+    sorted_chunks = sorted(chunk_scores, key=lambda x: x[1], reverse=True)
+
+    # Select the top N chunks
+    top_chunks = [chunk for chunk, score in sorted_chunks[:max_chunks]]
+
+    # Print scores for debugging
+    print("Chunk scores:")
+    for i, (chunk, score) in enumerate(sorted_chunks):
+        print(f"Chunk {i+1}: Score {score:.2f}, Length {len(chunk.split())} words")
+        print(f"First 100 chars: {chunk[:100]}...")
+
+    return top_chunks
+
 def split_text_into_chunks(text, tokenizer, max_tokens=512):
     sentences = nltk.sent_tokenize(text)
     chunks = []
@@ -291,137 +348,166 @@ else:
         extracted_data = {
             key: (match.group(1) if match else "N/A")
             for key, pattern in patterns.items()
-            if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
-        }
-
-        # Use spaCy to extract additional financial terms (Now using full text)
-        doc = nlp(text_for_analysis)
-        financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
-
-        # Store extracted data in a structured dictionary
-        structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
-
-        # Display results
-        st.write("Entities Found:")
-        st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
-
-        st.write("Structured Data Extracted:")
-        st.write(pd.DataFrame([structured_data]))
-
-    else:
-        st.error("Please provide some text for analysis.")
-
-    # Step 4: Summarization
-    st.subheader("Summarization")
-    st.write("Generate concise summaries of financial documents.")
-
-    # Text summarization input
-    input_text = st.text_area(
-        "Enter text to summarize",
-        height=200,
-        value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
-    )
-
-    if st.button("Summarize"):
-        text_to_summarize = input_text.strip()
-        if text_to_summarize:
-            try:
-                # Display original text length
-                input_length = len(text_to_summarize.split())
-                st.write(f"Original text length: {input_length} words")
-
-                # Define the maximum number of tokens the model can handle
-                max_input_tokens = 512
-
-                # Function to split text into chunks based on tokens (modified to avoid overlaps)
-                def split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens):
-                    sentences = nltk.sent_tokenize(text)
-                    chunks = []
-                    current_chunk = ''
-                    current_length = 0
-
-                    for sentence in sentences:
-                        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
-                        sentence_length = len(sentence_tokens)
-
-                        # If adding the sentence exceeds max_tokens, start a new chunk
-                        if current_length + sentence_length > max_tokens:
-                            if current_chunk:
-                                chunks.append(current_chunk.strip())
-                            current_chunk = sentence
-                            current_length = sentence_length
-                        else:
-                            current_chunk += ' ' + sentence
-                            current_length += sentence_length
-
-                    if current_chunk:
-                        chunks.append(current_chunk.strip())
-
-                    return chunks
-
-                # Function to remove duplicate sentences
-                def remove_duplicate_sentences(text):
-                    sentences = nltk.sent_tokenize(text)
-                    unique_sentences = []
-                    seen_sentences = set()
-
-                    for sentence in sentences:
-                        normalized_sentence = sentence.strip().lower()
-                        if normalized_sentence not in seen_sentences:
-                            seen_sentences.add(normalized_sentence)
-                            unique_sentences.append(sentence)
-
-                    return ' '.join(unique_sentences)
-
-                # Split the text into manageable chunks
-                chunks = split_text_into_chunks(text_to_summarize, tokenizer)
-                st.write(f"Text has been split into {len(chunks)} chunks.")
-
-                # Summarize each chunk
-                summaries = []
-                for i, chunk in enumerate(chunks):
-                    st.write(f"Summarizing chunk {i+1}/{len(chunks)}...")
-                    # Adjust summary length parameters as needed
-                    chunk_length = len(chunk.split())
-                    max_summary_length = min(150, chunk_length // 2)
-                    min_summary_length = max(50, max_summary_length // 2)
-
-                    try:
-                        summary_output = summarizer(
-                            chunk,
-                            max_length=max_summary_length,
-                            min_length=min_summary_length,
-                            do_sample=False,
-                            truncation=True
-                        )
-                        chunk_summary = summary_output[0]['summary_text'].strip()
-
-                        if not chunk_summary:
-                            st.warning(f"The summary for chunk {i+1} is empty.")
-                        else:
-                            summaries.append(chunk_summary)
-                            # Optionally display the summary of the current chunk
-                            # st.write(f"Summary of chunk {i+1}:")
-                            # st.write(chunk_summary)
-                            # st.write("---")
-
-                    except Exception as e:
-                        st.error(f"Summarization failed for chunk {i+1}: {e}")
-                        st.text(traceback.format_exc())
-                        continue
-
-                if summaries:
-                    # Combine summaries
-                    combined_summary = ' '.join(summaries)
-                    # Remove duplicate sentences
-                    final_summary = remove_duplicate_sentences(combined_summary)
-                    st.write("Final Summary:")
-                    st.success(final_summary)
-                else:
-                    st.error("No summaries were generated.")
-
-            except Exception as e:
-                st.error("An error occurred during summarization.")
-                st.text(traceback.format_exc())
-        else:
-            st.error("Please provide text to summarize.")
+            if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
+        }
+
+        # Use spaCy to extract additional financial terms (Now using full text)
+        doc = nlp(text_for_analysis)
+        financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
+
+        # Store extracted data in a structured dictionary
+        structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
+
+        # Display results
+        st.write("Entities Found:")
+        st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
+
+        st.write("Structured Data Extracted:")
+        st.write(pd.DataFrame([structured_data]))
+
+    else:
+        st.error("Please provide some text for analysis.")
+
+    # Step 4: Summarization
+    st.subheader("Summarization")
+    st.write("Generate concise summaries of financial documents.")
+
+    # Add customization options for summarization with chunk selection
+    st.sidebar.header("Summarization Settings")
+    max_chunks_to_process = st.sidebar.slider(
+        "Max chunks to summarize",
+        min_value=1,
+        max_value=10,
+        value=3,
+        help="Select fewer chunks for faster processing but less comprehensive summaries"
+    )
+
+    # Allow users to add custom keywords
+    custom_keywords = st.sidebar.text_input(
+        "Add custom keywords (comma separated)",
+        value="",
+        help="Add domain-specific keywords to improve chunk selection"
+    )
+
+    # Text summarization input
+    input_text = st.text_area(
+        "Enter text to summarize",
+        height=200,
+        value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
+    )
+
+    # Add option to see chunk selection details
+    show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
+
+    if st.button("Summarize"):
+        text_to_summarize = input_text.strip()
+        if text_to_summarize:
+            try:
+                # Display original text length
+                input_length = len(text_to_summarize.split())
+                st.write(f"Original text length: {input_length} words")
+
+                # Process custom keywords if provided
+                keywords = None
+                if custom_keywords:
+                    keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
+                    st.write(f"Using custom keywords: {', '.join(keywords)}")
+
+                # Split the text into manageable chunks
+                chunks = split_text_into_chunks(text_to_summarize, tokenizer)
+                st.write(f"Text has been split into {len(chunks)} chunks.")
+
+                # NEW: Rank and select the best chunks instead of processing all of them
+                selected_chunks = rank_and_select_chunks(
+                    chunks,
+                    max_chunks=max_chunks_to_process,
+                    keywords=keywords
+                )
+
+                st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")
+
+                # Show chunk selection details if requested
+                if show_chunk_details:
+                    with st.expander("Chunk Selection Details"):
+                        for i, chunk in enumerate(selected_chunks):
+                            st.markdown(f"**Chunk {i+1}**")
+                            st.write(f"Length: {len(chunk.split())} words")
+                            st.text(chunk[:300] + "..." if len(chunk) > 300 else chunk)
+                            st.write("---")
+
+                # Summarize each selected chunk
+                summaries = []
+                with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
+                    for i, chunk in enumerate(selected_chunks):
+                        st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
+                        # Adjust summary length parameters as needed
+                        chunk_length = len(chunk.split())
+                        max_summary_length = min(150, chunk_length // 2)
+                        min_summary_length = max(50, max_summary_length // 2)
+
+                        try:
+                            summary_output = summarizer(
+                                chunk,
+                                max_length=max_summary_length,
+                                min_length=min_summary_length,
+                                do_sample=False,
+                                truncation=True
+                            )
+                            chunk_summary = summary_output[0]['summary_text'].strip()
+
+                            if not chunk_summary:
+                                st.warning(f"The summary for chunk {i+1} is empty.")
+                            else:
+                                summaries.append(chunk_summary)
+
+                        except Exception as e:
+                            st.error(f"Summarization failed for chunk {i+1}: {e}")
+                            st.text(traceback.format_exc())
+                            continue
+
+                if summaries:
+                    # Combine summaries and remove duplicates
+                    combined_summary = ' '.join(summaries)
+                    final_summary = remove_duplicate_sentences(combined_summary)
+
+                    # Calculate compression ratio
+                    summary_length = len(final_summary.split())
+                    compression_ratio = (1 - summary_length / input_length) * 100
+
+                    st.subheader("Final Summary")
+                    st.success(final_summary)
+                    st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
+
+                    # Display summary statistics
+                    st.subheader("Summary Statistics")
+                    stats_col1, stats_col2 = st.columns(2)
+                    with stats_col1:
+                        st.metric("Original Length", f"{input_length} words")
+                        st.metric("Total Chunks", str(len(chunks)))
+                    with stats_col2:
+                        st.metric("Summary Length", f"{summary_length} words")
+                        st.metric("Chunks Processed", str(len(selected_chunks)))
+
+                else:
+                    st.error("No summaries were generated.")
+
+            except Exception as e:
+                st.error("An error occurred during summarization.")
+                st.text(traceback.format_exc())
+        else:
+            st.error("Please provide text to summarize.")
+
+    # Add help information
+    st.sidebar.markdown("---")
+    with st.sidebar.expander("How Chunk Selection Works"):
+        st.markdown("""
+        The chunk selection algorithm ranks text chunks based on:
+
+        1. **Keyword density** - Presence of financial terms
+        2. **Length** - Longer chunks often contain more information
+        3. **Numbers** - Financial documents with numbers are often important
+        4. **Structure** - Lists and bullet points signal key information
+        5. **Headers** - Section headers often introduce important content
+
+        Adjust the settings above to customize the selection process.
+        """)