kritsadaK committed
Commit 9bd38f7 · verified · 1 Parent(s): 3b15327

Update app.py

Files changed (1)
  app.py  +183 -98
app.py CHANGED
@@ -14,6 +14,7 @@ import torch
 import nltk
 from nltk.tokenize import sent_tokenize
 import traceback
+from collections import Counter

 # Set Streamlit page config
 st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
@@ -61,7 +62,6 @@ if summarizer is None:
 st.title("FinBrief: Financial Document Insights")
 st.write("Upload a financial document for analysis.")

-
 # Initialize session state
 if "nlp" not in st.session_state:
     st.session_state["nlp"] = nlp
@@ -97,16 +97,6 @@ download_nltk_punkt()
 # Debugging: Check session state initialization
 print(f"Session State - NLP: {st.session_state['nlp'] is not None}, Summarizer: {st.session_state['summarizer'] is not None}")

-# # Load the summarization model locally
-# try:
-#     local_model_path = "./local_models/bart-financial"
-#     summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
-#     st.write("Local summarization model loaded successfully!")
-# except Exception as e:
-#     summarizer = None  # Handle case where model is missing
-#     st.write("Failed to load local summarization model.")
-
-
 # Define regex patterns to extract structured data
 patterns = {
     "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
@@ -177,8 +167,75 @@ def extract_text_tables_pdfplumber(pdf_file):
     else:
         print("No text extracted. The PDF might be image-based.")
         return None, None
-
-def split_text_into_chunks(text, tokenizer, max_tokens=1024):
+
+# NEW: Function to evaluate chunk relevance
+def evaluate_chunk_relevance(chunk, keywords=None):
+    """
+    Evaluate the relevance of a text chunk based on various factors.
+    Returns a score representing the chunk's relevance.
+    """
+    if not keywords:
+        # Default financial keywords
+        keywords = ["fund", "portfolio", "performance", "return", "asset", "investment",
+                    "expense", "risk", "benchmark", "allocation", "strategy", "market",
+                    "growth", "income", "dividend", "yield", "capital", "equity", "bond",
+                    "summary", "overview", "highlight", "key", "important", "significant"]
+
+    score = 0
+
+    # Factor 1: Length of the chunk (longer chunks often contain more information)
+    word_count = len(chunk.split())
+    score += min(word_count / 100, 5)  # Cap at 5 points
+
+    # Factor 2: Keyword presence
+    # Count keywords in lowercase text
+    lower_chunk = chunk.lower()
+    keyword_count = sum(1 for keyword in keywords if keyword.lower() in lower_chunk)
+    keyword_density = keyword_count / max(1, word_count) * 100
+    score += min(keyword_density * 2, 10)  # Cap at 10 points
+
+    # Factor 3: Presence of numbers (financial documents often contain important numbers)
+    number_count = len(re.findall(r'\d+\.?\d*%?', chunk))
+    score += min(number_count / 5, 5)  # Cap at 5 points
+
+    # Factor 4: Structured information (lists, tables, etc.)
+    bullet_count = len(re.findall(r'•|\*|-|–|[0-9]+\.', chunk))
+    score += min(bullet_count, 5)  # Cap at 5 points
+
+    # Factor 5: Presence of section headers
+    header_patterns = [
+        r'^[A-Z][A-Za-z\s]+:',  # Title followed by colon
+        r'^[A-Z][A-Z\s]+',      # ALL CAPS text
+        r'^\d+\.\s+[A-Z]'       # Numbered section
+    ]
+    header_count = sum(1 for pattern in header_patterns if re.search(pattern, chunk, re.MULTILINE))
+    score += min(header_count * 2, 5)  # Cap at 5 points
+
+    return score
+
+# NEW: Function to rank and select the best chunks
+def rank_and_select_chunks(chunks, max_chunks=5, keywords=None):
+    """
+    Rank chunks by relevance and return the top chunks.
+    """
+    # Evaluate each chunk
+    chunk_scores = [(chunk, evaluate_chunk_relevance(chunk, keywords)) for chunk in chunks]
+
+    # Sort chunks by score (highest first)
+    sorted_chunks = sorted(chunk_scores, key=lambda x: x[1], reverse=True)
+
+    # Select the top N chunks
+    top_chunks = [chunk for chunk, score in sorted_chunks[:max_chunks]]
+
+    # Print scores for debugging
+    print("Chunk scores:")
+    for i, (chunk, score) in enumerate(sorted_chunks):
+        print(f"Chunk {i+1}: Score {score:.2f}, Length {len(chunk.split())} words")
+        print(f"First 100 chars: {chunk[:100]}...")
+
+    return top_chunks
+
+def split_text_into_chunks(text, tokenizer, max_tokens=512):
     sentences = nltk.sent_tokenize(text)
     chunks = []
     current_chunk = ''
@@ -269,7 +326,7 @@ pdf_tables = st.session_state["pdf_tables"]

 # Ensure that the models are loaded
 if nlp is None or summarizer is None:
-    st.error("Models are not properly loaded. Please check your model paths and installation.")
+    st.error("Models are not properly loaded. Please check model paths and installation.")
 else:
     # Step 3: Named Entity Recognition (NER)
     st.subheader("NER Analysis")
@@ -284,7 +341,7 @@ else:
     if st.button("Analyze"):
         # Ensure full extracted text is used for analysis
         text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()
-
+
         if text_for_analysis:
             with st.spinner("Analyzing text..."):
                 # Extract structured financial data using regex (Now using full text)
@@ -294,27 +351,43 @@ else:
                         if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
                     }

-                # Use spaCy to extract additional financial terms (Now using full text)
                 doc = nlp(text_for_analysis)
                 financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
-
+
                 # Store extracted data in a structured dictionary
                 structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
-
+
                 # Display results
                 st.write("Entities Found:")
                 st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
-
+
                 st.write("Structured Data Extracted:")
                 st.write(pd.DataFrame([structured_data]))
-
+
         else:
             st.error("Please provide some text for analysis.")
-
+
     # Step 4: Summarization
     st.subheader("Summarization")
     st.write("Generate concise summaries of financial documents.")

+    # Add customization options for summarization with chunk selection
+    st.sidebar.header("Summarization Settings")
+    max_chunks_to_process = st.sidebar.slider(
+        "Max chunks to summarize",
+        min_value=1,
+        max_value=10,
+        value=3,
+        help="Select fewer chunks for faster processing but less comprehensive summaries"
+    )
+
+    # Allow users to add custom keywords
+    custom_keywords = st.sidebar.text_input(
+        "Add custom keywords (comma separated)",
+        value="",
+        help="Add domain-specific keywords to improve chunk selection"
+    )
+
     # Text summarization input
     input_text = st.text_area(
         "Enter text to summarize",
@@ -322,6 +395,9 @@ else:
         value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
     )

+    # Add option to see chunk selection details
+    show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
+
     if st.button("Summarize"):
         text_to_summarize = input_text.strip()
         if text_to_summarize:
@@ -329,94 +405,88 @@ else:
                 # Display original text length
                 input_length = len(text_to_summarize.split())
                 st.write(f"Original text length: {input_length} words")
-
-                # Define the maximum number of tokens the model can handle
-                max_input_tokens = 1024  # BART's maximum input length
-
-                # Function to split text into chunks based on tokens (modified to avoid overlaps)
-                def split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens):
-                    sentences = nltk.sent_tokenize(text)
-                    chunks = []
-                    current_chunk = ''
-                    current_length = 0
-
-                    for sentence in sentences:
-                        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
-                        sentence_length = len(sentence_tokens)
-
-                        # If adding the sentence exceeds max_tokens, start a new chunk
-                        if current_length + sentence_length > max_tokens:
-                            if current_chunk:
-                                chunks.append(current_chunk.strip())
-                            current_chunk = sentence
-                            current_length = sentence_length
-                        else:
-                            current_chunk += ' ' + sentence
-                            current_length += sentence_length
-
-                    if current_chunk:
-                        chunks.append(current_chunk.strip())
-
-                    return chunks
-
-                # Function to remove duplicate sentences
-                def remove_duplicate_sentences(text):
-                    sentences = nltk.sent_tokenize(text)
-                    unique_sentences = []
-                    seen_sentences = set()
-
-                    for sentence in sentences:
-                        normalized_sentence = sentence.strip().lower()
-                        if normalized_sentence not in seen_sentences:
-                            seen_sentences.add(normalized_sentence)
-                            unique_sentences.append(sentence)
-
-                    return ' '.join(unique_sentences)
+
+                # Process custom keywords if provided
+                keywords = None
+                if custom_keywords:
+                    keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
+                    st.write(f"Using custom keywords: {', '.join(keywords)}")

                 # Split the text into manageable chunks
                 chunks = split_text_into_chunks(text_to_summarize, tokenizer)
                 st.write(f"Text has been split into {len(chunks)} chunks.")
+
+                # NEW: Rank and select the best chunks instead of processing all of them
+                selected_chunks = rank_and_select_chunks(
+                    chunks,
+                    max_chunks=max_chunks_to_process,
+                    keywords=keywords
+                )
+
+                st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")
+
+                # Show chunk selection details if requested
+                if show_chunk_details:
+                    with st.expander("Chunk Selection Details"):
+                        for i, chunk in enumerate(selected_chunks):
+                            st.markdown(f"**Chunk {i+1}**")
+                            st.write(f"Length: {len(chunk.split())} words")
+                            st.text(chunk[:300] + "..." if len(chunk) > 300 else chunk)
+                            st.write("---")

-                # Summarize each chunk
+                # Summarize each selected chunk
                 summaries = []
-                for i, chunk in enumerate(chunks):
-                    st.write(f"Summarizing chunk {i+1}/{len(chunks)}...")
-                    # Adjust summary length parameters as needed
-                    chunk_length = len(chunk.split())
-                    max_summary_length = min(150, chunk_length // 2)
-                    min_summary_length = max(50, max_summary_length // 2)
-
-                    try:
-                        summary_output = summarizer(
-                            chunk,
-                            max_length=max_summary_length,
-                            min_length=min_summary_length,
-                            do_sample=False,
-                            truncation=True
-                        )
-                        chunk_summary = summary_output[0]['summary_text'].strip()
-
-                        if not chunk_summary:
-                            st.warning(f"The summary for chunk {i+1} is empty.")
-                        else:
-                            summaries.append(chunk_summary)
-                            # Optionally display the summary of the current chunk
-                            # st.write(f"Summary of chunk {i+1}:")
-                            # st.write(chunk_summary)
-                            # st.write("---")
-
-                    except Exception as e:
-                        st.error(f"Summarization failed for chunk {i+1}: {e}")
-                        st.text(traceback.format_exc())
-                        continue
+                with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
+                    for i, chunk in enumerate(selected_chunks):
+                        st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
+                        # Adjust summary length parameters as needed
+                        chunk_length = len(chunk.split())
+                        max_summary_length = min(150, chunk_length // 2)
+                        min_summary_length = max(50, max_summary_length // 2)
+
+                        try:
+                            summary_output = summarizer(
+                                chunk,
+                                max_length=max_summary_length,
+                                min_length=min_summary_length,
+                                do_sample=False,
+                                truncation=True
+                            )
+                            chunk_summary = summary_output[0]['summary_text'].strip()
+
+                            if not chunk_summary:
+                                st.warning(f"The summary for chunk {i+1} is empty.")
+                            else:
+                                summaries.append(chunk_summary)
+
+                        except Exception as e:
+                            st.error(f"Summarization failed for chunk {i+1}: {e}")
+                            st.text(traceback.format_exc())
+                            continue

                 if summaries:
-                    # Combine summaries
+                    # Combine summaries and remove duplicates
                     combined_summary = ' '.join(summaries)
-                    # Remove duplicate sentences
                     final_summary = remove_duplicate_sentences(combined_summary)
-                    st.write("Final Summary:")
+
+                    # Calculate compression ratio
+                    summary_length = len(final_summary.split())
+                    compression_ratio = (1 - summary_length / input_length) * 100
+
+                    st.subheader("Final Summary")
                     st.success(final_summary)
+                    st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
+
+                    # Display summary statistics
+                    st.subheader("Summary Statistics")
+                    stats_col1, stats_col2 = st.columns(2)
+                    with stats_col1:
+                        st.metric("Original Length", f"{input_length} words")
+                        st.metric("Total Chunks", str(len(chunks)))
+                    with stats_col2:
+                        st.metric("Summary Length", f"{summary_length} words")
+                        st.metric("Chunks Processed", str(len(selected_chunks)))
+
                 else:
                     st.error("No summaries were generated.")

@@ -425,3 +495,18 @@ else:
                 st.text(traceback.format_exc())
         else:
             st.error("Please provide text to summarize.")
+
+    # Add help information
+    st.sidebar.markdown("---")
+    with st.sidebar.expander("How Chunk Selection Works"):
+        st.markdown("""
+        The chunk selection algorithm ranks text chunks based on:
+
+        1. **Keyword density** - Presence of financial terms
+        2. **Length** - Longer chunks often contain more information
+        3. **Numbers** - Financial documents with numbers are often important
+        4. **Structure** - Lists and bullet points signal key information
+        5. **Headers** - Section headers often introduce important content
+
+        Adjust the settings above to customize the selection process.
+        """)
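The two new ranking helpers are plain functions, so their scoring behavior can be checked outside Streamlit. Below is a minimal usage sketch (not part of this commit): it assumes evaluate_chunk_relevance() and rank_and_select_chunks() have been copied, together with "import re", into a hypothetical local module named chunk_ranking, since importing app.py directly would start the Streamlit app.

# Usage sketch (not part of the commit). The module name chunk_ranking is an
# assumption; the helpers themselves are the ones added to app.py above.
from chunk_ranking import evaluate_chunk_relevance, rank_and_select_chunks

chunks = [
    "PERFORMANCE SUMMARY: The fund returned 8.2% in 2024 versus a benchmark return of 7.1%. "
    "Expense ratio: 0.45%. Allocation: 60% equity, 40% bond.",
    "The annual meeting was well attended and refreshments were served afterwards.",
]

# Score each chunk on length, keyword density, numbers, structure, and headers.
for chunk in chunks:
    print(f"{evaluate_chunk_relevance(chunk):.2f}  {chunk[:50]}...")

# Keep only the highest-scoring chunk; custom keywords bias the ranking.
best = rank_and_select_chunks(chunks, max_chunks=1, keywords=["return", "expense", "equity"])
print(best[0][:50])

With the default keyword list, the first chunk should score higher than the second (more financial terms, numbers, and an ALL-CAPS header), which is the behavior the "Max chunks to summarize" slider relies on.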