KKowenn committed on
Commit
647728d
·
verified ·
1 Parent(s): 652103c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -133
app.py CHANGED
@@ -367,146 +367,146 @@ else:
367
  else:
368
  st.error("Please provide some text for analysis.")
369
 
370
- # Step 4: Summarization
371
- st.subheader("Summarization")
372
- st.write("Generate concise summaries of financial documents.")
373
-
374
- # Add customization options for summarization with chunk selection
375
- st.sidebar.header("Summarization Settings")
376
- max_chunks_to_process = st.sidebar.slider(
377
- "Max chunks to summarize",
378
- min_value=1,
379
- max_value=10,
380
- value=3,
381
- help="Select fewer chunks for faster processing but less comprehensive summaries"
382
- )
383
-
384
- # Allow users to add custom keywords
385
- custom_keywords = st.sidebar.text_input(
386
- "Add custom keywords (comma separated)",
387
- value="",
388
- help="Add domain-specific keywords to improve chunk selection"
389
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
- # Text summarization input
392
- input_text = st.text_area(
393
- "Enter text to summarize",
394
- height=200,
395
- value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
396
- )
 
 
 
397
 
398
- # Add option to see chunk selection details
399
- show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
 
 
400
 
401
- if st.button("Summarize"):
402
- text_to_summarize = input_text.strip()
403
- if text_to_summarize:
404
- try:
405
- # Display original text length
406
- input_length = len(text_to_summarize.split())
407
- st.write(f"Original text length: {input_length} words")
 
 
408
 
409
- # Process custom keywords if provided
410
- keywords = None
411
- if custom_keywords:
412
- keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
413
- st.write(f"Using custom keywords: {', '.join(keywords)}")
414
-
415
- # Split the text into manageable chunks
416
- chunks = split_text_into_chunks(text_to_summarize, tokenizer)
417
- st.write(f"Text has been split into {len(chunks)} chunks.")
418
 
419
- # NEW: Rank and select the best chunks instead of processing all of them
420
- selected_chunks = rank_and_select_chunks(
421
- chunks,
422
- max_chunks=max_chunks_to_process,
423
- keywords=keywords
424
- )
425
 
426
- st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")
 
 
 
 
 
 
 
 
427
 
428
- # Show chunk selection details if requested
429
- if show_chunk_details:
430
- with st.expander("Chunk Selection Details"):
431
- for i, chunk in enumerate(selected_chunks):
432
- st.markdown(f"**Chunk {i+1}**")
433
- st.write(f"Length: {len(chunk.split())} words")
434
- st.text(chunk[:300] + "..." if len(chunk) > 300 else chunk)
435
- st.write("---")
436
-
437
- # Summarize each selected chunk
438
- summaries = []
439
- with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
440
- for i, chunk in enumerate(selected_chunks):
441
- st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
442
- # Adjust summary length parameters as needed
443
- chunk_length = len(chunk.split())
444
- max_summary_length = min(150, chunk_length // 2)
445
- min_summary_length = max(50, max_summary_length // 2)
446
-
447
- try:
448
- summary_output = summarizer(
449
- chunk,
450
- max_length=max_summary_length,
451
- min_length=min_summary_length,
452
- do_sample=False,
453
- truncation=True
454
- )
455
- chunk_summary = summary_output[0]['summary_text'].strip()
456
-
457
- if not chunk_summary:
458
- st.warning(f"The summary for chunk {i+1} is empty.")
459
- else:
460
- summaries.append(chunk_summary)
461
 
462
- except Exception as e:
463
- st.error(f"Summarization failed for chunk {i+1}: {e}")
464
- st.text(traceback.format_exc())
465
- continue
 
466
 
467
- if summaries:
468
- # Combine summaries and remove duplicates
469
- combined_summary = ' '.join(summaries)
470
- final_summary = remove_duplicate_sentences(combined_summary)
471
-
472
- # Calculate compression ratio
473
- summary_length = len(final_summary.split())
474
- compression_ratio = (1 - summary_length / input_length) * 100
475
-
476
- st.subheader("Final Summary")
477
- st.success(final_summary)
478
- st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
479
-
480
- # Display summary statistics
481
- st.subheader("Summary Statistics")
482
- stats_col1, stats_col2 = st.columns(2)
483
- with stats_col1:
484
- st.metric("Original Length", f"{input_length} words")
485
- st.metric("Total Chunks", str(len(chunks)))
486
- with stats_col2:
487
- st.metric("Summary Length", f"{summary_length} words")
488
- st.metric("Chunks Processed", str(len(selected_chunks)))
489
-
490
- else:
491
- st.error("No summaries were generated.")
492
 
493
- except Exception as e:
494
- st.error("An error occurred during summarization.")
495
- st.text(traceback.format_exc())
496
- else:
497
- st.error("Please provide text to summarize.")
498
-
499
- # Add help information
500
- st.sidebar.markdown("---")
501
- with st.sidebar.expander("How Chunk Selection Works"):
502
- st.markdown("""
503
- The chunk selection algorithm ranks text chunks based on:
504
-
505
- 1. **Keyword density** - Presence of financial terms
506
- 2. **Length** - Longer chunks often contain more information
507
- 3. **Numbers** - Financial documents with numbers are often important
508
- 4. **Structure** - Lists and bullet points signal key information
509
- 5. **Headers** - Section headers often introduce important content
510
-
511
- Adjust the settings above to customize the selection process.
512
- """)
 
367
  else:
368
  st.error("Please provide some text for analysis.")
369
 
370
+ # Step 4: Summarization
371
+ st.subheader("Summarization")
372
+ st.write("Generate concise summaries of financial documents.")
373
+
374
+ # Add customization options for summarization with chunk selection
375
+ st.sidebar.header("Summarization Settings")
376
+ max_chunks_to_process = st.sidebar.slider(
377
+ "Max chunks to summarize",
378
+ min_value=1,
379
+ max_value=10,
380
+ value=3,
381
+ help="Select fewer chunks for faster processing but less comprehensive summaries"
382
+ )
383
+
384
+ # Allow users to add custom keywords
385
+ custom_keywords = st.sidebar.text_input(
386
+ "Add custom keywords (comma separated)",
387
+ value="",
388
+ help="Add domain-specific keywords to improve chunk selection"
389
+ )
390
+
391
+ # Text summarization input
392
+ input_text = st.text_area(
393
+ "Enter text to summarize",
394
+ height=200,
395
+ value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
396
+ )
397
+
398
+ # Add option to see chunk selection details
399
+ show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
400
+
401
+ if st.button("Summarize"):
402
+ text_to_summarize = input_text.strip()
403
+ if text_to_summarize:
404
+ try:
405
+ # Display original text length
406
+ input_length = len(text_to_summarize.split())
407
+ st.write(f"Original text length: {input_length} words")
408
+
409
+ # Process custom keywords if provided
410
+ keywords = None
411
+ if custom_keywords:
412
+ keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
413
+ st.write(f"Using custom keywords: {', '.join(keywords)}")
414
+
415
+ # Split the text into manageable chunks
416
+ chunks = split_text_into_chunks(text_to_summarize, tokenizer)
417
+ st.write(f"Text has been split into {len(chunks)} chunks.")
418
+
419
+ # NEW: Rank and select the best chunks instead of processing all of them
420
+ selected_chunks = rank_and_select_chunks(
421
+ chunks,
422
+ max_chunks=max_chunks_to_process,
423
+ keywords=keywords
424
+ )
425
+
426
+ st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")
427
+
428
+ # Show chunk selection details if requested
429
+ if show_chunk_details:
430
+ with st.expander("Chunk Selection Details"):
431
+ for i, chunk in enumerate(selected_chunks):
432
+ st.markdown(f"**Chunk {i+1}**")
433
+ st.write(f"Length: {len(chunk.split())} words")
434
+ st.text(chunk[:300] + "..." if len(chunk) > 300 else chunk)
435
+ st.write("---")
436
+
437
+ # Summarize each selected chunk
438
+ summaries = []
439
+ with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
440
+ for i, chunk in enumerate(selected_chunks):
441
+ st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
442
+ # Adjust summary length parameters as needed
443
+ chunk_length = len(chunk.split())
444
+ max_summary_length = min(150, chunk_length // 2)
445
+ min_summary_length = max(50, max_summary_length // 2)
446
 
447
+ try:
448
+ summary_output = summarizer(
449
+ chunk,
450
+ max_length=max_summary_length,
451
+ min_length=min_summary_length,
452
+ do_sample=False,
453
+ truncation=True
454
+ )
455
+ chunk_summary = summary_output[0]['summary_text'].strip()
456
 
457
+ if not chunk_summary:
458
+ st.warning(f"The summary for chunk {i+1} is empty.")
459
+ else:
460
+ summaries.append(chunk_summary)
461
 
462
+ except Exception as e:
463
+ st.error(f"Summarization failed for chunk {i+1}: {e}")
464
+ st.text(traceback.format_exc())
465
+ continue
466
+
467
+ if summaries:
468
+ # Combine summaries and remove duplicates
469
+ combined_summary = ' '.join(summaries)
470
+ final_summary = remove_duplicate_sentences(combined_summary)
471
 
472
+ # Calculate compression ratio
473
+ summary_length = len(final_summary.split())
474
+ compression_ratio = (1 - summary_length / input_length) * 100
 
 
 
 
 
 
475
 
476
+ st.subheader("Final Summary")
477
+ st.success(final_summary)
478
+ st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
 
 
 
479
 
480
+ # Display summary statistics
481
+ st.subheader("Summary Statistics")
482
+ stats_col1, stats_col2 = st.columns(2)
483
+ with stats_col1:
484
+ st.metric("Original Length", f"{input_length} words")
485
+ st.metric("Total Chunks", str(len(chunks)))
486
+ with stats_col2:
487
+ st.metric("Summary Length", f"{summary_length} words")
488
+ st.metric("Chunks Processed", str(len(selected_chunks)))
489
 
490
+ else:
491
+ st.error("No summaries were generated.")
492
+
493
+ except Exception as e:
494
+ st.error("An error occurred during summarization.")
495
+ st.text(traceback.format_exc())
496
+ else:
497
+ st.error("Please provide text to summarize.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
+ # Add help information
500
+ st.sidebar.markdown("---")
501
+ with st.sidebar.expander("How Chunk Selection Works"):
502
+ st.markdown("""
503
+ The chunk selection algorithm ranks text chunks based on:
504
 
505
+ 1. **Keyword density** - Presence of financial terms
506
+ 2. **Length** - Longer chunks often contain more information
507
+ 3. **Numbers** - Financial documents with numbers are often important
508
+ 4. **Structure** - Lists and bullet points signal key information
509
+ 5. **Headers** - Section headers often introduce important content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
 
511
+ Adjust the settings above to customize the selection process.
512
+ """)