Update app.py
Browse files
app.py
CHANGED
@@ -367,146 +367,146 @@ else:
|
|
367 |
else:
|
368 |
st.error("Please provide some text for analysis.")
|
369 |
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
|
|
|
|
|
|
397 |
|
398 |
-
|
399 |
-
|
|
|
|
|
400 |
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
|
|
|
|
408 |
|
409 |
-
#
|
410 |
-
|
411 |
-
|
412 |
-
keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
|
413 |
-
st.write(f"Using custom keywords: {', '.join(keywords)}")
|
414 |
-
|
415 |
-
# Split the text into manageable chunks
|
416 |
-
chunks = split_text_into_chunks(text_to_summarize, tokenizer)
|
417 |
-
st.write(f"Text has been split into {len(chunks)} chunks.")
|
418 |
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
max_chunks=max_chunks_to_process,
|
423 |
-
keywords=keywords
|
424 |
-
)
|
425 |
|
426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
# Summarize each selected chunk
|
438 |
-
summaries = []
|
439 |
-
with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
|
440 |
-
for i, chunk in enumerate(selected_chunks):
|
441 |
-
st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
|
442 |
-
# Adjust summary length parameters as needed
|
443 |
-
chunk_length = len(chunk.split())
|
444 |
-
max_summary_length = min(150, chunk_length // 2)
|
445 |
-
min_summary_length = max(50, max_summary_length // 2)
|
446 |
-
|
447 |
-
try:
|
448 |
-
summary_output = summarizer(
|
449 |
-
chunk,
|
450 |
-
max_length=max_summary_length,
|
451 |
-
min_length=min_summary_length,
|
452 |
-
do_sample=False,
|
453 |
-
truncation=True
|
454 |
-
)
|
455 |
-
chunk_summary = summary_output[0]['summary_text'].strip()
|
456 |
-
|
457 |
-
if not chunk_summary:
|
458 |
-
st.warning(f"The summary for chunk {i+1} is empty.")
|
459 |
-
else:
|
460 |
-
summaries.append(chunk_summary)
|
461 |
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
|
|
466 |
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
# Calculate compression ratio
|
473 |
-
summary_length = len(final_summary.split())
|
474 |
-
compression_ratio = (1 - summary_length / input_length) * 100
|
475 |
-
|
476 |
-
st.subheader("Final Summary")
|
477 |
-
st.success(final_summary)
|
478 |
-
st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
|
479 |
-
|
480 |
-
# Display summary statistics
|
481 |
-
st.subheader("Summary Statistics")
|
482 |
-
stats_col1, stats_col2 = st.columns(2)
|
483 |
-
with stats_col1:
|
484 |
-
st.metric("Original Length", f"{input_length} words")
|
485 |
-
st.metric("Total Chunks", str(len(chunks)))
|
486 |
-
with stats_col2:
|
487 |
-
st.metric("Summary Length", f"{summary_length} words")
|
488 |
-
st.metric("Chunks Processed", str(len(selected_chunks)))
|
489 |
-
|
490 |
-
else:
|
491 |
-
st.error("No summaries were generated.")
|
492 |
|
493 |
-
|
494 |
-
|
495 |
-
st.text(traceback.format_exc())
|
496 |
-
else:
|
497 |
-
st.error("Please provide text to summarize.")
|
498 |
-
|
499 |
-
# Add help information
|
500 |
-
st.sidebar.markdown("---")
|
501 |
-
with st.sidebar.expander("How Chunk Selection Works"):
|
502 |
-
st.markdown("""
|
503 |
-
The chunk selection algorithm ranks text chunks based on:
|
504 |
-
|
505 |
-
1. **Keyword density** - Presence of financial terms
|
506 |
-
2. **Length** - Longer chunks often contain more information
|
507 |
-
3. **Numbers** - Financial documents with numbers are often important
|
508 |
-
4. **Structure** - Lists and bullet points signal key information
|
509 |
-
5. **Headers** - Section headers often introduce important content
|
510 |
-
|
511 |
-
Adjust the settings above to customize the selection process.
|
512 |
-
""")
|
|
|
367 |
else:
|
368 |
st.error("Please provide some text for analysis.")
|
369 |
|
370 |
+
# Step 4: Summarization
# Renders the section header, the sidebar controls (chunk limit, custom
# keywords, details toggle) and the main text box for the text to summarize.
st.subheader("Summarization")
st.write("Generate concise summaries of financial documents.")

# Add customization options for summarization with chunk selection
st.sidebar.header("Summarization Settings")
max_chunks_to_process = st.sidebar.slider(
    "Max chunks to summarize",
    min_value=1,
    max_value=10,
    value=3,
    help="Select fewer chunks for faster processing but less comprehensive summaries",
)

# Allow users to add custom keywords (raw comma-separated string; parsed later)
custom_keywords = st.sidebar.text_input(
    "Add custom keywords (comma separated)",
    value="",
    help="Add domain-specific keywords to improve chunk selection",
)

# Text summarization input.
# `.get(..., "")` already yields "" when "pdf_text" is absent, so no extra
# membership test is needed.
# NOTE(review): "pdf_text" is presumably stored by an upload step outside this view - confirm.
input_text = st.text_area(
    "Enter text to summarize",
    height=200,
    value=st.session_state.get("pdf_text", ""),
)

# Add option to see chunk selection details
show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
|
400 |
+
|
401 |
+
# Handler for the "Summarize" button: split the input into chunks, pick the
# highest-ranked ones, summarize each, then show the merged summary and stats.
# Relies on names defined outside this section: summarizer, tokenizer,
# split_text_into_chunks, rank_and_select_chunks, remove_duplicate_sentences,
# traceback.
# NOTE(review): nesting below was reconstructed from a flattened view of the
# file - confirm against the real indentation.
if st.button("Summarize"):
    text_to_summarize = input_text.strip()
    if text_to_summarize:
        try:
            # Display original text length (whitespace-separated word count;
            # non-zero here because the stripped text is non-empty)
            input_length = len(text_to_summarize.split())
            st.write(f"Original text length: {input_length} words")

            # Process custom keywords if provided
            keywords = None
            if custom_keywords:
                keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
                st.write(f"Using custom keywords: {', '.join(keywords)}")

            # Split the text into manageable chunks
            chunks = split_text_into_chunks(text_to_summarize, tokenizer)
            st.write(f"Text has been split into {len(chunks)} chunks.")

            # Rank and select the best chunks instead of processing all of them
            selected_chunks = rank_and_select_chunks(
                chunks,
                max_chunks=max_chunks_to_process,
                keywords=keywords,
            )

            st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")

            # Show chunk selection details if requested
            if show_chunk_details:
                with st.expander("Chunk Selection Details"):
                    for i, chunk in enumerate(selected_chunks):
                        st.markdown(f"**Chunk {i+1}**")
                        st.write(f"Length: {len(chunk.split())} words")
                        # Preview at most the first 300 characters
                        st.text((chunk[:300] + "...") if len(chunk) > 300 else chunk)
                        st.write("---")

            # Summarize each selected chunk
            summaries = []
            with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
                for i, chunk in enumerate(selected_chunks):
                    st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")

                    # Length bounds are derived from the chunk's word count.
                    # NOTE(review): the summarizer presumably interprets these
                    # as token counts, not words - confirm.
                    chunk_length = len(chunk.split())
                    max_summary_length = min(150, chunk_length // 2)
                    # Clamp the lower bound so it never exceeds the upper bound;
                    # without the clamp any chunk under 100 words got
                    # min_length=50 with max_length<50.
                    min_summary_length = min(
                        max(50, max_summary_length // 2),
                        max_summary_length,
                    )

                    try:
                        summary_output = summarizer(
                            chunk,
                            max_length=max_summary_length,
                            min_length=min_summary_length,
                            do_sample=False,
                            truncation=True,
                        )
                        chunk_summary = summary_output[0]['summary_text'].strip()

                        if not chunk_summary:
                            st.warning(f"The summary for chunk {i+1} is empty.")
                        else:
                            summaries.append(chunk_summary)

                    except Exception as e:
                        # Best-effort: report the failure and move on so one
                        # bad chunk does not abort the whole run.
                        st.error(f"Summarization failed for chunk {i+1}: {e}")
                        st.text(traceback.format_exc())
                        continue

            if summaries:
                # Combine summaries and remove duplicates
                combined_summary = ' '.join(summaries)
                final_summary = remove_duplicate_sentences(combined_summary)

                # Calculate compression ratio (percentage of words removed)
                summary_length = len(final_summary.split())
                compression_ratio = (1 - summary_length / input_length) * 100

                st.subheader("Final Summary")
                st.success(final_summary)
                st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")

                # Display summary statistics
                st.subheader("Summary Statistics")
                stats_col1, stats_col2 = st.columns(2)
                with stats_col1:
                    st.metric("Original Length", f"{input_length} words")
                    st.metric("Total Chunks", str(len(chunks)))
                with stats_col2:
                    st.metric("Summary Length", f"{summary_length} words")
                    st.metric("Chunks Processed", str(len(selected_chunks)))

            else:
                st.error("No summaries were generated.")

        except Exception:
            # Top-level boundary for the handler: show the traceback in the UI
            # instead of crashing the app.
            st.error("An error occurred during summarization.")
            st.text(traceback.format_exc())
    else:
        st.error("Please provide text to summarize.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
|
499 |
+
# Sidebar help: a divider followed by a collapsible explanation of the
# criteria listed for chunk ranking.
chunk_selection_help = """
The chunk selection algorithm ranks text chunks based on:

1. **Keyword density** - Presence of financial terms
2. **Length** - Longer chunks often contain more information
3. **Numbers** - Financial documents with numbers are often important
4. **Structure** - Lists and bullet points signal key information
5. **Headers** - Section headers often introduce important content

Adjust the settings above to customize the selection process.
"""
st.sidebar.markdown("---")
help_panel = st.sidebar.expander("How Chunk Selection Works")
with help_panel:
    st.markdown(chunk_selection_help)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|