Spaces:

KKowenn
/

finbreif3

Running

App Files Community

KKowenn committed 2 days ago

Commit

647728d

verified ·

1 Parent(s): 652103c

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -133

app.py CHANGED Viewed

@@ -367,146 +367,146 @@ else:
         else:
             st.error("Please provide some text for analysis.")
-        # Step 4: Summarization
-        st.subheader("Summarization")
-        st.write("Generate concise summaries of financial documents.")
-        # Add customization options for summarization with chunk selection
-        st.sidebar.header("Summarization Settings")
-        max_chunks_to_process = st.sidebar.slider(
-            "Max chunks to summarize",
-            min_value=1,
-            max_value=10,
-            value=3,
-            help="Select fewer chunks for faster processing but less comprehensive summaries"
-        )
-        # Allow users to add custom keywords
-        custom_keywords = st.sidebar.text_input(
-            "Add custom keywords (comma separated)",
-            value="",
-            help="Add domain-specific keywords to improve chunk selection"
-        )
-        # Text summarization input
-        input_text = st.text_area(
-            "Enter text to summarize",
-            height=200,
-            value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
-        )
-        # Add option to see chunk selection details
-        show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
-        if st.button("Summarize"):
-            text_to_summarize = input_text.strip()
-            if text_to_summarize:
-                try:
-                    # Display original text length
-                    input_length = len(text_to_summarize.split())
-                    st.write(f"Original text length: {input_length} words")
-                    # Process custom keywords if provided
-                    keywords = None
-                    if custom_keywords:
-                        keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
-                        st.write(f"Using custom keywords: {', '.join(keywords)}")
-                    # Split the text into manageable chunks
-                    chunks = split_text_into_chunks(text_to_summarize, tokenizer)
-                    st.write(f"Text has been split into {len(chunks)} chunks.")
-                    # NEW: Rank and select the best chunks instead of processing all of them
-                    selected_chunks = rank_and_select_chunks(
-                        chunks,
-                        max_chunks=max_chunks_to_process,
-                        keywords=keywords
-                    )
-                    st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")
-                    # Show chunk selection details if requested
-                    if show_chunk_details:
-                        with st.expander("Chunk Selection Details"):
-                            for i, chunk in enumerate(selected_chunks):
-                                st.markdown(f"**Chunk {i+1}**")
-                                st.write(f"Length: {len(chunk.split())} words")
-                                st.text(chunk[:300] + "..." if len(chunk) > 300 else chunk)
-                                st.write("---")
-                    # Summarize each selected chunk
-                    summaries = []
-                    with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
-                        for i, chunk in enumerate(selected_chunks):
-                            st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
-                            # Adjust summary length parameters as needed
-                            chunk_length = len(chunk.split())
-                            max_summary_length = min(150, chunk_length // 2)
-                            min_summary_length = max(50, max_summary_length // 2)
-                            try:
-                                summary_output = summarizer(
-                                    chunk,
-                                    max_length=max_summary_length,
-                                    min_length=min_summary_length,
-                                    do_sample=False,
-                                    truncation=True
-                                )
-                                chunk_summary = summary_output[0]['summary_text'].strip()
-                                if not chunk_summary:
-                                    st.warning(f"The summary for chunk {i+1} is empty.")
-                                else:
-                                    summaries.append(chunk_summary)
-                            except Exception as e:
-                                st.error(f"Summarization failed for chunk {i+1}: {e}")
-                                st.text(traceback.format_exc())
-                                continue
-                    if summaries:
-                        # Combine summaries and remove duplicates
-                        combined_summary = ' '.join(summaries)
-                        final_summary = remove_duplicate_sentences(combined_summary)
-                        # Calculate compression ratio
-                        summary_length = len(final_summary.split())
-                        compression_ratio = (1 - summary_length / input_length) * 100
-                        st.subheader("Final Summary")
-                        st.success(final_summary)
-                        st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
-                        # Display summary statistics
-                        st.subheader("Summary Statistics")
-                        stats_col1, stats_col2 = st.columns(2)
-                        with stats_col1:
-                            st.metric("Original Length", f"{input_length} words")
-                            st.metric("Total Chunks", str(len(chunks)))
-                        with stats_col2:
-                            st.metric("Summary Length", f"{summary_length} words")
-                            st.metric("Chunks Processed", str(len(selected_chunks)))
-                    else:
-                        st.error("No summaries were generated.")
-                except Exception as e:
-                    st.error("An error occurred during summarization.")
-                    st.text(traceback.format_exc())
-            else:
-                st.error("Please provide text to summarize.")
-        # Add help information
-        st.sidebar.markdown("---")
-        with st.sidebar.expander("How Chunk Selection Works"):
-            st.markdown("""
-            The chunk selection algorithm ranks text chunks based on:
-            1. **Keyword density** - Presence of financial terms
-            2. **Length** - Longer chunks often contain more information
-            3. **Numbers** - Financial documents with numbers are often important
-            4. **Structure** - Lists and bullet points signal key information
-            5. **Headers** - Section headers often introduce important content
-            Adjust the settings above to customize the selection process.
-            """)

         else:
             st.error("Please provide some text for analysis.")
+    # Step 4: Summarization
+    st.subheader("Summarization")
+    st.write("Generate concise summaries of financial documents.")
+    # Add customization options for summarization with chunk selection
+    st.sidebar.header("Summarization Settings")
+    max_chunks_to_process = st.sidebar.slider(
+        "Max chunks to summarize",
+        min_value=1,
+        max_value=10,
+        value=3,
+        help="Select fewer chunks for faster processing but less comprehensive summaries"
+    )
+    # Allow users to add custom keywords
+    custom_keywords = st.sidebar.text_input(
+        "Add custom keywords (comma separated)",
+        value="",
+        help="Add domain-specific keywords to improve chunk selection"
+    )
+    # Text summarization input
+    input_text = st.text_area(
+        "Enter text to summarize",
+        height=200,
+        value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
+    )
+    # Add option to see chunk selection details
+    show_chunk_details = st.sidebar.checkbox("Show chunk selection details", value=False)
+    if st.button("Summarize"):
+        text_to_summarize = input_text.strip()
+        if text_to_summarize:
+            try:
+                # Display original text length
+                input_length = len(text_to_summarize.split())
+                st.write(f"Original text length: {input_length} words")
+                # Process custom keywords if provided
+                keywords = None
+                if custom_keywords:
+                    keywords = [kw.strip() for kw in custom_keywords.split(",") if kw.strip()]
+                    st.write(f"Using custom keywords: {', '.join(keywords)}")
+                # Split the text into manageable chunks
+                chunks = split_text_into_chunks(text_to_summarize, tokenizer)
+                st.write(f"Text has been split into {len(chunks)} chunks.")
+                # NEW: Rank and select the best chunks instead of processing all of them
+                selected_chunks = rank_and_select_chunks(
+                    chunks,
+                    max_chunks=max_chunks_to_process,
+                    keywords=keywords
+                )
+                st.write(f"Selected {len(selected_chunks)} highest-ranked chunks for summarization.")
+                # Show chunk selection details if requested
+                if show_chunk_details:
+                    with st.expander("Chunk Selection Details"):
+                        for i, chunk in enumerate(selected_chunks):
+                            st.markdown(f"**Chunk {i+1}**")
+                            st.write(f"Length: {len(chunk.split())} words")
+                            st.text(chunk[:300] + "..." if len(chunk) > 300 else chunk)
+                            st.write("---")
+                # Summarize each selected chunk
+                summaries = []
+                with st.spinner(f"Summarizing {len(selected_chunks)} chunks..."):
+                    for i, chunk in enumerate(selected_chunks):
+                        st.write(f"Summarizing chunk {i+1}/{len(selected_chunks)}...")
+                        # Adjust summary length parameters as needed
+                        chunk_length = len(chunk.split())
+                        max_summary_length = min(150, chunk_length // 2)
+                        min_summary_length = max(50, max_summary_length // 2)
+                        try:
+                            summary_output = summarizer(
+                                chunk,
+                                max_length=max_summary_length,
+                                min_length=min_summary_length,
+                                do_sample=False,
+                                truncation=True
+                            )
+                            chunk_summary = summary_output[0]['summary_text'].strip()
+                            if not chunk_summary:
+                                st.warning(f"The summary for chunk {i+1} is empty.")
+                            else:
+                                summaries.append(chunk_summary)
+                        except Exception as e:
+                            st.error(f"Summarization failed for chunk {i+1}: {e}")
+                            st.text(traceback.format_exc())
+                            continue
+                if summaries:
+                    # Combine summaries and remove duplicates
+                    combined_summary = ' '.join(summaries)
+                    final_summary = remove_duplicate_sentences(combined_summary)
+                    # Calculate compression ratio
+                    summary_length = len(final_summary.split())
+                    compression_ratio = (1 - summary_length / input_length) * 100
+                    st.subheader("Final Summary")
+                    st.success(final_summary)
+                    st.write(f"Summary length: {summary_length} words ({compression_ratio:.1f}% compression)")
+                    # Display summary statistics
+                    st.subheader("Summary Statistics")
+                    stats_col1, stats_col2 = st.columns(2)
+                    with stats_col1:
+                        st.metric("Original Length", f"{input_length} words")
+                        st.metric("Total Chunks", str(len(chunks)))
+                    with stats_col2:
+                        st.metric("Summary Length", f"{summary_length} words")
+                        st.metric("Chunks Processed", str(len(selected_chunks)))
+                else:
+                    st.error("No summaries were generated.")
+            except Exception as e:
+                st.error("An error occurred during summarization.")
+                st.text(traceback.format_exc())
+        else:
+            st.error("Please provide text to summarize.")
+    # Add help information
+    st.sidebar.markdown("---")
+    with st.sidebar.expander("How Chunk Selection Works"):
+        st.markdown("""
+        The chunk selection algorithm ranks text chunks based on:
+        1. **Keyword density** - Presence of financial terms
+        2. **Length** - Longer chunks often contain more information
+        3. **Numbers** - Financial documents with numbers are often important
+        4. **Structure** - Lists and bullet points signal key information
+        5. **Headers** - Section headers often introduce important content
+        Adjust the settings above to customize the selection process.
+        """)