KKowenn committed
Commit fb39280 · verified · 1 Parent(s): 7f6a448

Upload 4 files

Files changed (4)
  1. README.md +9 -7
  2. app.py +427 -0
  3. requirements.txt +11 -0
  4. space.yaml +7 -0
README.md CHANGED
@@ -1,13 +1,15 @@
 ---
-title: Finbreif3
-emoji: 🏃
-colorFrom: indigo
-colorTo: green
+title: FinBrief
+emoji: 💵
+colorFrom: green
+colorTo: gray
 sdk: streamlit
-sdk_version: 1.43.2
 app_file: app.py
 pinned: false
-short_description: debugging
+license: mit
+short_description: Financial PDF Document Summarization web-App
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+# Install Rust
+RUN apt-get update && apt-get install -y cargo
app.py ADDED
@@ -0,0 +1,427 @@
+import streamlit as st
+import spacy
+import pandas as pd
+import re
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+import subprocess
+import os
+os.environ["TRANSFORMERS_CACHE"] = "/home/user/.cache/huggingface"
+os.environ["HF_HOME"] = "/home/user/.cache/huggingface"
+os.environ["TORCH_HOME"] = "/home/user/.cache/torch"
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+
+import torch
+import nltk
+from nltk.tokenize import sent_tokenize
+import traceback
+
+# Set Streamlit page config
+st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
+
+try:
+    nlp = spacy.load("en_core_web_sm")
+    st.write("spaCy model loaded successfully!")
+    print("spaCy model loaded successfully!")
+except OSError:
+    st.write("Failed to load spaCy model. Attempting to install...")
+    print("Failed to load spaCy model. Attempting to install...")
+    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+    try:
+        nlp = spacy.load("en_core_web_sm")
+        st.write("spaCy model installed and loaded successfully!")
+        print("spaCy model installed and loaded successfully!")
+    except Exception as e:
+        st.write(f"Still failed to load spaCy model: {e}")
+        print(f"Still failed to load spaCy model: {e}")
+        nlp = None  # Mark spaCy as failed
+
+model_name = "kritsadaK/bart-financial-summarization"
+
+try:
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
+    summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+    st.write("Hugging Face summarization model loaded successfully!")
+    print("Hugging Face summarization model loaded successfully!")
+except Exception as e:
+    st.write(f"Failed to load Hugging Face summarization model: {e}")
+    print(f"Failed to load Hugging Face summarization model: {e}")
+    summarizer = None  # Mark Hugging Face model as failed
+
+# Store models in Streamlit session state
+st.session_state["nlp"] = nlp
+st.session_state["summarizer"] = summarizer
+
+# UI: Show clear error messages if models failed
+if nlp is None:
+    st.error("The spaCy model failed to load. Ensure it is installed.")
+if summarizer is None:
+    st.error("The summarization model failed to load. Check the model path or internet connection.")
+
+st.title("FinBrief: Financial Document Insights")
+st.write("Upload a financial document for analysis.")
+
+
+# Initialize session state
+if "nlp" not in st.session_state:
+    st.session_state["nlp"] = nlp
+if "summarizer" not in st.session_state:
+    st.session_state["summarizer"] = summarizer
+
+# Set up NLTK data directory
+nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
+if not os.path.exists(nltk_data_dir):
+    os.makedirs(nltk_data_dir)
+nltk.data.path.append(nltk_data_dir)
+
+def download_nltk_punkt():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        st.write("NLTK 'punkt' tokenizer is already installed.")
+        print("NLTK 'punkt' tokenizer is already installed.")
+    except LookupError:
+        st.write("NLTK 'punkt' tokenizer not found. Attempting to download...")
+        print("NLTK 'punkt' tokenizer not found. Attempting to download...")
+        try:
+            nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
+            nltk.data.find('tokenizers/punkt')
+            st.write("NLTK 'punkt' tokenizer downloaded successfully.")
+            print("NLTK 'punkt' tokenizer downloaded successfully.")
+        except Exception as e:
+            st.error(f"NLTK 'punkt' tokenizer download failed: {e}")
+            print(f"NLTK 'punkt' tokenizer download failed: {e}")
+
+# Call the function at the beginning of the script
+download_nltk_punkt()
+
+# Debugging: Check session state initialization
+print(f"Session State - NLP: {st.session_state['nlp'] is not None}, Summarizer: {st.session_state['summarizer'] is not None}")
+
+# # Load the summarization model locally
+# try:
+#     local_model_path = "./local_models/bart-financial"
+#     summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
+#     st.write("Local summarization model loaded successfully!")
+# except Exception as e:
+#     summarizer = None  # Handle case where model is missing
+#     st.write("Failed to load local summarization model.")
+
+
+# Define regex patterns to extract structured data
+patterns = {
+    "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
+    "CUSIP": r"CUSIP\s+(\d+)",
+    "Inception Date": r"Inception Date\s+([\w\s\d]+)",
+    "Benchmark": r"Benchmark\s+([\w\s\d]+)",
+    "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
+    "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
+    "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
+    "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
+    "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
+    "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)"
+}
+
+# Set the title and layout
+st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
+
+# Custom styling (this remains unchanged)
+st.markdown(
+    """
+    <style>
+    .sidebar .sidebar-content {
+        background-color: #f7f7f7;
+        color: #333;
+    }
+    .css-1d391kg {
+        background-color: #f0f4f8;
+    }
+    .stButton>button {
+        background-color: #4CAF50;
+        color: white;
+        padding: 10px 20px;
+        border-radius: 5px;
+        font-size: 16px;
+    }
+    .stTextArea textarea {
+        border: 2px solid #4CAF50;
+        border-radius: 5px;
+        padding: 10px;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
+
+# Function to extract text and tables using pdfplumber
+def extract_text_tables_pdfplumber(pdf_file):
+    import io
+    import pdfplumber
+
+    print("\nPDFPlumber: Extracting text and tables...")
+    with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
+        all_text = ""
+        all_tables = []
+
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                all_text += page_text + "\n"
+
+            # Extract tables
+            tables = page.extract_tables()
+            all_tables.extend(tables)  # Store all tables
+
+        if all_text.strip():
+            print(all_text[:1000])  # Print first 1000 characters for verification
+            return all_text, all_tables
+        else:
+            print("No text extracted. The PDF might be image-based.")
+            return None, None
+
+def split_text_into_chunks(text, tokenizer, max_tokens=1024):
+    sentences = nltk.sent_tokenize(text)
+    chunks = []
+    current_chunk = ''
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
+        sentence_length = len(sentence_tokens)
+
+        # If adding the next sentence exceeds the max_tokens limit
+        if current_length + sentence_length > max_tokens:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            # Start a new chunk
+            current_chunk = sentence
+            current_length = sentence_length
+        else:
+            current_chunk += ' ' + sentence
+            current_length += sentence_length
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
+
+def remove_duplicate_sentences(text):
+    sentences = nltk.sent_tokenize(text)
+    unique_sentences = []
+    seen_sentences = set()
+
+    for sentence in sentences:
+        # Normalize the sentence to ignore case and punctuation for comparison
+        normalized_sentence = sentence.strip().lower()
+        if normalized_sentence not in seen_sentences:
+            seen_sentences.add(normalized_sentence)
+            unique_sentences.append(sentence)
+
+    return ' '.join(unique_sentences)
+
+# Ensure session state is initialized
+if "pdf_text" not in st.session_state:
+    st.session_state["pdf_text"] = ""
+if "pdf_tables" not in st.session_state:
+    st.session_state["pdf_tables"] = []  # Initialize as an empty list
+
+# Step 0: Upload PDF
+st.sidebar.header("Upload Your Financial Document")
+uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+
+if uploaded_file is not None:
+    st.sidebar.write(f"You uploaded: {uploaded_file.name}")
+
+    # Extract text and tables
+    pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)
+
+    if pdf_text is not None:
+        # Store results in session state
+        st.session_state["pdf_text"] = pdf_text
+        st.session_state["pdf_tables"] = pdf_tables  # Save tables separately
+
+        st.sidebar.success("PDF uploaded and text extracted!")
+    else:
+        st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
+        st.error("No text extracted from the uploaded PDF.")
+
+# Step 1: Display Extracted Text
+st.subheader("Extracted Text")
+if st.session_state["pdf_text"]:
+    st.text_area("Document Text", st.session_state["pdf_text"], height=400)
+else:
+    st.warning("No text extracted yet. Upload a PDF to start.")
+
+
+# Step 2: Display Extracted Tables (Fixed Error)
+st.subheader("Extracted Tables")
+if st.session_state["pdf_tables"]:  # Check if tables exist
+    for idx, table in enumerate(st.session_state["pdf_tables"]):
+        st.write(f"Table {idx+1}")
+        st.write(pd.DataFrame(table))  # Display tables as DataFrames
+else:
+    st.info("No tables extracted.")
+
+# Retrieve variables from session state
+nlp = st.session_state["nlp"]
+summarizer = st.session_state["summarizer"]
+pdf_text = st.session_state["pdf_text"]
+pdf_tables = st.session_state["pdf_tables"]
+
+# Ensure that the models are loaded
+if nlp is None or summarizer is None:
+    st.error("Models are not properly loaded. Please check your model paths and installation.")
+else:
+    # Step 3: Named Entity Recognition (NER)
+    st.subheader("NER Analysis")
+
+    # Display full extracted text, not just first 1000 characters
+    example_text = st.text_area(
+        "Enter or paste text for analysis",
+        height=400,
+        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
+    )
+
+    if st.button("Analyze"):
+        # Ensure full extracted text is used for analysis
+        text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()
+
+        if text_for_analysis:
+            with st.spinner("Analyzing text..."):
+                # Extract structured financial data using regex (Now using full text)
+                extracted_data = {
+                    key: (match.group(1) if match else "N/A")
+                    for key, pattern in patterns.items()
+                    if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
+                }
+
+                # Use spaCy to extract additional financial terms (Now using full text)
+                doc = nlp(text_for_analysis)
+                financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
+
+                # Store extracted data in a structured dictionary
+                structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
+
+                # Display results
+                st.write("Entities Found:")
+                st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
+
+                st.write("Structured Data Extracted:")
+                st.write(pd.DataFrame([structured_data]))
+
+        else:
+            st.error("Please provide some text for analysis.")
+
+    # Step 4: Summarization
+    st.subheader("Summarization")
+    st.write("Generate concise summaries of financial documents.")
+
+    # Text summarization input
+    input_text = st.text_area(
+        "Enter text to summarize",
+        height=200,
+        value=st.session_state.get("pdf_text", "") if "pdf_text" in st.session_state else ""
+    )
+
+    if st.button("Summarize"):
+        text_to_summarize = input_text.strip()
+        if text_to_summarize:
+            try:
+                # Display original text length
+                input_length = len(text_to_summarize.split())
+                st.write(f"Original text length: {input_length} words")
+
+                # Define the maximum number of tokens the model can handle
+                max_input_tokens = 1024  # BART's maximum input length
+
+                # Function to split text into chunks based on tokens (modified to avoid overlaps)
+                def split_text_into_chunks(text, tokenizer, max_tokens=max_input_tokens):
+                    sentences = nltk.sent_tokenize(text)
+                    chunks = []
+                    current_chunk = ''
+                    current_length = 0
+
+                    for sentence in sentences:
+                        sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
+                        sentence_length = len(sentence_tokens)
+
+                        # If adding the sentence exceeds max_tokens, start a new chunk
+                        if current_length + sentence_length > max_tokens:
+                            if current_chunk:
+                                chunks.append(current_chunk.strip())
+                            current_chunk = sentence
+                            current_length = sentence_length
+                        else:
+                            current_chunk += ' ' + sentence
+                            current_length += sentence_length
+
+                    if current_chunk:
+                        chunks.append(current_chunk.strip())
+
+                    return chunks
+
+                # Function to remove duplicate sentences
+                def remove_duplicate_sentences(text):
+                    sentences = nltk.sent_tokenize(text)
+                    unique_sentences = []
+                    seen_sentences = set()
+
+                    for sentence in sentences:
+                        normalized_sentence = sentence.strip().lower()
+                        if normalized_sentence not in seen_sentences:
+                            seen_sentences.add(normalized_sentence)
+                            unique_sentences.append(sentence)
+
+                    return ' '.join(unique_sentences)
+
+                # Split the text into manageable chunks
+                chunks = split_text_into_chunks(text_to_summarize, tokenizer)
+                st.write(f"Text has been split into {len(chunks)} chunks.")
+
+                # Summarize each chunk
+                summaries = []
+                for i, chunk in enumerate(chunks):
+                    st.write(f"Summarizing chunk {i+1}/{len(chunks)}...")
+                    # Adjust summary length parameters as needed
+                    chunk_length = len(chunk.split())
+                    max_summary_length = min(150, chunk_length // 2)
+                    min_summary_length = max(50, max_summary_length // 2)
+
+                    try:
+                        summary_output = summarizer(
+                            chunk,
+                            max_length=max_summary_length,
+                            min_length=min_summary_length,
+                            do_sample=False,
+                            truncation=True
+                        )
+                        chunk_summary = summary_output[0]['summary_text'].strip()
+
+                        if not chunk_summary:
+                            st.warning(f"The summary for chunk {i+1} is empty.")
+                        else:
+                            summaries.append(chunk_summary)
+                            # Optionally display the summary of the current chunk
+                            # st.write(f"Summary of chunk {i+1}:")
+                            # st.write(chunk_summary)
+                            # st.write("---")
+
+                    except Exception as e:
+                        st.error(f"Summarization failed for chunk {i+1}: {e}")
+                        st.text(traceback.format_exc())
+                        continue
+
+                if summaries:
+                    # Combine summaries
+                    combined_summary = ' '.join(summaries)
+                    # Remove duplicate sentences
+                    final_summary = remove_duplicate_sentences(combined_summary)
+                    st.write("Final Summary:")
+                    st.success(final_summary)
+                else:
+                    st.error("No summaries were generated.")
+
+            except Exception as e:
+                st.error("An error occurred during summarization.")
+                st.text(traceback.format_exc())
+        else:
+            st.error("Please provide text to summarize.")
requirements.txt ADDED
@@ -0,0 +1,11 @@
+streamlit==1.37.1
+spacy==3.8.4
+pandas==2.2.2
+numpy==1.26.4
+transformers==4.48.1
+tokenizers==0.21.0
+pdfplumber==0.11.5
+flax==0.8.3
+huggingface-hub==0.29.1
+torch
+nltk==3.8.1
space.yaml ADDED
@@ -0,0 +1,7 @@
+---
+title: FinBrief
+python_version: 3.8.19
+sdk: streamlit
+app_file: app.py
+pinned: false
+license: mit