Spaces:

kritsadaK
/

FinBrief

Running

App Files Community

kritsadaK committed on Feb 23, 2025

Commit

07804ad

verified ·

1 Parent(s): d2242ec

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -229

app.py CHANGED Viewed

@@ -1,234 +1,15 @@
-import os
 import streamlit as st
-import spacy
-import pandas as pd
-import re
-from transformers import pipeline
-# Ensure Streamlit is properly initialized
-st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")
-# Disable GPU if not needed
-os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-# Initialize session state variables **before using them**
-if "pdf_text" not in st.session_state:
-    st.session_state["pdf_text"] = ""  # Default to an empty string
-if "pdf_tables" not in st.session_state:
-    st.session_state["pdf_tables"] = []  # Default to an empty list
-if "nlp" not in st.session_state:
-    st.session_state["nlp"] = None
-if "summarizer" not in st.session_state:
-    st.session_state["summarizer"] = None
-# Load the spaCy model once at the start
-try:
-    st.session_state["nlp"] = spacy.load("en_core_web_sm")
-    st.write("spaCy model loaded successfully.")
-except OSError:
-    st.session_state["nlp"] = None
-    st.write("Failed to load spaCy model.")
-# Load the summarization model from Hugging Face Model Hub
-try:
-    online_model_path = "kritsadaK/bart-financial-summarization"
-    st.session_state["summarizer"] = pipeline(
-        "summarization",
-        model=online_model_path,
-        tokenizer=online_model_path
-    )
-    st.write("Online summarization model loaded successfully.")
-except Exception:
-    st.session_state["summarizer"] = None
-    st.write("Failed to load online summarization model.")
-# Now it's safe to access session state variables
-if st.session_state["pdf_text"]:
-    st.text_area("Extracted Text", st.session_state["pdf_text"], height=400)
-else:
-    st.warning("No text extracted yet. Upload a PDF to start.")
-# Define regex patterns to extract structured data
-patterns = {
-    "Fund Name": r"^(.*?) Fund",  # Extracts the name before "Fund"
-    "CUSIP": r"CUSIP\s+(\d+)",
-    "Inception Date": r"Inception Date\s+([\w\s\d]+)",
-    "Benchmark": r"Benchmark\s+([\w\s\d]+)",
-    "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
-    "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
-    "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
-    "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
-    "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
-    "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)"
-}
-# Set the title and layout
-st.title("FinBrief: Financial Document Insights")
-st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
-# Custom styling
-st.markdown(
-    """
-    <style>
-    .sidebar .sidebar-content {
-        background-color: #f7f7f7;
-        color: #333;
-    }
-    .css-1d391kg {
-        background-color: #f0f4f8;
-    }
-    .stButton>button {
-        background-color: #4CAF50;
-        color: white;
-        padding: 10px 20px;
-        border-radius: 5px;
-        font-size: 16px;
-    }
-    .stTextArea textarea {
-        border: 2px solid #4CAF50;
-        border-radius: 5px;
-        padding: 10px;
-    }
-    </style>
-    """,
-    unsafe_allow_html=True,
-)
-# Function to extract text and tables using pdfplumber
-def extract_text_tables_pdfplumber(pdf_file):
-    import io
-    import pdfplumber
-    print("\n🔹 PDFPlumber: Extracting text and tables...")
-    with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
-        all_text = ""
-        all_tables = []
-        for page in pdf.pages:
-            page_text = page.extract_text()
-            if page_text:
-                all_text += page_text + "\n"
-            # Extract tables
-            tables = page.extract_tables()
-            all_tables.extend(tables)  # Store all tables
-        if all_text.strip():
-            print(all_text[:1000])  # Print first 1000 characters for verification
-            return all_text, all_tables
-        else:
-            print("No text extracted. The PDF might be image-based.")
-            return None, None
-# Step 0: Upload PDF
-st.sidebar.header("Upload Your Financial Document")
-uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
-if uploaded_file is not None:
-    st.sidebar.write(f"You uploaded: {uploaded_file.name}")
-    # Extract text and tables
-    pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)
-    if pdf_text is not None:
-        # Store results in session state
-        st.session_state["pdf_text"] = pdf_text
-        st.session_state["pdf_tables"] = pdf_tables  # Save tables separately
-        st.sidebar.success("PDF uploaded and text extracted!")
-    else:
-        st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
-        st.error("No text extracted from the uploaded PDF.")
-# Step 1: Display Extracted Text
-st.subheader("Extracted Text")
-if st.session_state["pdf_text"]:
-    st.text_area("Document Text", st.session_state["pdf_text"], height=400)
-else:
-    st.warning("No text extracted yet. Upload a PDF to start.")
-# Step 2: Display Extracted Tables
-st.subheader("Extracted Tables")
-if st.session_state["pdf_tables"]:  # Check if tables exist
-    for idx, table in enumerate(st.session_state["pdf_tables"]):
-        st.write(f"Table {idx+1}")
-        st.write(pd.DataFrame(table))  # Display tables as DataFrames
-else:
-    st.info("No tables extracted.")
-# Retrieve variables from session state
-nlp = st.session_state["nlp"]
-summarizer = st.session_state["summarizer"]
-pdf_text = st.session_state["pdf_text"]
-pdf_tables = st.session_state["pdf_tables"]
-# Ensure that the models are loaded
-if nlp is None or summarizer is None:
-    st.error("Models are not properly loaded. Please check your model paths and installation.")
-else:
-    # Step 3: Named Entity Recognition (NER)
-    st.subheader("NER Analysis")
-    # Display full extracted text, not just first 1000 characters
-    example_text = st.text_area(
-        "Enter or paste text for analysis",
-        height=400,
-        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
-    )
-    if st.button("Analyze"):
-        # Ensure full extracted text is used for analysis
-        text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()
-        if text_for_analysis:
-            with st.spinner("Analyzing text..."):
-                # Extract structured financial data using regex (Now using full text)
-                extracted_data = {
-                    key: (match.group(1) if match else "N/A")
-                    for key, pattern in patterns.items()
-                    if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
-                }
-                # Use spaCy to extract additional financial terms (Now using full text)
-                doc = nlp(text_for_analysis)
-                financial_entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]]
-                # Store extracted data in a structured dictionary
-                structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}
-                # Display results
-                st.write("Entities Found:")
-                st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
-                st.write("Structured Data Extracted:")
-                st.write(pd.DataFrame([structured_data]))
-        else:
-            st.error("Please provide some text for analysis.")
-    # Step 4: Summarization
-    st.subheader("Summarization")
-    # Display full extracted text, not just first 1000 characters
-    input_text = st.text_area(
-        "Enter text to summarize",
-        height=400,
-        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else ""
-    )
-    if st.button("Summarize"):
-        # Ensure full extracted text is used for summarization
-        text_to_summarize = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else input_text.strip()
-        if text_to_summarize:
-            with st.spinner("Generating summary..."):
-                summary = summarizer(
-                    text_to_summarize,
-                    max_length=min(len(text_to_summarize.split()), 1024),
-                    min_length=100,
-                    do_sample=False
-                )
-                st.write("Summary:")
-                st.success(summary[0]["summary_text"])
-        else:
-            st.error("Please provide text to summarize.")

 import streamlit as st
+# Set the title of the app
+st.title("My Simple Streamlit App")
+# Add a text input
+user_input = st.text_input("Enter some text:")
+# Display user input
+if user_input:
+    st.write(f"You entered: {user_input}")
+# Add a button
+if st.button("Click Me!"):
+    st.write("Button clicked!")