import sys

import streamlit as st

# Set page configuration first — it must be the first Streamlit call in the script.
st.set_page_config(page_title="FinBrief: Financial Document Insights", layout="wide")

import re
import subprocess

import pandas as pd
import spacy
from transformers import pipeline

# ---------------------------------------------------------------------------
# Model loading (runs once per script execution; cached in session state below)
# ---------------------------------------------------------------------------

# Load the spaCy model, downloading it on first run if it is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
    st.write("spaCy model loaded successfully!")
except OSError:
    st.write("Failed to load spaCy model.")
    # Use the current interpreter (sys.executable) so the model is installed
    # into the active environment, not whatever "python" happens to be on PATH.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

# Load the summarization model from the Hugging Face Model Hub.
try:
    online_model_path = "kritsadaK/bart-financial-summarization"
    summarizer = pipeline("summarization", model=online_model_path, tokenizer=online_model_path)
    st.write("Online summarization model loaded successfully!")
except Exception:
    summarizer = None  # Handle case where model is missing
    st.write("Failed to load online summarization model.")

# Initialize models in session state if not already loaded.
if "nlp" not in st.session_state:
    st.session_state["nlp"] = nlp
if "summarizer" not in st.session_state:
    st.session_state["summarizer"] = summarizer

# # Load the summarization model locally
# try:
#     local_model_path = "./local_models/bart-financial"
#     summarizer = pipeline("summarization", model=local_model_path, tokenizer=local_model_path)
#     st.write("Local summarization model loaded successfully!")
# except Exception:
#     summarizer = None  # Handle case where model is missing
#     st.write("Failed to load local summarization model.")

# Regex patterns used to extract structured fields from the raw document text.
# NOTE: the "Fund Name" pattern deliberately contains a space + newline before
# "Fund" — it matches a fund name that ends a line, with "Fund" on the next.
patterns = {
    "Fund Name": "^(.*?) \nFund",  # Extracts the name before "Fund"
    "CUSIP": r"CUSIP\s+(\d+)",
    "Inception Date": r"Inception Date\s+([\w\s\d]+)",
    "Benchmark": r"Benchmark\s+([\w\s\d]+)",
    "Expense Ratio": r"Expense Information.*?(\d+\.\d+%)",
    "Total Assets": r"Total Assets\s+USD\s+([\d,]+)",
    "Portfolio Turnover": r"Portfolio Holdings Turnover.*?(\d+\.\d+%)",
    "Cash Allocation": r"% of Portfolio in Cash\s+(\d+\.\d+%)",
    "Alpha": r"Alpha\s+(-?\d+\.\d+%)",
    "Standard Deviation": r"Standard Deviation\s+(\d+\.\d+%)",
}

# ---------------------------------------------------------------------------
# Page chrome
# ---------------------------------------------------------------------------
st.title("FinBrief: Financial Document Insights")
st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")

# Custom styling (this remains unchanged)
st.markdown(
    """ """,
    unsafe_allow_html=True,
)


def extract_text_tables_pdfplumber(pdf_file):
    """Extract raw text and tables from an uploaded PDF using pdfplumber.

    Args:
        pdf_file: A binary file-like object (e.g. a Streamlit ``UploadedFile``)
            positioned at the start of a PDF document.

    Returns:
        A ``(text, tables)`` pair where ``text`` is the concatenated page text
        and ``tables`` is a list of row-lists, or ``(None, None)`` when no text
        could be extracted (e.g. an image-only / scanned PDF).
    """
    # Imported lazily so the heavy dependency is only loaded when a file is
    # actually processed.
    import io
    import pdfplumber

    print("\n🔹 PDFPlumber: Extracting text and tables...")
    with pdfplumber.open(io.BytesIO(pdf_file.read())) as pdf:
        all_text = ""
        all_tables = []
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                all_text += page_text + "\n"
            # Extract tables
            tables = page.extract_tables()
            all_tables.extend(tables)  # Store all tables
        if all_text.strip():
            print(all_text[:1000])  # Print first 1000 characters for verification
            return all_text, all_tables
        print("No text extracted. \nThe PDF might be image-based.")
        return None, None


# Ensure session state is initialized.
if "pdf_text" not in st.session_state:
    st.session_state["pdf_text"] = ""
if "pdf_tables" not in st.session_state:
    st.session_state["pdf_tables"] = []  # Initialize as an empty list

# Step 0: Upload PDF
st.sidebar.header("Upload Your Financial Document")
uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    st.sidebar.write(f"You uploaded: {uploaded_file.name}")

    # Extract text and tables
    pdf_text, pdf_tables = extract_text_tables_pdfplumber(uploaded_file)

    if pdf_text is not None:
        # Store results in session state so they survive Streamlit reruns.
        st.session_state["pdf_text"] = pdf_text
        st.session_state["pdf_tables"] = pdf_tables  # Save tables separately
        st.sidebar.success("PDF uploaded and text extracted!")
    else:
        st.markdown("[Example Financial Documents](https://drive.google.com/drive/folders/1jMu3S7S_Hc_RgK6_cvsCqIB8x3SSS-R6)")
        st.error("No text extracted from the uploaded PDF.")

# Step 1: Display Extracted Text
st.subheader("Extracted Text")
if st.session_state["pdf_text"]:
    st.text_area("Document Text", st.session_state["pdf_text"], height=400)
else:
    st.warning("No text extracted yet. Upload a PDF to start.")

# Step 2: Display Extracted Tables
st.subheader("Extracted Tables")
if st.session_state["pdf_tables"]:  # Check if tables exist
    for idx, table in enumerate(st.session_state["pdf_tables"]):
        st.write(f"Table {idx+1}")
        st.write(pd.DataFrame(table))  # Display tables as DataFrames
else:
    st.info("No tables extracted.")

# Retrieve variables from session state.
nlp = st.session_state["nlp"]
summarizer = st.session_state["summarizer"]
pdf_text = st.session_state["pdf_text"]
pdf_tables = st.session_state["pdf_tables"]

# Ensure that the models are loaded before offering analysis features.
if nlp is None or summarizer is None:
    st.error("Models are not properly loaded. \nPlease check your model paths and installation.")
else:
    # Step 3: Named Entity Recognition (NER)
    st.subheader("NER Analysis")

    # Display full extracted text, not just first 1000 characters.
    example_text = st.text_area(
        "Enter or paste text for analysis",
        height=400,
        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else "",
    )

    if st.button("Analyze"):
        # Prefer the full extracted PDF text; fall back to the pasted text.
        text_for_analysis = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else example_text.strip()

        if text_for_analysis:
            with st.spinner("Analyzing text..."):
                # Extract structured financial data using the regex table above.
                extracted_data = {
                    key: (match.group(1) if match else "N/A")
                    for key, pattern in patterns.items()
                    if (match := re.search(pattern, text_for_analysis, re.IGNORECASE))
                }

                # Use spaCy to pick out additional finance-relevant entities.
                doc = nlp(text_for_analysis)
                financial_entities = [
                    (ent.text, ent.label_)
                    for ent in doc.ents
                    if ent.label_ in ["MONEY", "PERCENT", "ORG", "DATE"]
                ]

                # Combine regex fields and NER hits into one structured record.
                structured_data = {**extracted_data, "Named Entities Extracted": financial_entities}

                # Display results
                st.write("Entities Found:")
                st.write(pd.DataFrame(financial_entities, columns=["Entity", "Label"]))
                st.write("Structured Data Extracted:")
                st.write(pd.DataFrame([structured_data]))
        else:
            st.error("Please provide some text for analysis.")

    # Step 4: Summarization
    st.subheader("Summarization")

    # Display full extracted text, not just first 1000 characters.
    input_text = st.text_area(
        "Enter text to summarize",
        height=400,
        value=st.session_state["pdf_text"] if st.session_state["pdf_text"] else "",
    )

    if st.button("Summarize"):
        # Prefer the full extracted PDF text; fall back to the pasted text.
        text_to_summarize = st.session_state["pdf_text"].strip() if st.session_state["pdf_text"] else input_text.strip()

        if text_to_summarize:
            with st.spinner("Generating summary..."):
                # max_length/min_length are counted in tokens; the word count is
                # only a rough proxy — TODO confirm against the tokenizer.
                max_len = min(len(text_to_summarize.split()), 1024)
                # Bug fix: min_length must never exceed max_length, otherwise
                # the pipeline raises on short inputs (< 100 words).
                min_len = min(100, max_len)
                summary = summarizer(
                    text_to_summarize,
                    max_length=max_len,
                    min_length=min_len,
                    do_sample=False,
                    # Bug fix: BART accepts at most 1024 input tokens; without
                    # truncation long documents crash instead of summarizing.
                    truncation=True,
                )
            st.write("Summary:")
            st.success(summary[0]["summary_text"])
        else:
            st.error("Please provide text to summarize.")