Spaces:

AnilNiraula
/

FinChat

Running

App Files Files Community

AnilNiraula committed on Jul 8

Commit

0aaffc4

verified ·

1 Parent(s): 8782909

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -56

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-#Loading packages
 import logging
 import os
 import time
@@ -15,11 +14,11 @@ import difflib
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
-# Define device (force CPU for Spaces free tier)
 device = torch.device("cpu")
 logger.info(f"Using device: {device}")
-# Load dataset and precompute period data
 csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
 try:
     df = pd.read_csv(csv_path)
@@ -32,7 +31,7 @@ except Exception as e:
     logger.error(f"Error loading dataset: {e}")
     df = None
-# Precompute yearly aggregates for faster lookups
 if df is not None:
     df_yearly = df.groupby(df['Date'].dt.year).agg({
         'SP500': 'mean',
@@ -46,7 +45,7 @@ if df is not None:
 else:
     df_yearly = None
-# Response cache with financial data entries
 response_cache = {
     "hi": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
     "hello": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
@@ -164,38 +163,7 @@ response_cache = {
     ),
     "what is the s&p 500 index fund average growth rate?": (
         "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data. "
-        "For example, from 2015 to 2024, it averaged ~12.2% annually. Returns vary yearly due to market conditions. Consult a financial planner."
-    ),
-    "what was the s&p 500 return in 2020?": (
-        "The S&P 500 returned approximately 16.3% in 2020, including dividends, driven by recovery from the COVID-19 market crash."
-    ),
-    "what was the s&p 500 return in 2022?": (
-        "The S&P 500 returned approximately -18.1% in 2022, impacted by high inflation and interest rate hikes."
-    ),
-    "what is the average annual growth rate of the s&p 500 from 2000 to 2010?": (
-        "The S&P 500’s average annual growth rate from 2000 to 2010 was approximately 0.4%, including dividends, impacted by the dot-com crash and 2008 financial crisis."
-    ),
-    "what is the average annual growth rate of the s&p 500 from 2011 to 2016?": (
-        "The S&P 500’s average annual growth rate from 2011 to 2016 was approximately 12.7%, including dividends, driven by post-financial crisis recovery."
-    ),
-    "what was the average annual return of the s&p 500 between 2010 and 2020?": (
-        "The S&P 500’s average annual return from 2010 to 2020 was approximately 13.6%, including dividends, driven by post-financial crisis recovery."
-    ),
-    "what will my return be in 10 years if i invest $5000 into s&p 500 right now?": (
-        "Assuming a 10% average annual return, a $5,000 investment in the S&P 500 would grow to approximately $12,974 in 10 years with annual compounding. "
-        "This is based on the historical average return of 10–12% (1927–2025). Future returns vary and are not guaranteed. Consult a financial planner."
-    ),
-    "what was the 1-year average annual growth rate of the s&p 500 from 2020?": (
-        "The S&P 500 returned approximately 16.3% in 2020, including dividends, driven by recovery from the COVID-19 market crash."
-    ),
-    "what was the 3-year average annual growth rate of the s&p 500 from 2018?": (
-        "The S&P 500’s average annual growth rate from 2018 to 2020 was approximately 10.2%, including dividends, based on historical data."
-    ),
-    "what was the 5-year average annual growth rate of the s&p 500 from 2016?": (
-        "The S&P 500’s average annual growth rate from 2016 to 2020 was approximately 13.6%, including dividends, driven by strong market recovery."
-    ),
-    "what is the average return rate of the s&p 500 in the past 10 years?": (
-        "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
     )
 }
@@ -209,7 +177,7 @@ try:
 except Exception as e:
     logger.warning(f"Failed to load cache.json: {e}")
-# Load model and tokenizer (use fine-tuned model if available)
 model_name = "./finetuned_model" if os.path.exists("./finetuned_model") else "distilgpt2"
 try:
     logger.info(f"Loading tokenizer for {model_name}")
@@ -227,7 +195,7 @@ except Exception as e:
     logger.error(f"Error loading model/tokenizer: {e}")
     raise RuntimeError(f"Failed to load model: {str(e)}")
-# Pre-tokenize prompt prefix
 prompt_prefix = (
     "You are FinChat, a financial advisor with expertise in stock market performance. Provide detailed, numbered list advice with clear reasoning for investing prompts, "
     "including precise historical data when relevant (e.g., S&P 500 returns for specific years or periods). For investment return queries, use compound interest calculations "
@@ -252,7 +220,7 @@ prompt_prefix = (
 )
 prefix_tokens = tokenizer(prompt_prefix, return_tensors="pt", truncation=True, max_length=512).to(device)
-# Substring matching for cache with fuzzy matching
 def get_closest_cache_key(message, cache_keys):
     message = message.lower().strip()
     matches = difflib.get_close_matches(message, cache_keys, n=1, cutoff=0.8)
@@ -260,25 +228,32 @@ def get_closest_cache_key(message, cache_keys):
 # Parse period from user input
 def parse_period(query):
     # Match specific year ranges (e.g., "between 2010 and 2020", "2000–2008")
-    match = re.search(r'(?:between|from)\s*(\d{4})\s*(?:and|to|-|–)\s*(\d{4})', query, re.IGNORECASE)
     if match:
         start_year, end_year = map(int, match.groups())
-        return start_year, end_year, None
-    # Match duration-based queries (e.g., "1-year from 2020", "3-year growth rate")
-    match = re.search(r'(\d+)-year.*from\s*(\d{4})', query, re.IGNORECASE)
     if match:
         duration, start_year = map(int, match.groups())
         end_year = start_year + duration - 1
         return start_year, end_year, duration
-    # Match general duration queries (e.g., "past 10 years", "3-year growth rate")
-    match = re.search(r'past\s*(\d+)-year|\b(\d+)-year.*(?:return|growth\s*rate)', query, re.IGNORECASE)
     if match:
         duration = int(match.group(1) or match.group(2))
         max_year = df_yearly['Year'].max() if df_yearly is not None else 2025
         start_year = max_year - duration + 1
         end_year = max_year
         return start_year, end_year, duration
     return None, None, None
 # Calculate average growth rate
@@ -289,7 +264,11 @@ def calculate_growth_rate(start_year, end_year, duration=None):
     if df_period.empty:
         return None, f"No data available for {start_year} to {end_year}."
     avg_return = df_period['Return'].mean()
-    if duration:
         response = f"The S&P 500’s {duration}-year average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
     else:
         response = f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
@@ -308,7 +287,7 @@ def parse_investment_query(query):
 def calculate_future_value(amount, years):
     if df_yearly is None or amount is None or years is None:
         return None, "Data not available or invalid input."
-    avg_annual_return = 10.0  # Historical S&P 500 average (1927–2025)
     future_value = amount * (1 + avg_annual_return / 100) ** years
     return future_value, (
         f"Assuming a 10% average annual return, a ${amount:,.0f} investment in the S&P 500 would grow to approximately ${future_value:,.0f} "
@@ -316,7 +295,7 @@ def calculate_future_value(amount, years):
         "Future returns vary and are not guaranteed. Consult a financial planner."
     )
-# Define chat function
 def chat_with_model(user_input, history=None, is_processing=False):
     try:
         start_time = time.time()
@@ -369,7 +348,7 @@ def chat_with_model(user_input, history=None, is_processing=False):
                 logger.info(f"Response time: {end_time - start_time:.2f} seconds")
                 return response, history, False, ""
-        # Skip model for short prompts
         if len(user_input.strip()) <= 5:
             logger.info("Short prompt, returning default response")
             response = "Hello! I'm FinChat, your financial advisor. Ask about investing!"
@@ -381,7 +360,7 @@ def chat_with_model(user_input, history=None, is_processing=False):
             logger.info(f"Response time: {end_time - start_time:.2f} seconds")
             return response, history, False, ""
-        # Construct prompt
         full_prompt = prompt_prefix + user_input + "\nA:"
         try:
             inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
@@ -396,13 +375,12 @@ def chat_with_model(user_input, history=None, is_processing=False):
             logger.info(f"Response time: {end_time - start_time:.2f} seconds")
             return response, history, False, ""
-        # Generate response
         with torch.inference_mode():
             logger.info("Generating response with model")
             gen_start_time = time.time()
             outputs = model.generate(
                 **inputs,
-                max_new_tokens=40,  # Reduced for faster inference
                 min_length=20,
                 do_sample=False,
                 repetition_penalty=2.0,
@@ -438,7 +416,7 @@ def chat_with_model(user_input, history=None, is_processing=False):
         logger.info(f"Response time: {end_time - start_time:.2f} seconds")
         return response, history, False, ""
-# Save cache on exit
 def save_cache():
     try:
         with open(cache_file, 'w') as f:
@@ -447,7 +425,7 @@ def save_cache():
     except Exception as e:
         logger.warning(f"Failed to save cache.json: {e}")
-# Create Gradio interface with loading animation
 logger.info("Initializing Gradio interface")
 try:
     with gr.Blocks(
@@ -503,7 +481,7 @@ except Exception as e:
     logger.error(f"Error initializing Gradio interface: {e}")
     raise
-# Launch interface (conditional for Spaces)
 if __name__ == "__main__" and not os.getenv("HF_SPACE"):
     logger.info("Launching Gradio interface locally")
     try:

 import logging
 import os
 import time
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
+# Define device
 device = torch.device("cpu")
 logger.info(f"Using device: {device}")
+# Load dataset
 csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
 try:
     df = pd.read_csv(csv_path)
     logger.error(f"Error loading dataset: {e}")
     df = None
+# Precompute yearly aggregates
 if df is not None:
     df_yearly = df.groupby(df['Date'].dt.year).agg({
         'SP500': 'mean',
 else:
     df_yearly = None
+# Response cache
 response_cache = {
     "hi": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
     "hello": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
     ),
     "what is the s&p 500 index fund average growth rate?": (
         "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data. "
+        "Returns vary yearly due to market conditions. Consult a financial planner."
     )
 }
 except Exception as e:
     logger.warning(f"Failed to load cache.json: {e}")
+# Load model and tokenizer
 model_name = "./finetuned_model" if os.path.exists("./finetuned_model") else "distilgpt2"
 try:
     logger.info(f"Loading tokenizer for {model_name}")
     logger.error(f"Error loading model/tokenizer: {e}")
     raise RuntimeError(f"Failed to load model: {str(e)}")
+# Prompt prefix
 prompt_prefix = (
     "You are FinChat, a financial advisor with expertise in stock market performance. Provide detailed, numbered list advice with clear reasoning for investing prompts, "
     "including precise historical data when relevant (e.g., S&P 500 returns for specific years or periods). For investment return queries, use compound interest calculations "
 )
 prefix_tokens = tokenizer(prompt_prefix, return_tensors="pt", truncation=True, max_length=512).to(device)
+# Substring matching for cache
 def get_closest_cache_key(message, cache_keys):
     message = message.lower().strip()
     matches = difflib.get_close_matches(message, cache_keys, n=1, cutoff=0.8)
 # Parse period from user input
 def parse_period(query):
+    query = query.lower()
     # Match specific year ranges (e.g., "between 2010 and 2020", "2000–2008")
+    match = re.search(r'(?:between|from)\s*(\d{4})\s*(?:and|to|-|–)\s*(\d{4})', query)
     if match:
         start_year, end_year = map(int, match.groups())
+        if start_year <= end_year:
+            return start_year, end_year, None
+    # Match duration-based queries (e.g., "1-year from 2020", "3-year growth rate from 2018")
+    match = re.search(r'(\d+)-year.*from\s*(\d{4})', query)
     if match:
         duration, start_year = map(int, match.groups())
         end_year = start_year + duration - 1
         return start_year, end_year, duration
+    # Match general duration queries (e.g., "past 5 years", "10-year growth rate")
+    match = re.search(r'(?:past\s*(\d+)-year|\b(\d+)-year.*(?:return|growth\s*rate))', query)
     if match:
         duration = int(match.group(1) or match.group(2))
         max_year = df_yearly['Year'].max() if df_yearly is not None else 2025
         start_year = max_year - duration + 1
         end_year = max_year
         return start_year, end_year, duration
+    # Match single year (e.g., "return in 2020")
+    match = re.search(r'return\s*(?:in|for)\s*(\d{4})', query)
+    if match:
+        year = int(match.group(1))
+        return year, year, 1
     return None, None, None
 # Calculate average growth rate
     if df_period.empty:
         return None, f"No data available for {start_year} to {end_year}."
     avg_return = df_period['Return'].mean()
+    if np.isnan(avg_return):
+        return None, f"Insufficient data for {start_year} to {end_year}."
+    if duration == 1 and start_year == end_year:
+        response = f"The S&P 500 returned approximately {avg_return:.1f}% in {start_year}, including dividends."
+    elif duration:
         response = f"The S&P 500’s {duration}-year average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
     else:
         response = f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
 def calculate_future_value(amount, years):
     if df_yearly is None or amount is None or years is None:
         return None, "Data not available or invalid input."
+    avg_annual_return = 10.0
     future_value = amount * (1 + avg_annual_return / 100) ** years
     return future_value, (
         f"Assuming a 10% average annual return, a ${amount:,.0f} investment in the S&P 500 would grow to approximately ${future_value:,.0f} "
         "Future returns vary and are not guaranteed. Consult a financial planner."
     )
+# Chat function
 def chat_with_model(user_input, history=None, is_processing=False):
     try:
         start_time = time.time()
                 logger.info(f"Response time: {end_time - start_time:.2f} seconds")
                 return response, history, False, ""
+        # Handle short prompts
         if len(user_input.strip()) <= 5:
             logger.info("Short prompt, returning default response")
             response = "Hello! I'm FinChat, your financial advisor. Ask about investing!"
             logger.info(f"Response time: {end_time - start_time:.2f} seconds")
             return response, history, False, ""
+        # Construct and generate response
         full_prompt = prompt_prefix + user_input + "\nA:"
         try:
             inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
             logger.info(f"Response time: {end_time - start_time:.2f} seconds")
             return response, history, False, ""
         with torch.inference_mode():
             logger.info("Generating response with model")
             gen_start_time = time.time()
             outputs = model.generate(
                 **inputs,
+                max_new_tokens=40,
                 min_length=20,
                 do_sample=False,
                 repetition_penalty=2.0,
         logger.info(f"Response time: {end_time - start_time:.2f} seconds")
         return response, history, False, ""
+# Save cache
 def save_cache():
     try:
         with open(cache_file, 'w') as f:
     except Exception as e:
         logger.warning(f"Failed to save cache.json: {e}")
+# Gradio interface
 logger.info("Initializing Gradio interface")
 try:
     with gr.Blocks(
     logger.error(f"Error initializing Gradio interface: {e}")
     raise
+# Launch interface
 if __name__ == "__main__" and not os.getenv("HF_SPACE"):
     logger.info("Launching Gradio interface locally")
     try: