Spaces:

AnilNiraula
/

FinChat

Running

App Files Files Community

AnilNiraula commited on Jul 8

Commit

8ebe689

verified ·

1 Parent(s): 08d63f2

Update finetuned_model.py

Browse files

Files changed (1) hide show

finetuned_model.py +116 -13

finetuned_model.py CHANGED Viewed

@@ -5,7 +5,10 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
 import torch
 import numpy as np
-# Load and Preprocess Dataset
 csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
 try:
     df = pd.read_csv(csv_path)
@@ -13,12 +16,13 @@ except Exception as e:
     print(f"Error loading CSV: {e}")
     exit()
 df['Date'] = pd.to_datetime(df['Date'])
 df = df.sort_values('Date')
-df['Return'] = df['SP500'].pct_change(12) * 100
-df['Real Return'] = df['Real Price'].pct_change(12) * 100
-# Aggregate to yearly data
 df_yearly = df.groupby(df['Date'].dt.year).agg({
     'SP500': 'mean',
     'Return': 'mean',
@@ -29,7 +33,7 @@ df_yearly = df.groupby(df['Date'].dt.year).agg({
 }).reset_index()
 df_yearly = df_yearly.rename(columns={'Date': 'Year'})
-# Create question-answer pairs
 qa_pairs = []
 for _, row in df_yearly.iterrows():
     year = int(row['Year'])
@@ -40,21 +44,108 @@ for _, row in df_yearly.iterrows():
     real_return = row.get('Real Return', 0.0)
     pe10 = row.get('PE10', 0.0)
     qa_pairs.append({
         "question": f"What was the S&P 500 return in {year}?",
         "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
     })
-    # Add more pairs as needed...
 # Add specific period and general questions
 qa_pairs.append({
     "question": "What is the average return rate of the S&P 500 in the past 10 years?",
     "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
 })
 qa_pairs.append({
     "question": "What was the average annual return of the S&P 500 between 2020 and 2022?",
     "answer": "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market."
 })
 # Save to JSON
 with open("financial_data.json", "w") as f:
@@ -67,7 +158,7 @@ train_dataset = dataset["train"]
 val_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["train"]
 test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
-# Tokenize Data
 tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
 tokenizer.pad_token = tokenizer.eos_token
@@ -82,10 +173,10 @@ tokenized_train = train_dataset.map(tokenize_function, batched=True)
 tokenized_val = val_dataset.map(tokenize_function, batched=True)
 tokenized_test = test_dataset.map(tokenize_function, batched=True)
-# Load Pre-trained Model
 model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-# Set Up Fine-Tuning
 training_args = TrainingArguments(
     output_dir="./finetuned_model",
     evaluation_strategy="epoch",
@@ -107,13 +198,25 @@ trainer = Trainer(
     eval_dataset=tokenized_val,
 )
-# Fine-Tune the Model
 trainer.train()
-# Evaluate the Model
 eval_results = trainer.evaluate(tokenized_test)
 print("Evaluation results:", eval_results)
-# Save the Fine-Tuned Model
 trainer.save_model("./finetuned_model")
-tokenizer.save_pretrained("./finetuned_model")

 import torch
 import numpy as np
+# Step 1: Set Up Environment
+# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy
+# Step 2: Load and Preprocess Dataset
 csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
 try:
     df = pd.read_csv(csv_path)
     print(f"Error loading CSV: {e}")
     exit()
+# Preprocess: Calculate annual returns
 df['Date'] = pd.to_datetime(df['Date'])
 df = df.sort_values('Date')
+df['Return'] = df['SP500'].pct_change(12) * 100  # Annual return based on monthly data
+df['Real Return'] = df['Real Price'].pct_change(12) * 100  # Inflation-adjusted return
+# Aggregate to yearly data for faster processing
 df_yearly = df.groupby(df['Date'].dt.year).agg({
     'SP500': 'mean',
     'Return': 'mean',
 }).reset_index()
 df_yearly = df_yearly.rename(columns={'Date': 'Year'})
+# Create question-answer pairs and summaries
 qa_pairs = []
 for _, row in df_yearly.iterrows():
     year = int(row['Year'])
     real_return = row.get('Real Return', 0.0)
     pe10 = row.get('PE10', 0.0)
+    # Year-specific questions
     qa_pairs.append({
         "question": f"What was the S&P 500 return in {year}?",
         "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
     })
+    qa_pairs.append({
+        "question": f"What was the S&P 500 index value in {year}?",
+        "answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
+    })
+    qa_pairs.append({
+        "question": f"What was the S&P 500 real return in {year}?",
+        "answer": f"The S&P 500’s inflation-adjusted return was approximately {real_return:.1f}% in {year}."
+    })
+    if dividend > 0:
+        qa_pairs.append({
+            "question": f"What was the S&P 500 dividend in {year}?",
+            "answer": f"The S&P 500 dividend was approximately {dividend:.2f} in {year}."
+        })
+    if earnings > 0:
+        qa_pairs.append({
+            "question": f"What were the S&P 500 earnings in {year}?",
+            "answer": f"The S&P 500 earnings were approximately {earnings:.2f} in {year}."
+        })
+    if pe10 > 0:
+        qa_pairs.append({
+            "question": f"What was the S&P 500 PE10 ratio in {year}?",
+            "answer": f"The S&P 500 PE10 ratio was approximately {pe10:.2f} in {year}."
+        })
+    # Summaries
+    qa_pairs.append({
+        "summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
+    })
+# Period-specific questions (1-year, 3-year, 5-year, 10-year, and recent ranges)
+years = df_yearly['Year'].unique()
+for year in years:
+    for duration in [1, 3, 5, 10]:
+        start_year = int(year)
+        end_year = start_year + duration - 1
+        if end_year <= df_yearly['Year'].max():
+            df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
+            avg_return = df_period['Return'].mean()
+            avg_real_return = df_period['Real Return'].mean()
+            qa_pairs.append({
+                "question": f"What was the {duration}-year average annual growth rate of the S&P 500 from {start_year}?",
+                "answer": f"The S&P 500’s {duration}-year average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
+            })
+            qa_pairs.append({
+                "question": f"What was the {duration}-year real return of the S&P 500 from {start_year}?",
+                "answer": f"The S&P 500’s {duration}-year average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
+            })
+# Custom period questions, including recent periods
+custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024), (2020, 2022), (2020, 2024)]
+for start_year, end_year in custom_periods:
+    df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
+    if not df_period.empty:
+        avg_return = df_period['Return'].mean()
+        avg_real_return = df_period['Real Return'].mean()
+        qa_pairs.append({
+            "question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
+            "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
+        })
+        qa_pairs.append({
+            "question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
+            "answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
+        })
+        qa_pairs.append({
+            "question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
+            "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
+        })
+# Investment return questions
+amounts = [1000, 5000, 10000]
+durations = [1, 3, 5, 10, 20]
+avg_annual_return = 10.0  # Historical S&P 500 average (1927–2025)
+for amount in amounts:
+    for n in durations:
+        future_value = amount * (1 + avg_annual_return / 100) ** n
+        qa_pairs.append({
+            "question": f"What will ${amount} be worth in {n} years if invested in the S&P 500?",
+            "answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
+        })
 # Add specific period and general questions
 qa_pairs.append({
     "question": "What is the average return rate of the S&P 500 in the past 10 years?",
     "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
 })
+qa_pairs.append({
+    "question": "What is the S&P 500 index fund average growth rate?",
+    "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
+})
 qa_pairs.append({
     "question": "What was the average annual return of the S&P 500 between 2020 and 2022?",
     "answer": "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market."
 })
+qa_pairs.append({
+    "question": "What was the average annual return of the S&P 500 in the past 5 years?",
+    "answer": "The S&P 500’s average annual return from 2020 to 2024 was approximately 10.5%, including dividends, based on historical data."
+})
 # Save to JSON
 with open("financial_data.json", "w") as f:
 val_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["train"]
 test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
+# Step 3: Tokenize Data
 tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
 tokenizer.pad_token = tokenizer.eos_token
 tokenized_val = val_dataset.map(tokenize_function, batched=True)
 tokenized_test = test_dataset.map(tokenize_function, batched=True)
+# Step 4: Load Pre-trained Model
 model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+# Step 5: Set Up Fine-Tuning
 training_args = TrainingArguments(
     output_dir="./finetuned_model",
     evaluation_strategy="epoch",
     eval_dataset=tokenized_val,
 )
+# Step 6: Fine-Tune the Model
 trainer.train()
+# Step 7: Evaluate the Model
 eval_results = trainer.evaluate(tokenized_test)
 print("Evaluation results:", eval_results)
+# Step 8: Save the Fine-Tuned Model
 trainer.save_model("./finetuned_model")
+tokenizer.save_pretrained("./finetuned_model")
+# Step 9: Optimize with TorchScript
+model.eval()
+sample_input = tokenizer("What was the average annual return of the S&P 500 between 2020 and 2022?", return_tensors="pt")["input_ids"].to(device)
+traced_model = torch.jit.trace(model, sample_input)
+traced_model.save("./finetuned_model/distilgpt2_traced.pt")
+# Test the model
+input_text = "What was the average annual return of the S&P 500 between 2020 and 2022?"
+inputs = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)
+outputs = traced_model.generate(inputs, max_new_tokens=20, repetition_penalty=3.0, no_repeat_ngram_size=2)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))