Spaces:

AnilNiraula
/

FinChat

Running

App Files Community

AnilNiraula committed on Jul 7

Commit

432b779

verified ·

1 Parent(s): a5ecbe2

Update finetuned_model.py

Browse files

Files changed (1) hide show

finetuned_model.py +22 -5

finetuned_model.py CHANGED Viewed

@@ -3,9 +3,10 @@ import json
 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
 import torch
 # Step 1: Set Up Environment
-# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas
 # Step 2: Load and Preprocess Dataset
 csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
@@ -33,7 +34,7 @@ for _, row in df.iterrows():
     real_return = row.get('Real Return', 0.0)
     pe10 = row.get('PE10', 0.0)
-    # Question-answer pairs
     qa_pairs.append({
         "question": f"What was the S&P 500 return in {year}?",
         "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
@@ -67,6 +68,22 @@ for _, row in df.iterrows():
         "summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
     })
 # Add general S&P 500 growth rate question
 qa_pairs.append({
     "question": "What is the S&P 500 index fund average growth rate?",
@@ -106,10 +123,10 @@ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
 training_args = TrainingArguments(
     output_dir="./finetuned_model",
     evaluation_strategy="epoch",
-    learning_rate=1e-5,  # Adjusted for better convergence
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
-    num_train_epochs=5,  # Increased for better training
     weight_decay=0.01,
     logging_steps=10,
     save_strategy="epoch",
@@ -136,7 +153,7 @@ trainer.save_model("./finetuned_model")
 tokenizer.save_pretrained("./finetuned_model")
 # Test the model
-input_text = "What was the S&P 500 return in 2020?"
 inputs = tokenizer(input_text, return_tensors="pt")
 outputs = model.generate(**inputs, max_new_tokens=50)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))

 from datasets import Dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
 import torch
+import numpy as np
 # Step 1: Set Up Environment
+# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy
 # Step 2: Load and Preprocess Dataset
 csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
     real_return = row.get('Real Return', 0.0)
     pe10 = row.get('PE10', 0.0)
+    # Year-specific questions
     qa_pairs.append({
         "question": f"What was the S&P 500 return in {year}?",
         "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
         "summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
     })
+# Add period-specific questions
+# Example periods: 2000–2010, 2011–2016, 2010–2020
+periods = [(2000, 2010), (2011, 2016), (2010, 2020)]
+for start_year, end_year in periods:
+    df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
+    avg_return = df_period['Return'].mean()
+    avg_real_return = df_period['Real Return'].mean()
+    qa_pairs.append({
+        "question": f"What is the average annual growth rate of the S&P 500 from {start_year} to {end_year}?",
+        "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
+    })
+    qa_pairs.append({
+        "question": f"What was the S&P 500’s real return from {start_year} to {end_year}?",
+        "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
+    })
 # Add general S&P 500 growth rate question
 qa_pairs.append({
     "question": "What is the S&P 500 index fund average growth rate?",
 training_args = TrainingArguments(
     output_dir="./finetuned_model",
     evaluation_strategy="epoch",
+    learning_rate=1e-5,
     per_device_train_batch_size=4,
     per_device_eval_batch_size=4,
+    num_train_epochs=5,
     weight_decay=0.01,
     logging_steps=10,
     save_strategy="epoch",
 tokenizer.save_pretrained("./finetuned_model")
 # Test the model
+input_text = "What is the average annual growth rate of the S&P 500 from 2000 to 2010?"
 inputs = tokenizer(input_text, return_tensors="pt")
 outputs = model.generate(**inputs, max_new_tokens=50)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))