AnilNiraula committed on
Commit
432b779
·
verified ·
1 Parent(s): a5ecbe2

Update finetuned_model.py

Browse files
Files changed (1) hide show
  1. finetuned_model.py +22 -5
finetuned_model.py CHANGED
@@ -3,9 +3,10 @@ import json
3
  from datasets import Dataset
4
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
5
  import torch
 
6
 
7
  # Step 1: Set Up Environment
8
- # Ensure libraries are installed: pip install transformers datasets torch accelerate pandas
9
 
10
  # Step 2: Load and Preprocess Dataset
11
  csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
@@ -33,7 +34,7 @@ for _, row in df.iterrows():
33
  real_return = row.get('Real Return', 0.0)
34
  pe10 = row.get('PE10', 0.0)
35
 
36
- # Question-answer pairs
37
  qa_pairs.append({
38
  "question": f"What was the S&P 500 return in {year}?",
39
  "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
@@ -67,6 +68,22 @@ for _, row in df.iterrows():
67
  "summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
68
  })
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Add general S&P 500 growth rate question
71
  qa_pairs.append({
72
  "question": "What is the S&P 500 index fund average growth rate?",
@@ -106,10 +123,10 @@ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
106
  training_args = TrainingArguments(
107
  output_dir="./finetuned_model",
108
  evaluation_strategy="epoch",
109
- learning_rate=1e-5, # Adjusted for better convergence
110
  per_device_train_batch_size=4,
111
  per_device_eval_batch_size=4,
112
- num_train_epochs=5, # Increased for better training
113
  weight_decay=0.01,
114
  logging_steps=10,
115
  save_strategy="epoch",
@@ -136,7 +153,7 @@ trainer.save_model("./finetuned_model")
136
  tokenizer.save_pretrained("./finetuned_model")
137
 
138
  # Test the model
139
- input_text = "What was the S&P 500 return in 2020?"
140
  inputs = tokenizer(input_text, return_tensors="pt")
141
  outputs = model.generate(**inputs, max_new_tokens=50)
142
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 
3
  from datasets import Dataset
4
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
5
  import torch
6
+ import numpy as np
7
 
8
  # Step 1: Set Up Environment
9
+ # Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy
10
 
11
  # Step 2: Load and Preprocess Dataset
12
  csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
 
34
  real_return = row.get('Real Return', 0.0)
35
  pe10 = row.get('PE10', 0.0)
36
 
37
+ # Year-specific questions
38
  qa_pairs.append({
39
  "question": f"What was the S&P 500 return in {year}?",
40
  "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
 
68
  "summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
69
  })
70
 
71
+ # Add period-specific questions
72
+ # Example periods: 2000–2010, 2011–2016, 2010–2020
73
+ periods = [(2000, 2010), (2011, 2016), (2010, 2020)]
74
+ for start_year, end_year in periods:
75
+ df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
76
+ avg_return = df_period['Return'].mean()
77
+ avg_real_return = df_period['Real Return'].mean()
78
+ qa_pairs.append({
79
+ "question": f"What is the average annual growth rate of the S&P 500 from {start_year} to {end_year}?",
80
+ "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
81
+ })
82
+ qa_pairs.append({
83
+ "question": f"What was the S&P 500’s real return from {start_year} to {end_year}?",
84
+ "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
85
+ })
86
+
87
  # Add general S&P 500 growth rate question
88
  qa_pairs.append({
89
  "question": "What is the S&P 500 index fund average growth rate?",
 
123
  training_args = TrainingArguments(
124
  output_dir="./finetuned_model",
125
  evaluation_strategy="epoch",
126
+ learning_rate=1e-5,
127
  per_device_train_batch_size=4,
128
  per_device_eval_batch_size=4,
129
+ num_train_epochs=5,
130
  weight_decay=0.01,
131
  logging_steps=10,
132
  save_strategy="epoch",
 
153
  tokenizer.save_pretrained("./finetuned_model")
154
 
155
  # Test the model
156
+ input_text = "What is the average annual growth rate of the S&P 500 from 2000 to 2010?"
157
  inputs = tokenizer(input_text, return_tensors="pt")
158
  outputs = model.generate(**inputs, max_new_tokens=50)
159
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))