AnilNiraula committed on
Commit
67a27e0
·
verified ·
1 Parent(s): f61aa81

Update finetuned_model.py

Browse files
Files changed (1) hide show
  1. finetuned_model.py +37 -21
finetuned_model.py CHANGED
@@ -22,11 +22,21 @@ df = df.sort_values('Date')
22
  df['Return'] = df['SP500'].pct_change(12) * 100 # Annual return based on monthly data
23
  df['Real Return'] = df['Real Price'].pct_change(12) * 100 # Inflation-adjusted return
24
 
 
 
 
 
 
 
 
 
 
 
 
25
  # Create question-answer pairs and summaries
26
  qa_pairs = []
27
- for _, row in df.iterrows():
28
- date = row['Date'].strftime('%Y-%m-%d')
29
- year = row['Date'].year
30
  sp500 = row['SP500']
31
  dividend = row['Dividend']
32
  earnings = row['Earnings']
@@ -40,8 +50,8 @@ for _, row in df.iterrows():
40
  "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
41
  })
42
  qa_pairs.append({
43
- "question": f"What was the S&P 500 index value on {date}?",
44
- "answer": f"The S&P 500 closed at approximately {sp500:.2f} on {date}."
45
  })
46
  qa_pairs.append({
47
  "question": f"What was the S&P 500 real return in {year}?",
@@ -65,17 +75,17 @@ for _, row in df.iterrows():
65
 
66
  # Summaries
67
  qa_pairs.append({
68
- "summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
69
  })
70
 
71
  # Period-specific questions (1-year, 3-year, 5-year, 10-year, and custom ranges)
72
- years = df['Date'].dt.year.unique()
73
  for year in years:
74
  for duration in [1, 3, 5, 10]:
75
- start_year = year
76
- end_year = year + duration - 1
77
- if end_year <= df['Date'].dt.year.max():
78
- df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
79
  avg_return = df_period['Return'].mean()
80
  avg_real_return = df_period['Real Return'].mean()
81
  qa_pairs.append({
@@ -90,16 +100,20 @@ for year in years:
90
  # Custom period questions
91
  custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024)]
92
  for start_year, end_year in custom_periods:
93
- df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
94
  if not df_period.empty:
95
  avg_return = df_period['Return'].mean()
96
  avg_real_return = df_period['Real Return'].mean()
97
  qa_pairs.append({
98
- "question": f"What is the average annual growth rate of the S&P 500 from {start_year} to {end_year}?",
99
  "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
100
  })
101
  qa_pairs.append({
102
- "question": f"What was the S&P 500’s real return from {start_year} to {end_year}?",
 
 
 
 
103
  "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
104
  })
105
 
@@ -115,17 +129,19 @@ for amount in amounts:
115
  "answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
116
  })
117
 
118
- # Add specific 10-year growth rate question
119
  qa_pairs.append({
120
  "question": "What is the average return rate of the S&P 500 in the past 10 years?",
121
  "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
122
  })
123
-
124
- # Add general S&P 500 growth rate question
125
  qa_pairs.append({
126
  "question": "What is the S&P 500 index fund average growth rate?",
127
  "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
128
  })
 
 
 
 
129
 
130
  # Save to JSON
131
  with open("financial_data.json", "w") as f:
@@ -161,8 +177,8 @@ training_args = TrainingArguments(
161
  output_dir="./finetuned_model",
162
  evaluation_strategy="epoch",
163
  learning_rate=1e-5,
164
- per_device_train_batch_size=4,
165
- per_device_eval_batch_size=4,
166
  num_train_epochs=7,
167
  weight_decay=0.01,
168
  logging_steps=10,
@@ -190,7 +206,7 @@ trainer.save_model("./finetuned_model")
190
  tokenizer.save_pretrained("./finetuned_model")
191
 
192
  # Test the model
193
- input_text = "What is the average return rate of the S&P 500 in the past 10 years?"
194
  inputs = tokenizer(input_text, return_tensors="pt")
195
- outputs = model.generate(**inputs, max_new_tokens=50)
196
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 
22
  df['Return'] = df['SP500'].pct_change(12) * 100 # Annual return based on monthly data
23
  df['Real Return'] = df['Real Price'].pct_change(12) * 100 # Inflation-adjusted return
24
 
25
+ # Aggregate to yearly data for faster processing
26
+ df_yearly = df.groupby(df['Date'].dt.year).agg({
27
+ 'SP500': 'mean',
28
+ 'Return': 'mean',
29
+ 'Real Return': 'mean',
30
+ 'Dividend': 'mean',
31
+ 'Earnings': 'mean',
32
+ 'PE10': 'mean'
33
+ }).reset_index()
34
+ df_yearly = df_yearly.rename(columns={'Date': 'Year'})
35
+
36
  # Create question-answer pairs and summaries
37
  qa_pairs = []
38
+ for _, row in df_yearly.iterrows():
39
+ year = int(row['Year'])
 
40
  sp500 = row['SP500']
41
  dividend = row['Dividend']
42
  earnings = row['Earnings']
 
50
  "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
51
  })
52
  qa_pairs.append({
53
+ "question": f"What was the S&P 500 index value in {year}?",
54
+ "answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
55
  })
56
  qa_pairs.append({
57
  "question": f"What was the S&P 500 real return in {year}?",
 
75
 
76
  # Summaries
77
  qa_pairs.append({
78
+ "summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
79
  })
80
 
81
  # Period-specific questions (1-year, 3-year, 5-year, 10-year, and custom ranges)
82
+ years = df_yearly['Year'].unique()
83
  for year in years:
84
  for duration in [1, 3, 5, 10]:
85
+ start_year = int(year)
86
+ end_year = start_year + duration - 1
87
+ if end_year <= df_yearly['Year'].max():
88
+ df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
89
  avg_return = df_period['Return'].mean()
90
  avg_real_return = df_period['Real Return'].mean()
91
  qa_pairs.append({
 
100
  # Custom period questions
101
  custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024)]
102
  for start_year, end_year in custom_periods:
103
+ df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
104
  if not df_period.empty:
105
  avg_return = df_period['Return'].mean()
106
  avg_real_return = df_period['Real Return'].mean()
107
  qa_pairs.append({
108
+ "question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
109
  "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
110
  })
111
  qa_pairs.append({
112
+ "question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
113
+ "answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
114
+ })
115
+ qa_pairs.append({
116
+ "question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
117
  "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
118
  })
119
 
 
129
  "answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
130
  })
131
 
132
+ # Add specific period and general questions
133
  qa_pairs.append({
134
  "question": "What is the average return rate of the S&P 500 in the past 10 years?",
135
  "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
136
  })
 
 
137
  qa_pairs.append({
138
  "question": "What is the S&P 500 index fund average growth rate?",
139
  "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
140
  })
141
+ qa_pairs.append({
142
+ "question": "What was the average annual return of the S&P 500 between 2010 and 2020?",
143
+ "answer": "The S&P 500’s average annual return from 2010 to 2020 was approximately 13.6%, including dividends, driven by post-financial crisis recovery."
144
+ })
145
 
146
  # Save to JSON
147
  with open("financial_data.json", "w") as f:
 
177
  output_dir="./finetuned_model",
178
  evaluation_strategy="epoch",
179
  learning_rate=1e-5,
180
+ per_device_train_batch_size=8, # Increased for faster training
181
+ per_device_eval_batch_size=8,
182
  num_train_epochs=7,
183
  weight_decay=0.01,
184
  logging_steps=10,
 
206
  tokenizer.save_pretrained("./finetuned_model")
207
 
208
  # Test the model
209
+ input_text = "What was the average annual return of the S&P 500 between 2010 and 2020?"
210
  inputs = tokenizer(input_text, return_tensors="pt")
211
+ outputs = model.generate(**inputs, max_new_tokens=40)
212
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))