Spaces:
Running
Running
Update finetuned_model.py
Browse files- finetuned_model.py +37 -21
finetuned_model.py
CHANGED
@@ -22,11 +22,21 @@ df = df.sort_values('Date')
|
|
22 |
df['Return'] = df['SP500'].pct_change(12) * 100 # Annual return based on monthly data
|
23 |
df['Real Return'] = df['Real Price'].pct_change(12) * 100 # Inflation-adjusted return
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
# Create question-answer pairs and summaries
|
26 |
qa_pairs = []
|
27 |
-
for _, row in
|
28 |
-
|
29 |
-
year = row['Date'].year
|
30 |
sp500 = row['SP500']
|
31 |
dividend = row['Dividend']
|
32 |
earnings = row['Earnings']
|
@@ -40,8 +50,8 @@ for _, row in df.iterrows():
|
|
40 |
"answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
|
41 |
})
|
42 |
qa_pairs.append({
|
43 |
-
"question": f"What was the S&P 500 index value
|
44 |
-
"answer": f"The S&P 500
|
45 |
})
|
46 |
qa_pairs.append({
|
47 |
"question": f"What was the S&P 500 real return in {year}?",
|
@@ -65,17 +75,17 @@ for _, row in df.iterrows():
|
|
65 |
|
66 |
# Summaries
|
67 |
qa_pairs.append({
|
68 |
-
"summary": f"
|
69 |
})
|
70 |
|
71 |
# Period-specific questions (1-year, 3-year, 5-year, 10-year, and custom ranges)
|
72 |
-
years =
|
73 |
for year in years:
|
74 |
for duration in [1, 3, 5, 10]:
|
75 |
-
start_year = year
|
76 |
-
end_year =
|
77 |
-
if end_year <=
|
78 |
-
df_period =
|
79 |
avg_return = df_period['Return'].mean()
|
80 |
avg_real_return = df_period['Real Return'].mean()
|
81 |
qa_pairs.append({
|
@@ -90,16 +100,20 @@ for year in years:
|
|
90 |
# Custom period questions
|
91 |
custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024)]
|
92 |
for start_year, end_year in custom_periods:
|
93 |
-
df_period =
|
94 |
if not df_period.empty:
|
95 |
avg_return = df_period['Return'].mean()
|
96 |
avg_real_return = df_period['Real Return'].mean()
|
97 |
qa_pairs.append({
|
98 |
-
"question": f"What
|
99 |
"answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
|
100 |
})
|
101 |
qa_pairs.append({
|
102 |
-
"question": f"What was the S&P 500
|
|
|
|
|
|
|
|
|
103 |
"answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
|
104 |
})
|
105 |
|
@@ -115,17 +129,19 @@ for amount in amounts:
|
|
115 |
"answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
|
116 |
})
|
117 |
|
118 |
-
# Add specific
|
119 |
qa_pairs.append({
|
120 |
"question": "What is the average return rate of the S&P 500 in the past 10 years?",
|
121 |
"answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
|
122 |
})
|
123 |
-
|
124 |
-
# Add general S&P 500 growth rate question
|
125 |
qa_pairs.append({
|
126 |
"question": "What is the S&P 500 index fund average growth rate?",
|
127 |
"answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
|
128 |
})
|
|
|
|
|
|
|
|
|
129 |
|
130 |
# Save to JSON
|
131 |
with open("financial_data.json", "w") as f:
|
@@ -161,8 +177,8 @@ training_args = TrainingArguments(
|
|
161 |
output_dir="./finetuned_model",
|
162 |
evaluation_strategy="epoch",
|
163 |
learning_rate=1e-5,
|
164 |
-
per_device_train_batch_size=
|
165 |
-
per_device_eval_batch_size=
|
166 |
num_train_epochs=7,
|
167 |
weight_decay=0.01,
|
168 |
logging_steps=10,
|
@@ -190,7 +206,7 @@ trainer.save_model("./finetuned_model")
|
|
190 |
tokenizer.save_pretrained("./finetuned_model")
|
191 |
|
192 |
# Test the model
|
193 |
-
input_text = "What
|
194 |
inputs = tokenizer(input_text, return_tensors="pt")
|
195 |
-
outputs = model.generate(**inputs, max_new_tokens=
|
196 |
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
|
|
22 |
df['Return'] = df['SP500'].pct_change(12) * 100 # Annual return based on monthly data
|
23 |
df['Real Return'] = df['Real Price'].pct_change(12) * 100 # Inflation-adjusted return
|
24 |
|
25 |
+
# Aggregate to yearly data for faster processing
|
26 |
+
df_yearly = df.groupby(df['Date'].dt.year).agg({
|
27 |
+
'SP500': 'mean',
|
28 |
+
'Return': 'mean',
|
29 |
+
'Real Return': 'mean',
|
30 |
+
'Dividend': 'mean',
|
31 |
+
'Earnings': 'mean',
|
32 |
+
'PE10': 'mean'
|
33 |
+
}).reset_index()
|
34 |
+
df_yearly = df_yearly.rename(columns={'Date': 'Year'})
|
35 |
+
|
36 |
# Create question-answer pairs and summaries
|
37 |
qa_pairs = []
|
38 |
+
for _, row in df_yearly.iterrows():
|
39 |
+
year = int(row['Year'])
|
|
|
40 |
sp500 = row['SP500']
|
41 |
dividend = row['Dividend']
|
42 |
earnings = row['Earnings']
|
|
|
50 |
"answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
|
51 |
})
|
52 |
qa_pairs.append({
|
53 |
+
"question": f"What was the S&P 500 index value in {year}?",
|
54 |
+
"answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
|
55 |
})
|
56 |
qa_pairs.append({
|
57 |
"question": f"What was the S&P 500 real return in {year}?",
|
|
|
75 |
|
76 |
# Summaries
|
77 |
qa_pairs.append({
|
78 |
+
"summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
|
79 |
})
|
80 |
|
81 |
# Period-specific questions (1-year, 3-year, 5-year, 10-year, and custom ranges)
|
82 |
+
years = df_yearly['Year'].unique()
|
83 |
for year in years:
|
84 |
for duration in [1, 3, 5, 10]:
|
85 |
+
start_year = int(year)
|
86 |
+
end_year = start_year + duration - 1
|
87 |
+
if end_year <= df_yearly['Year'].max():
|
88 |
+
df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
|
89 |
avg_return = df_period['Return'].mean()
|
90 |
avg_real_return = df_period['Real Return'].mean()
|
91 |
qa_pairs.append({
|
|
|
100 |
# Custom period questions
|
101 |
custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024)]
|
102 |
for start_year, end_year in custom_periods:
|
103 |
+
df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
|
104 |
if not df_period.empty:
|
105 |
avg_return = df_period['Return'].mean()
|
106 |
avg_real_return = df_period['Real Return'].mean()
|
107 |
qa_pairs.append({
|
108 |
+
"question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
|
109 |
"answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
|
110 |
})
|
111 |
qa_pairs.append({
|
112 |
+
"question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
|
113 |
+
"answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
|
114 |
+
})
|
115 |
+
qa_pairs.append({
|
116 |
+
"question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
|
117 |
"answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
|
118 |
})
|
119 |
|
|
|
129 |
"answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
|
130 |
})
|
131 |
|
132 |
+
# Add specific period and general questions
|
133 |
qa_pairs.append({
|
134 |
"question": "What is the average return rate of the S&P 500 in the past 10 years?",
|
135 |
"answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
|
136 |
})
|
|
|
|
|
137 |
qa_pairs.append({
|
138 |
"question": "What is the S&P 500 index fund average growth rate?",
|
139 |
"answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
|
140 |
})
|
141 |
+
qa_pairs.append({
|
142 |
+
"question": "What was the average annual return of the S&P 500 between 2010 and 2020?",
|
143 |
+
"answer": "The S&P 500’s average annual return from 2010 to 2020 was approximately 13.6%, including dividends, driven by post-financial crisis recovery."
|
144 |
+
})
|
145 |
|
146 |
# Save to JSON
|
147 |
with open("financial_data.json", "w") as f:
|
|
|
177 |
output_dir="./finetuned_model",
|
178 |
evaluation_strategy="epoch",
|
179 |
learning_rate=1e-5,
|
180 |
+
per_device_train_batch_size=8, # Increased for faster training
|
181 |
+
per_device_eval_batch_size=8,
|
182 |
num_train_epochs=7,
|
183 |
weight_decay=0.01,
|
184 |
logging_steps=10,
|
|
|
206 |
tokenizer.save_pretrained("./finetuned_model")
|
207 |
|
208 |
# Test the model
|
209 |
+
input_text = "What was the average annual return of the S&P 500 between 2010 and 2020?"
|
210 |
inputs = tokenizer(input_text, return_tensors="pt")
|
211 |
+
outputs = model.generate(**inputs, max_new_tokens=40)
|
212 |
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|