Spaces:
Running
Running
Update finetuned_model.py
Browse files- finetuned_model.py +116 -13
finetuned_model.py
CHANGED
@@ -5,7 +5,10 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
|
|
5 |
import torch
|
6 |
import numpy as np
|
7 |
|
8 |
-
#
|
|
|
|
|
|
|
9 |
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
|
10 |
try:
|
11 |
df = pd.read_csv(csv_path)
|
@@ -13,12 +16,13 @@ except Exception as e:
|
|
13 |
print(f"Error loading CSV: {e}")
|
14 |
exit()
|
15 |
|
|
|
16 |
df['Date'] = pd.to_datetime(df['Date'])
|
17 |
df = df.sort_values('Date')
|
18 |
-
df['Return'] = df['SP500'].pct_change(12) * 100
|
19 |
-
df['Real Return'] = df['Real Price'].pct_change(12) * 100
|
20 |
|
21 |
-
# Aggregate to yearly data
|
22 |
df_yearly = df.groupby(df['Date'].dt.year).agg({
|
23 |
'SP500': 'mean',
|
24 |
'Return': 'mean',
|
@@ -29,7 +33,7 @@ df_yearly = df.groupby(df['Date'].dt.year).agg({
|
|
29 |
}).reset_index()
|
30 |
df_yearly = df_yearly.rename(columns={'Date': 'Year'})
|
31 |
|
32 |
-
# Create question-answer pairs
|
33 |
qa_pairs = []
|
34 |
for _, row in df_yearly.iterrows():
|
35 |
year = int(row['Year'])
|
@@ -40,21 +44,108 @@ for _, row in df_yearly.iterrows():
|
|
40 |
real_return = row.get('Real Return', 0.0)
|
41 |
pe10 = row.get('PE10', 0.0)
|
42 |
|
|
|
43 |
qa_pairs.append({
|
44 |
"question": f"What was the S&P 500 return in {year}?",
|
45 |
"answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
|
46 |
})
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
# Add specific period and general questions
|
50 |
qa_pairs.append({
|
51 |
"question": "What is the average return rate of the S&P 500 in the past 10 years?",
|
52 |
"answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
|
53 |
})
|
|
|
|
|
|
|
|
|
54 |
qa_pairs.append({
|
55 |
"question": "What was the average annual return of the S&P 500 between 2020 and 2022?",
|
56 |
"answer": "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market."
|
57 |
})
|
|
|
|
|
|
|
|
|
58 |
|
59 |
# Save to JSON
|
60 |
with open("financial_data.json", "w") as f:
|
@@ -67,7 +158,7 @@ train_dataset = dataset["train"]
|
|
67 |
val_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["train"]
|
68 |
test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
|
69 |
|
70 |
-
# Tokenize Data
|
71 |
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
|
72 |
tokenizer.pad_token = tokenizer.eos_token
|
73 |
|
@@ -82,10 +173,10 @@ tokenized_train = train_dataset.map(tokenize_function, batched=True)
|
|
82 |
tokenized_val = val_dataset.map(tokenize_function, batched=True)
|
83 |
tokenized_test = test_dataset.map(tokenize_function, batched=True)
|
84 |
|
85 |
-
# Load Pre-trained Model
|
86 |
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
|
87 |
|
88 |
-
# Set Up Fine-Tuning
|
89 |
training_args = TrainingArguments(
|
90 |
output_dir="./finetuned_model",
|
91 |
evaluation_strategy="epoch",
|
@@ -107,13 +198,25 @@ trainer = Trainer(
|
|
107 |
eval_dataset=tokenized_val,
|
108 |
)
|
109 |
|
110 |
-
# Fine-Tune the Model
|
111 |
trainer.train()
|
112 |
|
113 |
-
# Evaluate the Model
|
114 |
eval_results = trainer.evaluate(tokenized_test)
|
115 |
print("Evaluation results:", eval_results)
|
116 |
|
117 |
-
# Save the Fine-Tuned Model
|
118 |
trainer.save_model("./finetuned_model")
|
119 |
-
tokenizer.save_pretrained("./finetuned_model")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
import torch
|
6 |
import numpy as np
|
7 |
|
8 |
+
# Step 1: Set Up Environment
|
9 |
+
# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy
|
10 |
+
|
11 |
+
# Step 2: Load and Preprocess Dataset
|
12 |
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
|
13 |
try:
|
14 |
df = pd.read_csv(csv_path)
|
|
|
16 |
print(f"Error loading CSV: {e}")
|
17 |
exit()
|
18 |
|
19 |
+
# Preprocess: Calculate annual returns
|
20 |
df['Date'] = pd.to_datetime(df['Date'])
|
21 |
df = df.sort_values('Date')
|
22 |
+
df['Return'] = df['SP500'].pct_change(12) * 100 # Annual return based on monthly data
|
23 |
+
df['Real Return'] = df['Real Price'].pct_change(12) * 100 # Inflation-adjusted return
|
24 |
|
25 |
+
# Aggregate to yearly data for faster processing
|
26 |
df_yearly = df.groupby(df['Date'].dt.year).agg({
|
27 |
'SP500': 'mean',
|
28 |
'Return': 'mean',
|
|
|
33 |
}).reset_index()
|
34 |
df_yearly = df_yearly.rename(columns={'Date': 'Year'})
|
35 |
|
36 |
+
# Create question-answer pairs and summaries
|
37 |
qa_pairs = []
|
38 |
for _, row in df_yearly.iterrows():
|
39 |
year = int(row['Year'])
|
|
|
44 |
real_return = row.get('Real Return', 0.0)
|
45 |
pe10 = row.get('PE10', 0.0)
|
46 |
|
47 |
+
# Year-specific questions
|
48 |
qa_pairs.append({
|
49 |
"question": f"What was the S&P 500 return in {year}?",
|
50 |
"answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
|
51 |
})
|
52 |
+
qa_pairs.append({
|
53 |
+
"question": f"What was the S&P 500 index value in {year}?",
|
54 |
+
"answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
|
55 |
+
})
|
56 |
+
qa_pairs.append({
|
57 |
+
"question": f"What was the S&P 500 real return in {year}?",
|
58 |
+
"answer": f"The S&P 500’s inflation-adjusted return was approximately {real_return:.1f}% in {year}."
|
59 |
+
})
|
60 |
+
if dividend > 0:
|
61 |
+
qa_pairs.append({
|
62 |
+
"question": f"What was the S&P 500 dividend in {year}?",
|
63 |
+
"answer": f"The S&P 500 dividend was approximately {dividend:.2f} in {year}."
|
64 |
+
})
|
65 |
+
if earnings > 0:
|
66 |
+
qa_pairs.append({
|
67 |
+
"question": f"What were the S&P 500 earnings in {year}?",
|
68 |
+
"answer": f"The S&P 500 earnings were approximately {earnings:.2f} in {year}."
|
69 |
+
})
|
70 |
+
if pe10 > 0:
|
71 |
+
qa_pairs.append({
|
72 |
+
"question": f"What was the S&P 500 PE10 ratio in {year}?",
|
73 |
+
"answer": f"The S&P 500 PE10 ratio was approximately {pe10:.2f} in {year}."
|
74 |
+
})
|
75 |
+
|
76 |
+
# Summaries
|
77 |
+
qa_pairs.append({
|
78 |
+
"summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
|
79 |
+
})
|
80 |
+
|
81 |
+
# Period-specific questions: for every start year present in df_yearly, emit
# Q&A pairs for 1-, 3-, 5- and 10-year windows that end within the data.
years = df_yearly['Year'].unique()
last_year = df_yearly['Year'].max()  # loop-invariant, so look it up once
for year in years:
    start_year = int(year)
    for duration in [1, 3, 5, 10]:
        end_year = start_year + duration - 1
        if end_year > last_year:
            # Durations are ascending, so every later window overshoots too.
            break
        in_window = (df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)
        df_period = df_yearly[in_window]
        avg_return = df_period['Return'].mean()
        avg_real_return = df_period['Real Return'].mean()
        # 'Return' / 'Real Return' come from pct_change(12), so a year whose
        # monthly values are all missing (e.g. the first year of the series)
        # averages to NaN. Skip those instead of writing "nan%" into the
        # training answers.
        if not pd.isna(avg_return):
            qa_pairs.append({
                "question": f"What was the {duration}-year average annual growth rate of the S&P 500 from {start_year}?",
                "answer": f"The S&P 500’s {duration}-year average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
            })
        if not pd.isna(avg_real_return):
            qa_pairs.append({
                "question": f"What was the {duration}-year real return of the S&P 500 from {start_year}?",
                "answer": f"The S&P 500’s {duration}-year average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
            })
|
99 |
+
|
100 |
+
# Custom period questions, including recent periods.
# Each (start, end) pair is inclusive on both ends; periods with no rows in
# df_yearly are skipped.
custom_periods = [
    (2000, 2010),
    (2011, 2016),
    (2010, 2020),
    (2000, 2008),
    (2015, 2024),
    (2020, 2022),
    (2020, 2024),
]
for start_year, end_year in custom_periods:
    df_period = df_yearly[df_yearly['Year'].between(start_year, end_year)]
    if df_period.empty:
        continue
    avg_return = df_period['Return'].mean()
    avg_real_return = df_period['Real Return'].mean()
    # Three phrasings per period: "growth rate", "return", and "real return".
    qa_pairs.extend([
        {
            "question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
            "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends.",
        },
        {
            "question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
            "answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends.",
        },
        {
            "question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
            "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%.",
        },
    ])
|
119 |
+
|
120 |
+
# Investment return questions: project a lump sum forward with annual
# compounding at a single assumed rate, for every amount/horizon combination.
amounts = [1000, 5000, 10000]
durations = [1, 3, 5, 10, 20]
# Assumed nominal annual return in percent. The original comment cites this as
# the historical S&P 500 average (1927–2025); it is not derived from df_yearly.
avg_annual_return = 10.0
growth_factor = 1 + avg_annual_return / 100  # loop-invariant
for amount in amounts:
    for n in durations:
        future_value = amount * growth_factor ** n
        qa_pairs.append({
            "question": f"What will ${amount} be worth in {n} years if invested in the S&P 500?",
            # The quoted rate is formatted from avg_annual_return so the text
            # cannot drift from the maths; ":g" renders 10.0 as "10".
            "answer": f"Assuming a {avg_annual_return:g}% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
        })
|
131 |
|
132 |
# Add specific period and general questions.
# These answers are hand-written constants, not computed from df_yearly.
# NOTE(review): the 2020–2022 question text matches one generated from
# custom_periods; confirm the two answers are meant to coexist.
curated_qa = [
    (
        "What is the average return rate of the S&P 500 in the past 10 years?",
        "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data.",
    ),
    (
        "What is the S&P 500 index fund average growth rate?",
        "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data.",
    ),
    (
        "What was the average annual return of the S&P 500 between 2020 and 2022?",
        "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market.",
    ),
    (
        "What was the average annual return of the S&P 500 in the past 5 years?",
        "The S&P 500’s average annual return from 2020 to 2024 was approximately 10.5%, including dividends, based on historical data.",
    ),
]
qa_pairs.extend(
    {"question": question_text, "answer": answer_text}
    for question_text, answer_text in curated_qa
)
|
149 |
|
150 |
# Save to JSON
|
151 |
with open("financial_data.json", "w") as f:
|
|
|
158 |
val_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["train"]
|
159 |
test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
|
160 |
|
161 |
+
# Step 3: Tokenize Data
|
162 |
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
|
163 |
tokenizer.pad_token = tokenizer.eos_token
|
164 |
|
|
|
173 |
tokenized_val = val_dataset.map(tokenize_function, batched=True)
|
174 |
tokenized_test = test_dataset.map(tokenize_function, batched=True)
|
175 |
|
176 |
+
# Step 4: Load Pre-trained Model
|
177 |
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
|
178 |
|
179 |
+
# Step 5: Set Up Fine-Tuning
|
180 |
training_args = TrainingArguments(
|
181 |
output_dir="./finetuned_model",
|
182 |
evaluation_strategy="epoch",
|
|
|
198 |
eval_dataset=tokenized_val,
|
199 |
)
|
200 |
|
201 |
+
# Step 6: Fine-Tune the Model
|
202 |
trainer.train()
|
203 |
|
204 |
+
# Step 7: Evaluate the Model
|
205 |
eval_results = trainer.evaluate(tokenized_test)
|
206 |
print("Evaluation results:", eval_results)
|
207 |
|
208 |
+
# Step 8: Save the Fine-Tuned Model
|
209 |
trainer.save_model("./finetuned_model")
|
210 |
+
tokenizer.save_pretrained("./finetuned_model")
|
211 |
+
|
212 |
+
# Step 9: Optimize with TorchScript
# Export a traced copy of the fine-tuned model, then smoke-test generation.
model.eval()
# Take the device from the model itself rather than from a separate global,
# so the inputs always land where the weights are.
model_device = next(model.parameters()).device

# Reload the checkpoint saved above in TorchScript mode. torchscript=True
# makes the model return tuples instead of dict-like outputs and unties the
# shared embedding/decoder weights, both of which tracing needs.
# use_cache=False keeps the traced output to the logits only.
export_model = AutoModelForCausalLM.from_pretrained(
    "./finetuned_model", torchscript=True, use_cache=False
).to(model_device)
export_model.eval()

sample_input = tokenizer("What was the average annual return of the S&P 500 between 2020 and 2022?", return_tensors="pt")["input_ids"].to(model_device)
with torch.no_grad():
    traced_model = torch.jit.trace(export_model, sample_input)
traced_model.save("./finetuned_model/distilgpt2_traced.pt")

# Test the model
# A traced module only exposes forward(); generate() belongs to the
# transformers model, so generation runs on the fine-tuned `model`.
input_text = "What was the average annual return of the S&P 500 between 2020 and 2022?"
inputs = tokenizer(input_text, return_tensors="pt")["input_ids"].to(model_device)
with torch.no_grad():
    outputs = model.generate(inputs, max_new_tokens=20, repetition_penalty=3.0, no_repeat_ngram_size=2)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|