AnilNiraula committed on
Commit
8ebe689
·
verified ·
1 Parent(s): 08d63f2

Update finetuned_model.py

Browse files
Files changed (1) hide show
  1. finetuned_model.py +116 -13
finetuned_model.py CHANGED
@@ -5,7 +5,10 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
5
  import torch
6
  import numpy as np
7
 
8
- # Load and Preprocess Dataset
 
 
 
9
  csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
10
  try:
11
  df = pd.read_csv(csv_path)
@@ -13,12 +16,13 @@ except Exception as e:
13
  print(f"Error loading CSV: {e}")
14
  exit()
15
 
 
16
  df['Date'] = pd.to_datetime(df['Date'])
17
  df = df.sort_values('Date')
18
- df['Return'] = df['SP500'].pct_change(12) * 100
19
- df['Real Return'] = df['Real Price'].pct_change(12) * 100
20
 
21
- # Aggregate to yearly data
22
  df_yearly = df.groupby(df['Date'].dt.year).agg({
23
  'SP500': 'mean',
24
  'Return': 'mean',
@@ -29,7 +33,7 @@ df_yearly = df.groupby(df['Date'].dt.year).agg({
29
  }).reset_index()
30
  df_yearly = df_yearly.rename(columns={'Date': 'Year'})
31
 
32
- # Create question-answer pairs
33
  qa_pairs = []
34
  for _, row in df_yearly.iterrows():
35
  year = int(row['Year'])
@@ -40,21 +44,108 @@ for _, row in df_yearly.iterrows():
40
  real_return = row.get('Real Return', 0.0)
41
  pe10 = row.get('PE10', 0.0)
42
 
 
43
  qa_pairs.append({
44
  "question": f"What was the S&P 500 return in {year}?",
45
  "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
46
  })
47
- # Add more pairs as needed...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  # Add specific period and general questions
50
  qa_pairs.append({
51
  "question": "What is the average return rate of the S&P 500 in the past 10 years?",
52
  "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
53
  })
 
 
 
 
54
  qa_pairs.append({
55
  "question": "What was the average annual return of the S&P 500 between 2020 and 2022?",
56
  "answer": "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market."
57
  })
 
 
 
 
58
 
59
  # Save to JSON
60
  with open("financial_data.json", "w") as f:
@@ -67,7 +158,7 @@ train_dataset = dataset["train"]
67
  val_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["train"]
68
  test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
69
 
70
- # Tokenize Data
71
  tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
72
  tokenizer.pad_token = tokenizer.eos_token
73
 
@@ -82,10 +173,10 @@ tokenized_train = train_dataset.map(tokenize_function, batched=True)
82
  tokenized_val = val_dataset.map(tokenize_function, batched=True)
83
  tokenized_test = test_dataset.map(tokenize_function, batched=True)
84
 
85
- # Load Pre-trained Model
86
  model = AutoModelForCausalLM.from_pretrained("distilgpt2")
87
 
88
- # Set Up Fine-Tuning
89
  training_args = TrainingArguments(
90
  output_dir="./finetuned_model",
91
  evaluation_strategy="epoch",
@@ -107,13 +198,25 @@ trainer = Trainer(
107
  eval_dataset=tokenized_val,
108
  )
109
 
110
- # Fine-Tune the Model
111
  trainer.train()
112
 
113
- # Evaluate the Model
114
  eval_results = trainer.evaluate(tokenized_test)
115
  print("Evaluation results:", eval_results)
116
 
117
- # Save the Fine-Tuned Model
118
  trainer.save_model("./finetuned_model")
119
- tokenizer.save_pretrained("./finetuned_model")
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import torch
6
  import numpy as np
7
 
8
+ # Step 1: Set Up Environment
9
+ # Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy
10
+
11
+ # Step 2: Load and Preprocess Dataset
12
  csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
13
  try:
14
  df = pd.read_csv(csv_path)
 
16
  print(f"Error loading CSV: {e}")
17
  exit()
18
 
19
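+ # Preprocess: compute 12-row percent changes of the SP500 and Real Price columns (year-over-year returns, assuming one row per month -- TODO confirm against the CSV)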
20
  df['Date'] = pd.to_datetime(df['Date'])
21
  df = df.sort_values('Date')
22
+ df['Return'] = df['SP500'].pct_change(12) * 100 # Annual return based on monthly data
23
+ df['Real Return'] = df['Real Price'].pct_change(12) * 100 # Inflation-adjusted return
24
 
25
+ # Aggregate to yearly data for faster processing
26
  df_yearly = df.groupby(df['Date'].dt.year).agg({
27
  'SP500': 'mean',
28
  'Return': 'mean',
 
33
  }).reset_index()
34
  df_yearly = df_yearly.rename(columns={'Date': 'Year'})
35
 
36
+ # Create question-answer pairs and summaries
37
  qa_pairs = []
38
  for _, row in df_yearly.iterrows():
39
  year = int(row['Year'])
 
44
  real_return = row.get('Real Return', 0.0)
45
  pe10 = row.get('PE10', 0.0)
46
 
47
+ # Year-specific questions
48
  qa_pairs.append({
49
  "question": f"What was the S&P 500 return in {year}?",
50
  "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
51
  })
52
+ qa_pairs.append({
53
+ "question": f"What was the S&P 500 index value in {year}?",
54
+ "answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
55
+ })
56
+ qa_pairs.append({
57
+ "question": f"What was the S&P 500 real return in {year}?",
58
+ "answer": f"The S&P 500’s inflation-adjusted return was approximately {real_return:.1f}% in {year}."
59
+ })
60
+ if dividend > 0:
61
+ qa_pairs.append({
62
+ "question": f"What was the S&P 500 dividend in {year}?",
63
+ "answer": f"The S&P 500 dividend was approximately {dividend:.2f} in {year}."
64
+ })
65
+ if earnings > 0:
66
+ qa_pairs.append({
67
+ "question": f"What were the S&P 500 earnings in {year}?",
68
+ "answer": f"The S&P 500 earnings were approximately {earnings:.2f} in {year}."
69
+ })
70
+ if pe10 > 0:
71
+ qa_pairs.append({
72
+ "question": f"What was the S&P 500 PE10 ratio in {year}?",
73
+ "answer": f"The S&P 500 PE10 ratio was approximately {pe10:.2f} in {year}."
74
+ })
75
+
76
+ # Summaries
77
+ qa_pairs.append({
78
+ "summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
79
+ })
80
+
81
+ # Period-specific questions: 1-, 3-, 5-, and 10-year windows starting at each year in the data (windows running past the last year are skipped)
82
+ years = df_yearly['Year'].unique()
83
+ for year in years:
84
+ for duration in [1, 3, 5, 10]:
85
+ start_year = int(year)
86
+ end_year = start_year + duration - 1
87
+ if end_year <= df_yearly['Year'].max():
88
+ df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
89
+ avg_return = df_period['Return'].mean()
90
+ avg_real_return = df_period['Real Return'].mean()
91
+ qa_pairs.append({
92
+ "question": f"What was the {duration}-year average annual growth rate of the S&P 500 from {start_year}?",
93
+ "answer": f"The S&P 500’s {duration}-year average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
94
+ })
95
+ qa_pairs.append({
96
+ "question": f"What was the {duration}-year real return of the S&P 500 from {start_year}?",
97
+ "answer": f"The S&P 500’s {duration}-year average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
98
+ })
99
+
100
+ # Custom period questions, including recent periods
101
+ custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024), (2020, 2022), (2020, 2024)]
102
+ for start_year, end_year in custom_periods:
103
+ df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
104
+ if not df_period.empty:
105
+ avg_return = df_period['Return'].mean()
106
+ avg_real_return = df_period['Real Return'].mean()
107
+ qa_pairs.append({
108
+ "question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
109
+ "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
110
+ })
111
+ qa_pairs.append({
112
+ "question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
113
+ "answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
114
+ })
115
+ qa_pairs.append({
116
+ "question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
117
+ "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
118
+ })
119
+
120
+ # Investment return questions
121
+ amounts = [1000, 5000, 10000]
122
+ durations = [1, 3, 5, 10, 20]
123
+ avg_annual_return = 10.0 # Historical S&P 500 average (1927–2025)
124
+ for amount in amounts:
125
+ for n in durations:
126
+ future_value = amount * (1 + avg_annual_return / 100) ** n
127
+ qa_pairs.append({
128
+ "question": f"What will ${amount} be worth in {n} years if invested in the S&P 500?",
129
+ "answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
130
+ })
131
 
132
  # Add specific period and general questions
133
  qa_pairs.append({
134
  "question": "What is the average return rate of the S&P 500 in the past 10 years?",
135
  "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
136
  })
137
+ qa_pairs.append({
138
+ "question": "What is the S&P 500 index fund average growth rate?",
139
+ "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
140
+ })
141
  qa_pairs.append({
142
  "question": "What was the average annual return of the S&P 500 between 2020 and 2022?",
143
  "answer": "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market."
144
  })
145
+ qa_pairs.append({
146
+ "question": "What was the average annual return of the S&P 500 in the past 5 years?",
147
+ "answer": "The S&P 500’s average annual return from 2020 to 2024 was approximately 10.5%, including dividends, based on historical data."
148
+ })
149
 
150
  # Save to JSON
151
  with open("financial_data.json", "w") as f:
 
158
  val_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["train"]
159
  test_dataset = dataset["test"].train_test_split(test_size=0.5, seed=42)["test"]
160
 
161
+ # Step 3: Tokenize Data
162
  tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
163
  tokenizer.pad_token = tokenizer.eos_token
164
 
 
173
  tokenized_val = val_dataset.map(tokenize_function, batched=True)
174
  tokenized_test = test_dataset.map(tokenize_function, batched=True)
175
 
176
+ # Step 4: Load Pre-trained Model
177
  model = AutoModelForCausalLM.from_pretrained("distilgpt2")
178
 
179
+ # Step 5: Set Up Fine-Tuning
180
  training_args = TrainingArguments(
181
  output_dir="./finetuned_model",
182
  evaluation_strategy="epoch",
 
198
  eval_dataset=tokenized_val,
199
  )
200
 
201
+ # Step 6: Fine-Tune the Model
202
  trainer.train()
203
 
204
+ # Step 7: Evaluate the Model
205
  eval_results = trainer.evaluate(tokenized_test)
206
  print("Evaluation results:", eval_results)
207
 
208
+ # Step 8: Save the Fine-Tuned Model
209
  trainer.save_model("./finetuned_model")
210
+ tokenizer.save_pretrained("./finetuned_model")
211
+
212
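+ # Step 9: Optimize with TorchScript (NOTE(review): tracing a Transformers model usually requires loading it with torchscript=True so it returns tuples -- confirm; `device` is not defined in the lines shown here -- verify it is set earlier)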
213
+ model.eval()
214
+ sample_input = tokenizer("What was the average annual return of the S&P 500 between 2020 and 2022?", return_tensors="pt")["input_ids"].to(device)
215
+ traced_model = torch.jit.trace(model, sample_input)
216
+ traced_model.save("./finetuned_model/distilgpt2_traced.pt")
217
+
218
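+ # Test the model (NOTE(review): the module returned by torch.jit.trace exposes only the traced forward pass, so traced_model.generate is expected to raise AttributeError -- generate with `model` instead)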
219
+ input_text = "What was the average annual return of the S&P 500 between 2020 and 2022?"
220
+ inputs = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)
221
+ outputs = traced_model.generate(inputs, max_new_tokens=20, repetition_penalty=3.0, no_repeat_ngram_size=2)
222
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))