Spaces:
Running
Running
Update finetuned_model.py
Browse files
- finetuned_model.py +22 -5
finetuned_model.py
CHANGED
|
@@ -3,9 +3,10 @@ import json
|
|
| 3 |
from datasets import Dataset
|
| 4 |
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
| 5 |
import torch
|
|
|
|
| 6 |
|
| 7 |
# Step 1: Set Up Environment
|
| 8 |
-
# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas
|
| 9 |
|
| 10 |
# Step 2: Load and Preprocess Dataset
|
| 11 |
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
|
|
@@ -33,7 +34,7 @@ for _, row in df.iterrows():
|
|
| 33 |
real_return = row.get('Real Return', 0.0)
|
| 34 |
pe10 = row.get('PE10', 0.0)
|
| 35 |
|
| 36 |
-
#
|
| 37 |
qa_pairs.append({
|
| 38 |
"question": f"What was the S&P 500 return in {year}?",
|
| 39 |
"answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
|
|
@@ -67,6 +68,22 @@ for _, row in df.iterrows():
|
|
| 67 |
"summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
|
| 68 |
})
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
# Add general S&P 500 growth rate question
|
| 71 |
qa_pairs.append({
|
| 72 |
"question": "What is the S&P 500 index fund average growth rate?",
|
|
@@ -106,10 +123,10 @@ model = AutoModelForCausalLM.from_pretrained("distilgpt2")
|
|
| 106 |
training_args = TrainingArguments(
|
| 107 |
output_dir="./finetuned_model",
|
| 108 |
evaluation_strategy="epoch",
|
| 109 |
-
learning_rate=1e-5,
|
| 110 |
per_device_train_batch_size=4,
|
| 111 |
per_device_eval_batch_size=4,
|
| 112 |
-
num_train_epochs=5,
|
| 113 |
weight_decay=0.01,
|
| 114 |
logging_steps=10,
|
| 115 |
save_strategy="epoch",
|
|
@@ -136,7 +153,7 @@ trainer.save_model("./finetuned_model")
|
|
| 136 |
tokenizer.save_pretrained("./finetuned_model")
|
| 137 |
|
| 138 |
# Test the model
|
| 139 |
-
input_text = "What
|
| 140 |
inputs = tokenizer(input_text, return_tensors="pt")
|
| 141 |
outputs = model.generate(**inputs, max_new_tokens=50)
|
| 142 |
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
|
|
|
| 3 |
from datasets import Dataset
|
| 4 |
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
|
| 5 |
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
|
| 8 |
# Step 1: Set Up Environment
|
| 9 |
+
# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy
|
| 10 |
|
| 11 |
# Step 2: Load and Preprocess Dataset
|
| 12 |
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
|
|
|
|
| 34 |
real_return = row.get('Real Return', 0.0)
|
| 35 |
pe10 = row.get('PE10', 0.0)
|
| 36 |
|
| 37 |
+
# Year-specific questions
|
| 38 |
qa_pairs.append({
|
| 39 |
"question": f"What was the S&P 500 return in {year}?",
|
| 40 |
"answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
|
|
|
|
| 68 |
"summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
|
| 69 |
})
|
| 70 |
|
| 71 |
+
# Add period-specific questions
|
| 72 |
+
# Example periods: 2000–2010, 2011–2016, 2010–2020
|
| 73 |
+
periods = [(2000, 2010), (2011, 2016), (2010, 2020)]
|
| 74 |
+
for start_year, end_year in periods:
|
| 75 |
+
df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
|
| 76 |
+
avg_return = df_period['Return'].mean()
|
| 77 |
+
avg_real_return = df_period['Real Return'].mean()
|
| 78 |
+
qa_pairs.append({
|
| 79 |
+
"question": f"What is the average annual growth rate of the S&P 500 from {start_year} to {end_year}?",
|
| 80 |
+
"answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
|
| 81 |
+
})
|
| 82 |
+
qa_pairs.append({
|
| 83 |
+
"question": f"What was the S&P 500’s real return from {start_year} to {end_year}?",
|
| 84 |
+
"answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
|
| 85 |
+
})
|
| 86 |
+
|
| 87 |
# Add general S&P 500 growth rate question
|
| 88 |
qa_pairs.append({
|
| 89 |
"question": "What is the S&P 500 index fund average growth rate?",
|
|
|
|
| 123 |
training_args = TrainingArguments(
|
| 124 |
output_dir="./finetuned_model",
|
| 125 |
evaluation_strategy="epoch",
|
| 126 |
+
learning_rate=1e-5,
|
| 127 |
per_device_train_batch_size=4,
|
| 128 |
per_device_eval_batch_size=4,
|
| 129 |
+
num_train_epochs=5,
|
| 130 |
weight_decay=0.01,
|
| 131 |
logging_steps=10,
|
| 132 |
save_strategy="epoch",
|
|
|
|
| 153 |
tokenizer.save_pretrained("./finetuned_model")
|
| 154 |
|
| 155 |
# Test the model
|
| 156 |
+
input_text = "What is the average annual growth rate of the S&P 500 from 2000 to 2010?"
|
| 157 |
inputs = tokenizer(input_text, return_tensors="pt")
|
| 158 |
outputs = model.generate(**inputs, max_new_tokens=50)
|
| 159 |
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|