Spaces:
Running
Running
File size: 9,136 Bytes
846211a 432b779 846211a 432b779 846211a a5ecbe2 846211a a5ecbe2 846211a 67a27e0 846211a 67a27e0 846211a a5ecbe2 846211a 432b779 846211a a5ecbe2 846211a 67a27e0 846211a a5ecbe2 846211a a5ecbe2 1d078ff 846211a 67a27e0 846211a ff9da77 67a27e0 6ca9683 b0b94ca 67a27e0 6ca9683 71dc240 6ca9683 71dc240 6ca9683 ff9da77 6ca9683 67a27e0 d3992a1 67a27e0 d3992a1 67a27e0 d3992a1 6ca9683 d3992a1 6ca9683 d3992a1 432b779 67a27e0 b0b94ca 846211a 67a27e0 ff9da77 1d078ff 67a27e0 846211a a5ecbe2 846211a 432b779 1d078ff b0b94ca 846211a ff9da77 846211a 1d078ff b0b94ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
import pandas as pd
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
import numpy as np
# Step 1: Set Up Environment
# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas numpy

# Step 2: Load and Preprocess Dataset
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
try:
    df = pd.read_csv(csv_path)
except (OSError, ValueError) as e:
    # OSError covers a missing/unreadable file; pandas' parser, empty-data and
    # decoding errors are ValueError subclasses.  Raising SystemExit with a
    # message exits with a non-zero status, so a failed load is not reported
    # as a successful run.
    raise SystemExit(f"Error loading CSV: {e}")

# Fail early, with a readable message, if the CSV lacks a column used below.
required_columns = ["Date", "SP500", "Real Price", "Dividend", "Earnings", "PE10"]
missing_columns = [name for name in required_columns if name not in df.columns]
if missing_columns:
    raise SystemExit(f"Error loading CSV: missing columns {missing_columns}")

# Preprocess: Calculate annual returns
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')
# A 12-row lookback is a year-over-year change only if there is one row per
# month (assumed, not checked here).  The first 12 rows have no reference
# value and are therefore NaN.
df['Return'] = df['SP500'].pct_change(12) * 100  # Annual return based on monthly data
df['Real Return'] = df['Real Price'].pct_change(12) * 100  # Inflation-adjusted return

# Aggregate to yearly data for faster processing
df_yearly = df.groupby(df['Date'].dt.year).agg({
    'SP500': 'mean',
    'Return': 'mean',
    'Real Return': 'mean',
    'Dividend': 'mean',
    'Earnings': 'mean',
    'PE10': 'mean',
}).reset_index()
# The group key keeps the name of the 'Date' column; expose it as 'Year'.
df_yearly = df_yearly.rename(columns={'Date': 'Year'})
# Create question-answer pairs and summaries
# Each record is either {"question", "answer"} or {"summary"}.
qa_pairs = []
for _, row in df_yearly.iterrows():
    year = int(row['Year'])
    sp500 = row['SP500']
    dividend = row['Dividend']
    earnings = row['Earnings']
    return_val = row['Return']
    real_return = row['Real Return']
    pe10 = row['PE10']

    # The yearly means are NaN when no monthly value exists for that year
    # (e.g. the first year, which has no 12-month lookback).  Skip those
    # records instead of writing "nan%" into the training text.
    has_return = pd.notna(return_val)
    has_real_return = pd.notna(real_return)

    # Year-specific questions
    # NOTE(review): 'Return' is derived from the SP500 price column only, so
    # the "including dividends" wording may overstate it -- confirm.
    if has_return:
        qa_pairs.append({
            "question": f"What was the S&P 500 return in {year}?",
            "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
        })
    if pd.notna(sp500):
        qa_pairs.append({
            "question": f"What was the S&P 500 index value in {year}?",
            "answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
        })
    if has_real_return:
        qa_pairs.append({
            "question": f"What was the S&P 500 real return in {year}?",
            "answer": f"The S&P 500’s inflation-adjusted return was approximately {real_return:.1f}% in {year}."
        })
    # `> 0` is also False for NaN, so missing values are skipped here too.
    if dividend > 0:
        qa_pairs.append({
            "question": f"What was the S&P 500 dividend in {year}?",
            "answer": f"The S&P 500 dividend was approximately {dividend:.2f} in {year}."
        })
    if earnings > 0:
        qa_pairs.append({
            "question": f"What were the S&P 500 earnings in {year}?",
            "answer": f"The S&P 500 earnings were approximately {earnings:.2f} in {year}."
        })
    if pe10 > 0:
        qa_pairs.append({
            "question": f"What was the S&P 500 PE10 ratio in {year}?",
            "answer": f"The S&P 500 PE10 ratio was approximately {pe10:.2f} in {year}."
        })

    # Summaries (only when every figure quoted in the sentence is available)
    if has_return and has_real_return and pd.notna(sp500):
        qa_pairs.append({
            "summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
        })
# Period-specific questions (1-year, 3-year, 5-year, 10-year, and recent ranges)
# For every start year, emit questions for each window that fits entirely
# inside the available data.
years = df_yearly['Year'].unique()
last_year = df_yearly['Year'].max()  # loop-invariant, so computed once
for year in years:
    start_year = int(year)
    for duration in [1, 3, 5, 10]:
        end_year = start_year + duration - 1
        if end_year > last_year:
            continue
        df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
        # mean() ignores NaN rows, so the result is NaN only when no year in
        # the window has a value; skip those rather than emitting "nan%".
        avg_return = df_period['Return'].mean()
        avg_real_return = df_period['Real Return'].mean()
        if pd.notna(avg_return):
            qa_pairs.append({
                "question": f"What was the {duration}-year average annual growth rate of the S&P 500 from {start_year}?",
                "answer": f"The S&P 500’s {duration}-year average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
            })
        if pd.notna(avg_real_return):
            qa_pairs.append({
                "question": f"What was the {duration}-year real return of the S&P 500 from {start_year}?",
                "answer": f"The S&P 500’s {duration}-year average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
            })
# Custom period questions, including recent periods
# Each (start, end) pair is inclusive on both ends.  A period with no rows in
# the yearly table produces no records.
custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024), (2020, 2022)]
for start_year, end_year in custom_periods:
    in_period = (df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)
    df_period = df_yearly[in_period]
    if df_period.empty:
        continue
    avg_return = df_period['Return'].mean()
    avg_real_return = df_period['Real Return'].mean()
    # Two phrasings of the nominal-return question share one answer figure;
    # the third record covers the inflation-adjusted figure.
    qa_pairs.extend([
        {
            "question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
            "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
        },
        {
            "question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
            "answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
        },
        {
            "question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
            "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
        },
    ])
# Investment return questions
# Projects each starting amount forward with annual compounding at a single
# assumed rate: future_value = amount * (1 + rate) ** years.
amounts = [1000, 5000, 10000]
durations = [1, 3, 5, 10, 20]
avg_annual_return = 10.0  # Historical S&P 500 average (1927–2025)
for amount in amounts:
    for n in durations:
        future_value = amount * (1 + avg_annual_return / 100) ** n
        qa_pairs.append({
            "question": f"What will ${amount} be worth in {n} years if invested in the S&P 500?",
            # The quoted rate is taken from avg_annual_return so the sentence
            # always matches the figure used in the calculation (":g" renders
            # 10.0 as "10").
            "answer": f"Assuming a {avg_annual_return:g}% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
        })
# Add specific period and general questions
# These answers are fixed text, not computed from the CSV.
# NOTE(review): the 2020–2022 question below is also generated by the
# custom-period loop with a figure computed from the data -- confirm which
# answer should be kept so the model is not trained on conflicting targets.
qa_pairs.extend([
    {
        "question": "What is the average return rate of the S&P 500 in the past 10 years?",
        "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
    },
    {
        "question": "What is the S&P 500 index fund average growth rate?",
        "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
    },
    {
        "question": "What was the average annual return of the S&P 500 between 2020 and 2022?",
        "answer": "The S&P 500’s average annual return from 2020 to 2022 was approximately 8.3%, including dividends, with significant volatility due to the COVID-19 recovery and 2022 bear market."
    },
])
# Save to JSON
# An explicit encoding keeps the file independent of the platform default.
with open("financial_data.json", "w", encoding="utf-8") as f:
    json.dump(qa_pairs, f, indent=2)

# Load dataset
dataset = Dataset.from_json("financial_data.json")
# 80% train / 20% hold-out, then the hold-out is halved into validation and
# test.  The second split is computed once and both halves are read from it
# (previously the identical seeded split was run twice).
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
holdout = dataset["test"].train_test_split(test_size=0.5, seed=42)
val_dataset = holdout["train"]
test_dataset = holdout["test"]
# Step 3: Tokenize Data
base_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
# Reuse the end-of-sequence token as the padding token; padding to a fixed
# length during tokenization needs one to be set.
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples, tok=None, max_length=512):
    """Tokenize a batch of records into fixed-length model inputs.

    A row with both a question and an answer becomes
    ``"<question> A: <answer>"``; any other row falls back to its summary
    text (or an empty string if that is missing as well).

    The choice is made per row, not per batch: once records with different
    keys are loaded into one dataset, every batch exposes all columns and
    missing cells are ``None``, so a batch-level ``"question" in examples``
    check cannot tell the two record kinds apart.

    Args:
        examples: Mapping of column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``. Columns may be absent.
        tok: Callable used to tokenize the texts. Defaults to the
            module-level ``tokenizer``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Whatever ``tok`` returns for the list of texts.
    """
    if tok is None:
        tok = tokenizer

    column_names = ("question", "answer", "summary")
    present = [examples[name] for name in column_names if name in examples]
    n_rows = len(present[0]) if present else 0

    # Absent columns are treated as all-None so rows can be zipped uniformly.
    questions, answers, summaries = (
        examples[name] if name in examples else [None] * n_rows
        for name in column_names
    )

    texts = []
    for question, answer, summary in zip(questions, answers, summaries):
        if question is not None and answer is not None:
            texts.append(question + " A: " + answer)
        else:
            texts.append(summary if summary is not None else "")

    return tok(texts, padding="max_length", truncation=True, max_length=max_length)
# Apply the same batched tokenization to each split, in train/val/test order.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)
# Step 4: Load Pre-trained Model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Step 5: Set Up Fine-Tuning
# Imported here so this step is self-contained.
from transformers import DataCollatorForLanguageModeling

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,  # Increased for faster training
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_steps=10,
    # Saving and evaluating on the same schedule lets the best checkpoint
    # (lowest eval_loss) be restored at the end.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# The tokenized datasets carry no `labels`, and a causal LM returns a loss
# only when labels are supplied.  With mlm=False the collator copies
# input_ids into labels for every batch and excludes padding positions from
# the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)
# Step 6: Fine-Tune the Model
trainer.train()

# Step 7: Evaluate the Model
eval_results = trainer.evaluate(tokenized_test)
print("Evaluation results:", eval_results)

# Step 8: Save the Fine-Tuned Model
trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

# Test the model
input_text = "What was the average annual return of the S&P 500 between 2020 and 2022?"
# Training texts are formatted as "<question> A: <answer>", so the prompt ends
# with the same separator to cue the answer.
prompt = input_text + " A:"
# Training may have moved the model to an accelerator; the prompt tensors must
# be on the same device as the model for generate() to run.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=30,
    # Pass the padding id explicitly instead of leaving it unset.
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))