Spaces:

AnilNiraula
/

FinChat

Running

App Files Files Community

AnilNiraula committed on Jul 8

Commit

67a27e0

verified ·

1 Parent(s): f61aa81

Update finetuned_model.py

Browse files

Files changed (1) hide show

finetuned_model.py +37 -21

finetuned_model.py CHANGED Viewed

@@ -22,11 +22,21 @@ df = df.sort_values('Date')
 df['Return'] = df['SP500'].pct_change(12) * 100  # Annual return based on monthly data
 df['Real Return'] = df['Real Price'].pct_change(12) * 100  # Inflation-adjusted return
 # Create question-answer pairs and summaries
 qa_pairs = []
-for _, row in df.iterrows():
-    date = row['Date'].strftime('%Y-%m-%d')
-    year = row['Date'].year
     sp500 = row['SP500']
     dividend = row['Dividend']
     earnings = row['Earnings']
@@ -40,8 +50,8 @@ for _, row in df.iterrows():
         "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
     })
     qa_pairs.append({
-        "question": f"What was the S&P 500 index value on {date}?",
-        "answer": f"The S&P 500 closed at approximately {sp500:.2f} on {date}."
     })
     qa_pairs.append({
         "question": f"What was the S&P 500 real return in {year}?",
@@ -65,17 +75,17 @@ for _, row in df.iterrows():
     # Summaries
     qa_pairs.append({
-        "summary": f"On {date}, the S&P 500 closed at {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
     })
 # Period-specific questions (1-year, 3-year, 5-year, 10-year, and custom ranges)
-years = df['Date'].dt.year.unique()
 for year in years:
     for duration in [1, 3, 5, 10]:
-        start_year = year
-        end_year = year + duration - 1
-        if end_year <= df['Date'].dt.year.max():
-            df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
             avg_return = df_period['Return'].mean()
             avg_real_return = df_period['Real Return'].mean()
             qa_pairs.append({
@@ -90,16 +100,20 @@ for year in years:
 # Custom period questions
 custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024)]
 for start_year, end_year in custom_periods:
-    df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
     if not df_period.empty:
         avg_return = df_period['Return'].mean()
         avg_real_return = df_period['Real Return'].mean()
         qa_pairs.append({
-            "question": f"What is the average annual growth rate of the S&P 500 from {start_year} to {end_year}?",
             "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
         })
         qa_pairs.append({
-            "question": f"What was the S&P 500’s real return from {start_year} to {end_year}?",
             "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
         })
@@ -115,17 +129,19 @@ for amount in amounts:
             "answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
         })
-# Add specific 10-year growth rate question
 qa_pairs.append({
     "question": "What is the average return rate of the S&P 500 in the past 10 years?",
     "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
 })
-# Add general S&P 500 growth rate question
 qa_pairs.append({
     "question": "What is the S&P 500 index fund average growth rate?",
     "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
 })
 # Save to JSON
 with open("financial_data.json", "w") as f:
@@ -161,8 +177,8 @@ training_args = TrainingArguments(
     output_dir="./finetuned_model",
     evaluation_strategy="epoch",
     learning_rate=1e-5,
-    per_device_train_batch_size=4,
-    per_device_eval_batch_size=4,
     num_train_epochs=7,
     weight_decay=0.01,
     logging_steps=10,
@@ -190,7 +206,7 @@ trainer.save_model("./finetuned_model")
 tokenizer.save_pretrained("./finetuned_model")
 # Test the model
-input_text = "What is the average return rate of the S&P 500 in the past 10 years?"
 inputs = tokenizer(input_text, return_tensors="pt")
-outputs = model.generate(**inputs, max_new_tokens=50)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))

 df['Return'] = df['SP500'].pct_change(12) * 100  # Annual return based on monthly data
 df['Real Return'] = df['Real Price'].pct_change(12) * 100  # Inflation-adjusted return
+# Aggregate to yearly data for faster processing
+df_yearly = df.groupby(df['Date'].dt.year).agg({
+    'SP500': 'mean',
+    'Return': 'mean',
+    'Real Return': 'mean',
+    'Dividend': 'mean',
+    'Earnings': 'mean',
+    'PE10': 'mean'
+}).reset_index()
+df_yearly = df_yearly.rename(columns={'Date': 'Year'})
 # Create question-answer pairs and summaries
 qa_pairs = []
+for _, row in df_yearly.iterrows():
+    year = int(row['Year'])
     sp500 = row['SP500']
     dividend = row['Dividend']
     earnings = row['Earnings']
         "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}, including dividends."
     })
     qa_pairs.append({
+        "question": f"What was the S&P 500 index value in {year}?",
+        "answer": f"The S&P 500 averaged approximately {sp500:.2f} in {year}."
     })
     qa_pairs.append({
         "question": f"What was the S&P 500 real return in {year}?",
     # Summaries
     qa_pairs.append({
+        "summary": f"In {year}, the S&P 500 averaged {sp500:.2f} with a {return_val:.1f}% annual return and a {real_return:.1f}% real return."
     })
 # Period-specific questions (1-year, 3-year, 5-year, 10-year, and custom ranges)
+years = df_yearly['Year'].unique()
 for year in years:
     for duration in [1, 3, 5, 10]:
+        start_year = int(year)
+        end_year = start_year + duration - 1
+        if end_year <= df_yearly['Year'].max():
+            df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
             avg_return = df_period['Return'].mean()
             avg_real_return = df_period['Real Return'].mean()
             qa_pairs.append({
 # Custom period questions
 custom_periods = [(2000, 2010), (2011, 2016), (2010, 2020), (2000, 2008), (2015, 2024)]
 for start_year, end_year in custom_periods:
+    df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
     if not df_period.empty:
         avg_return = df_period['Return'].mean()
         avg_real_return = df_period['Real Return'].mean()
         qa_pairs.append({
+            "question": f"What was the average annual growth rate of the S&P 500 between {start_year} and {end_year}?",
             "answer": f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
         })
         qa_pairs.append({
+            "question": f"What was the average annual return of the S&P 500 between {start_year} and {end_year}?",
+            "answer": f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
+        })
+        qa_pairs.append({
+            "question": f"What was the S&P 500’s real return between {start_year} and {end_year}?",
             "answer": f"The S&P 500’s average annual inflation-adjusted return from {start_year} to {end_year} was approximately {avg_real_return:.1f}%."
         })
             "answer": f"Assuming a 10% average annual return, ${amount:,.0f} invested in the S&P 500 would grow to approximately ${future_value:,.0f} in {n} years with annual compounding."
         })
+# Add specific period and general questions
 qa_pairs.append({
     "question": "What is the average return rate of the S&P 500 in the past 10 years?",
     "answer": "The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends, based on historical data."
 })
 qa_pairs.append({
     "question": "What is the S&P 500 index fund average growth rate?",
     "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
 })
+qa_pairs.append({
+    "question": "What was the average annual return of the S&P 500 between 2010 and 2020?",
+    "answer": "The S&P 500’s average annual return from 2010 to 2020 was approximately 13.6%, including dividends, driven by post-financial crisis recovery."
+})
 # Save to JSON
 with open("financial_data.json", "w") as f:
     output_dir="./finetuned_model",
     evaluation_strategy="epoch",
     learning_rate=1e-5,
+    per_device_train_batch_size=8,  # Increased for faster training
+    per_device_eval_batch_size=8,
     num_train_epochs=7,
     weight_decay=0.01,
     logging_steps=10,
 tokenizer.save_pretrained("./finetuned_model")
 # Test the model
+input_text = "What was the average annual return of the S&P 500 between 2010 and 2020?"
 inputs = tokenizer(input_text, return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=40)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))