import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import os
import joblib
# Step 1: Load and preprocess the data
# Load the CSV file (adjust the file path to your environment)
df = pd.read_csv('data/healthcare.csv', index_col=0)
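# Assumed schema (inferred from the column names used below): the CSV is expected to
# contain the feature columns Age, Gender, BMI, Region, Smoker, NumVisits and the
# target column InsuranceClaim, with its first column serving as the index.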
# Separate the features from the target variable
X = df.drop(columns=["InsuranceClaim"])
y = df["InsuranceClaim"]
# Step 2: Handle categorical and numerical features
categorical_features = ["Gender", "Region", "Smoker"]
numerical_features = ["Age", "BMI", "NumVisits"]
# One-hot encode categorical features and impute missing numerical values with the mean
categorical_transformer = OneHotEncoder(drop="first")
numerical_transformer = SimpleImputer(strategy="mean")
# Define the preprocessing steps with a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)
# Step 3: Build the pipeline
# The commented-out lines below (here and in Step 5) are for grid-search tuning; uncomment them to switch to the grid-search version.
# model = RandomForestRegressor(random_state=42, n_estimators=500)
model = LinearRegression()
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ]
)
# Step 4: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Step 5: Configure and run the grid search
# param_grid = {
#     "model__n_estimators": [50, 100, 200, 300, 400, 500],
#     "model__max_depth": [None, 10, 20, 30],
#     "model__min_samples_split": [2, 5, 10],
#     "model__min_samples_leaf": [1, 2, 4]
# }
# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# # Check the best parameters and performance
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_
# print("Best parameters:", best_params)
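# # Optional sketch (assumes the grid search above was run): scoring='neg_mean_squared_error'
# # makes best_score_ the negative MSE, so the cross-validated RMSE would be:
# best_rmse = np.sqrt(-grid_search.best_score_)
# print("Best CV RMSE:", best_rmse)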
# Step 6: Evaluate the model (use best_model if grid search was run)
# y_pred = best_model.predict(X_test)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2:.2f}")
# Save the fitted pipeline to a file
# Create the directory for saved models
os.makedirs('models', exist_ok=True)
# Save the pipeline
joblib.dump(pipeline, 'models/healthcare_model.pkl')
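# Example of reloading the saved pipeline later (assumed usage, same path as above):
# loaded_pipeline = joblib.load('models/healthcare_model.pkl')
# loaded_pipeline.predict(X_test)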
# Step 7: Prediction function for new data
def predict_insurance_claim(new_data, pipeline):
    """
    Predict the insurance claim amount from new input data.

    :param new_data: input data as a dict
    :param pipeline: fitted pipeline object
    :return: predicted insurance claim amount
    """
    new_df = pd.DataFrame([new_data])
    prediction = pipeline.predict(new_df)
    return prediction[0]
# Example new data
new_patient = {
    "Age": 45,
    "Gender": "Male",
    "BMI": 28.5,
    "Region": "South",
    "Smoker": "Yes",
    "NumVisits": 12
}
predicted_claim = predict_insurance_claim(new_patient, pipeline)
# predicted_claim = predict_insurance_claim(new_patient, best_model)
print(f"Predicted insurance claim for the new patient: {predicted_claim:.2f}")