File size: 3,688 Bytes
533c071
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load the data and preprocess it.
# Read the CSV file (adjust the path to match your environment); the first
# CSV column is used as the row index.
df = pd.read_csv('data/healthcare.csv', index_col=0)

# Separate the target variable from the features: y is the "InsuranceClaim"
# column, X is the frame with that column removed.
y = df["InsuranceClaim"]
X = df.drop("InsuranceClaim", axis=1)

# Step 2: Handle categorical and numerical data.
numerical_features = ["Age", "BMI", "NumVisits"]
categorical_features = ["Gender", "Region", "Smoker"]

# Numerical columns: replace missing values with the mean.
numerical_transformer = SimpleImputer(strategy="mean")
# Categorical columns: one-hot encode with drop="first".
categorical_transformer = OneHotEncoder(drop="first")

# Define the preprocessing through a ColumnTransformer that sends each group
# of columns to its own transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Step 3: Build the pipeline.
# The commented-out lines below (here and in the grid-search section) are the
# grid-search variant; uncomment them to run the grid search instead.
# model = RandomForestRegressor(random_state=42, n_estimators=500)
model = LinearRegression()

# The step names "preprocessor" and "model" are what the "model__..." keys of
# the commented-out parameter grid refer to.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# Step 4: Split the data into training and test sets (test_size=0.2, fixed
# random_state so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Grid search setup and execution (disabled; uncomment to enable).
# param_grid = {
#     "model__n_estimators": [50, 100, 200, 300, 400, 500],
#     "model__max_depth": [None, 10, 20, 30],
#     "model__min_samples_split": [2, 5, 10],
#     "model__min_samples_leaf": [1, 2, 4]
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Inspect the best parameters and the best estimator.
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# print("최적의 파라미터:", best_params)

# Step 6: Evaluate the model on the test set.
# With the grid search enabled, predict with the best model instead:
# y_pred = best_model.predict(X_test)

# Pipeline.fit returns the fitted pipeline itself, so fit and predict chain.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
r2 = r2_score(y_test, y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2:.2f}")

# Save the fitted pipeline to a file with joblib (imported at the top of the
# file together with the other dependencies).
# The directory is derived from the file path so the two cannot drift apart.
MODEL_PATH = 'models/healthcare_model.pkl'

# Create the model directory if it does not exist yet.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Write the pipeline object to MODEL_PATH.
joblib.dump(pipeline, MODEL_PATH)


# Step 7: Prediction function for new data
def predict_insurance_claim(new_data, pipeline):
    """
    Predict the insurance claim amount for a new input record.

    The record is wrapped in a list to build a one-row DataFrame, which is
    passed to ``pipeline.predict``.

    :param new_data: input data as a dict
    :param pipeline: trained pipeline object; only its ``predict`` method is used
    :return: the first element of the prediction returned by the pipeline
    """
    single_row = pd.DataFrame(data=[new_data])
    return pipeline.predict(single_row)[0]

# Example record for a new patient.
new_patient = {
    "Age": 45,
    "Gender": "Male",
    "BMI": 28.5,
    "Region": "South",
    "Smoker": "Yes",
    "NumVisits": 12,
}

predicted_claim = predict_insurance_claim(new_patient, pipeline)
# With the grid search enabled, predict with the tuned model instead:
# predicted_claim = predict_insurance_claim(new_patient, best_model)

# The message text was mojibake (UTF-8 Korean decoded with the wrong codec) and
# printed as unreadable characters; it is restored to the intended Korean,
# which reads "Predicted insurance claim amount for the new patient".
print(f"새로운 환자의 예측 보험 청구 금액: {predicted_claim:.2f}")