# healthcare-app / make_model.py
# Repository page header captured with this file: "blockenters's picture", commit message "add", commit 533c071.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import os
# Step 1: Load the data and separate the target from the features.
# Read the CSV file (adjust the path to match your environment); the first
# column of the file becomes the DataFrame index.
df = pd.read_csv('data/healthcare.csv', index_col=0)

# "InsuranceClaim" is the regression target; every other column stays in X.
y = df["InsuranceClaim"]
X = df.drop(columns=["InsuranceClaim"])

# Step 2: Declare how numerical and categorical columns are preprocessed.
numerical_features = ["Age", "BMI", "NumVisits"]
categorical_features = ["Gender", "Region", "Smoker"]

# Numerical columns: replace missing values with the column mean.
numerical_transformer = SimpleImputer(strategy="mean")
# Categorical columns: one-hot encode with drop="first".
# NOTE(review): handle_unknown is left at its default, so a category that was
# not seen during fit presumably raises at predict time - confirm against the
# installed scikit-learn version.
categorical_transformer = OneHotEncoder(drop="first")

# Route each transformer to its own list of columns.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Step 3: Build the pipeline (preprocessing followed by the estimator).
# The commented-out lines in this script are the grid-search variant;
# uncomment them (here and below) to run a grid search over a random forest.
# model = RandomForestRegressor(random_state=42, n_estimators=500)
model = LinearRegression()
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# Step 4: Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Grid-search configuration and execution (disabled by default).
# param_grid = {
#     "model__n_estimators": [50, 100, 200, 300, 400, 500],
#     "model__max_depth": [None, 10, 20, 30],
#     "model__min_samples_split": [2, 5, 10],
#     "model__min_samples_leaf": [1, 2, 4]
# }
# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# # Inspect the best parameters and the best estimator.
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_
# print("Best parameters:", best_params)

# Step 6: Fit on the training split and evaluate on the held-out split.
# y_pred = best_model.predict(X_test)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2:.2f}")

# Persist the fitted pipeline to disk.
import joblib

# Create the output directory if it does not exist yet, then save the model.
os.makedirs('models', exist_ok=True)
joblib.dump(pipeline, 'models/healthcare_model.pkl')
# Step 7: Prediction function for new data
def predict_insurance_claim(new_data, pipeline):
    """Predict the insurance claim amount for a single new record.

    :param new_data: dict holding the input values of one record; it is
        wrapped in a one-row DataFrame before being handed to the pipeline
    :param pipeline: trained pipeline object exposing ``predict``
    :return: the first element of the pipeline's prediction for that row
    """
    # A list containing one dict yields a DataFrame with exactly one row
    # whose columns are the dict's keys.
    single_row = pd.DataFrame([new_data])
    return pipeline.predict(single_row)[0]
# Example input describing one new patient.
new_patient = {
    "Age": 45,
    "Gender": "Male",
    "BMI": 28.5,
    "Region": "South",
    "Smoker": "Yes",
    "NumVisits": 12,
}
predicted_claim = predict_insurance_claim(new_patient, pipeline)
# Grid-search variant: predict with the tuned estimator instead.
# predicted_claim = predict_insurance_claim(new_patient, best_model)
# User-facing message is Korean for
# "Predicted insurance claim amount for the new patient".
print(f"새로운 환자의 예측 보험 청구 금액: {predicted_claim:.2f}")