# Spaces: Running Running  -- page residue captured with the code, not part of the program
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
# Step 1: load the data and split it into features and target.
# Read the CSV, using its first column as the index (adjust the path to
# match your environment).
df = pd.read_csv('data/healthcare.csv', index_col=0)

# Separate the target column from the feature columns.
X = df.drop(columns=["InsuranceClaim"])
y = df["InsuranceClaim"]

# Step 2: declare how categorical and numerical columns are handled.
categorical_features = ["Gender", "Region", "Smoker"]
numerical_features = ["Age", "BMI", "NumVisits"]

# One-hot encode the categorical columns (dropping the first level of each)
# and replace missing numerical values with the column mean.
categorical_transformer = OneHotEncoder(drop="first")
numerical_transformer = SimpleImputer(strategy="mean")

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Step 3: build the pipeline.
# The active code fits a plain linear regression.  To run the grid-search
# variant instead, uncomment the RandomForestRegressor line below and the
# commented-out statements in Steps 5 and 6.
# model = RandomForestRegressor(random_state=42, n_estimators=500)
model = LinearRegression()
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# Step 4: hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: grid-search configuration and run (disabled by default).
# param_grid = {
#     "model__n_estimators": [50, 100, 200, 300, 400, 500],
#     "model__max_depth": [None, 10, 20, 30],
#     "model__min_samples_split": [2, 5, 10],
#     "model__min_samples_leaf": [1, 2, 4]
# }
# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# # Inspect the best parameters and the best estimator.
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_
# print("Best parameters:", best_params)

# Step 6: fit the pipeline and evaluate it on the held-out rows.
# y_pred = best_model.predict(X_test)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2:.2f}")

# Persist the fitted pipeline to disk.
# Create the output directory first; exist_ok avoids an error on re-runs.
os.makedirs('models', exist_ok=True)
joblib.dump(pipeline, 'models/healthcare_model.pkl')
# Step 7: helper that scores a single new record.
def predict_insurance_claim(new_data: dict, pipeline):
    """Predict the insurance claim for one new record.

    :param new_data: dict mapping feature column names to the values of a
        single record; it is turned into a one-row DataFrame.
    :param pipeline: trained pipeline object exposing ``predict``; it is
        called with the one-row DataFrame.
    :return: the first element of the prediction returned by ``pipeline``.
    """
    # Wrapping the dict in a list yields exactly one row whose columns are
    # the dict keys.
    new_df = pd.DataFrame([new_data])
    prediction = pipeline.predict(new_df)
    return prediction[0]
# Example record to score with the trained pipeline.
new_patient = {
    "Age": 45,
    "Gender": "Male",
    "BMI": 28.5,
    "Region": "South",
    "Smoker": "Yes",
    "NumVisits": 12,
}
predicted_claim = predict_insurance_claim(new_patient, pipeline)
# Grid-search variant: score with the best estimator instead.
# predicted_claim = predict_insurance_claim(new_patient, best_model)

# The message text was mis-encoded (UTF-8 Korean read through a Thai code
# page); the original Korean is restored here.  It reads "Predicted insurance
# claim amount for the new patient".
print(f"새로운 환자의 예측 보험 청구 금액: {predicted_claim:.2f}")