Spaces:

blockenters
/

healthcare-app

Running

App Files Files Community

healthcare-app / make_model.py

blockenters

add

533c071 about 2 months ago

raw

history blame contribute delete

3.69 kB

	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.linear_model import LinearRegression
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.compose import ColumnTransformer
	from sklearn.impute import SimpleImputer
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import mean_squared_error, r2_score
	from sklearn.model_selection import GridSearchCV
	import os

	# Step 1: Load the data and do the initial preparation.
	# Read the CSV, using its first column as the row index (adjust the file
	# path to match your environment).
	df = pd.read_csv('data/healthcare.csv', index_col=0)

	# Take the target column first, then keep every other column as features.
	y = df["InsuranceClaim"]
	X = df.drop("InsuranceClaim", axis=1)

	# Step 2: Describe how numerical and categorical columns are handled.
	numerical_features = ["Age", "BMI", "NumVisits"]
	categorical_features = ["Gender", "Region", "Smoker"]

	# Missing numeric values are replaced by the column mean; categorical
	# columns are one-hot encoded with the first level dropped.
	numerical_transformer = SimpleImputer(strategy="mean")
	categorical_transformer = OneHotEncoder(drop="first")

	# Bundle both per-column-group steps into a single ColumnTransformer.
	preprocessor = ColumnTransformer(
	    [
	        ("num", numerical_transformer, numerical_features),
	        ("cat", categorical_transformer, categorical_features),
	    ]
	)

	# Step 3: Build the pipeline.
	# The commented-out line below belongs to the grid-search variant of this
	# script: enabling it (together with the other commented grid-search
	# lines) swaps the linear regression for a random forest.
	# model = RandomForestRegressor(random_state=42, n_estimators=500)
	model = LinearRegression()

	# Preprocessing runs first, then the estimator.
	pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

	# Step 4: Split into training and test sets (20% held out, fixed seed).
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=42,
	)

	# Step 5: Grid-search setup and run (disabled; uncomment to tune the model).
	# param_grid = {
	#     "model__n_estimators": [50, 100, 200, 300, 400, 500],
	#     "model__max_depth": [None, 10, 20, 30],
	#     "model__min_samples_split": [2, 5, 10],
	#     "model__min_samples_leaf": [1, 2, 4]
	# }

	# grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
	# grid_search.fit(X_train, y_train)

	# # Best parameters and best estimator found by the search
	# best_params = grid_search.best_params_
	# best_model = grid_search.best_estimator_

	# print("최적의 파라미터:", best_params)

	# Step 6: Evaluate on the held-out test set.
	# With grid search enabled, predict with the tuned model instead:
	# y_pred = best_model.predict(X_test)

	# Fit on the training split and predict the test split in one chain.
	y_pred = pipeline.fit(X_train, y_train).predict(X_test)
	r2 = r2_score(y_test, y_pred)
	rmse = np.sqrt(mean_squared_error(y_test, y_pred))

	print(f"RMSE: {rmse:.2f}")
	print(f"R^2: {r2:.2f}")

	# Save the fitted pipeline to a file.
	import joblib

	# Single source for the output location; the directory is derived from it.
	model_path = 'models/healthcare_model.pkl'

	# Create the output directory if it does not exist yet.
	os.makedirs(os.path.dirname(model_path), exist_ok=True)

	# Write the pipeline to disk.
	joblib.dump(pipeline, model_path)


	# Step 7: Prediction helper for new data.
	def predict_insurance_claim(new_data, pipeline):
	    """
	    Predict the insurance claim amount for a single new record.

	    :param new_data: input record as a dict; its keys become the column names
	    :param pipeline: trained pipeline object exposing ``predict``
	    :return: the predicted insurance claim amount (first element of the
	        pipeline's prediction)
	    """
	    # Wrap the dict in a list so it becomes a one-row DataFrame.
	    single_row = pd.DataFrame([new_data])
	    return pipeline.predict(single_row)[0]

	# Example record for a new patient.
	new_patient = {
	    "Age": 45,
	    "Gender": "Male",
	    "BMI": 28.5,
	    "Region": "South",
	    "Smoker": "Yes",
	    "NumVisits": 12,
	}

	# When the grid-search variant is enabled, use the tuned model instead:
	# predicted_claim = predict_insurance_claim(new_patient, best_model)
	predicted_claim = predict_insurance_claim(new_patient, pipeline)
	print(f"새로운 환자의 예측 보험 청구 금액: {predicted_claim:.2f}")