# +++ | |
# Import the libraries | |
#--------------------------------------------------------------------------------------------------------- | |
import os | |
import uuid | |
import joblib | |
import json | |
# IMPORTANT: the "gradio" package is already installed in my current Virtual Environment (VEnvDSDIL_gpu_Py3.12) via: pip install -q gradio | |
# Do NOT install the package again through Anaconda/conda, otherwise it will clobber the pip-installed copy. | |
import gradio as gr | |
import pandas as pd | |
# must install the package "huggingface_hub" first in the current python Virtual Environment, with pip, not with conda, as follows | |
# pip install huggingface_hub | |
# i.e., in the command line interface within the activated Virtual Environment: | |
# (VEnvDSDIL_gpu_Py3.12) epalvarez@DSDILmStation01:~ $ pip install huggingface_hub | |
from huggingface_hub import CommitScheduler | |
from pathlib import Path | |
#--------------------------------------------------------------------------------------------------------- | |
# Run the training script placed in the same directory as app.py.
# The training script trains and persists a linear regression
# model under the filename 'model_ic.joblib'.
import subprocess
import sys

print("\n... Initializing train_ic.py\n")
# Use the interpreter running this app (sys.executable) rather than a bare
# 'python', so training runs inside the same virtual environment, and let
# check=True raise immediately if training fails instead of silently
# continuing to load a stale/missing model.
subprocess.run([sys.executable, "train_ic.py"], check=True)
print("\n... train_ic.py initialized.\n")

# Load the freshly trained model from disk.
# joblib.load reconstructs the Python object persisted with joblib.dump
# and returns the object stored in the file.
current_directory = Path.cwd()
print(f"current_directory: {current_directory}\n")
# Use joinpath to build the serialized-model path next to app.py.
saved_model_file_path = current_directory.joinpath("model_ic.joblib")
print(f"saved_model_file_path: {saved_model_file_path}\n")
insurance_charge_predictor = joblib.load(filename=saved_model_file_path)

# Prepare the logging functionality: one uniquely named JSON-lines file per
# process, kept under logs/.
log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
log_folder = log_file.parent
# Create the folder up front so the first append in the predict function
# cannot fail with FileNotFoundError.
log_folder.mkdir(parents=True, exist_ok=True)
print(f"\nInformation:\n\tlog_file: {log_file}\n\tlog_folder: {log_folder}\n")

# The scheduler pushes the contents of log_folder to a Hugging Face dataset
# repo every 2 *minutes* (CommitScheduler's `every` is a minute interval,
# not a call count).
scheduler = CommitScheduler(
    repo_id="insurance-charge-mlops-logs",
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2
)
# Define the "predict function": takes the six features, builds a one-row
# DataFrame and predicts with the saved model. It runs when 'Submit' is
# clicked in the UI or when an API request is made.
#-------------------------------------------------------------------------------------------------------------------------------------------------------------
def predict_insurance_charge(age, bmi, children, sex, smoker, region):
    """Predict the insurance charge for one customer and log the request.

    The dict keys below must exactly match the column names the model was
    trained on (age, bmi, children, sex, smoker, region); renaming any key
    causes a runtime error when the model consumes the DataFrame.

    Returns the predicted charge as a plain Python scalar.
    """
    sample = {
        'age': age,
        'bmi': bmi,
        'children': children,
        'sex': sex,
        'smoker': smoker,
        'region': region
    }
    data_point = pd.DataFrame([sample])
    # .predict(...) on this model yields a nested structure after .tolist(),
    # so the scalar prediction lives at [0][0].
    prediction = insurance_charge_predictor.predict(data_point).tolist()
    prediction_result = prediction[0][0]

    # Log inputs and output as one JSON line. Hold the commit scheduler's
    # lock while writing so a background push to the Hugging Face dataset
    # repo never sees a half-written record.
    with scheduler.lock:
        with log_file.open("a") as f:
            # Reuse `sample` instead of re-spelling every key, so the logged
            # keys can never drift from the DataFrame columns.
            f.write(json.dumps({**sample, 'prediction': prediction_result}))
            f.write("\n")

    print(f"\nPrediction result: {prediction_result} - {type(prediction_result)}\n")
    return prediction_result
#--------------------------------------------------------------------------------------------------------------------------------------------------------------
# Set up the UI: one input widget per model feature, in the exact order
# predict_insurance_charge expects its arguments, plus one output label.
feature_inputs = [
    gr.Number(label="Age [attained years]"),
    gr.Number(label="BMI"),
    gr.Number(label="Children [#]"),
    gr.Dropdown(["male", "female"], label="Sex"),
    gr.Dropdown(["no", "yes"], label="Smoker"),
    gr.Dropdown(["southeast", "southwest", "northeast", "northwest"], label="Region"),
]
charge_output = gr.Label(label="Insurance Charge [$]")

# Assemble the Gradio interface around the prediction function.
demo = gr.Interface(
    fn=predict_insurance_charge,
    inputs=feature_inputs,
    outputs=charge_output,
    title="Insurance Charge Predictor",
    description="This API allows you to predict the appropriate insurance charge based on the input parameters.",
    allow_flagging="auto",  # automatically push flagged samples to the HuggingFace Dataset
    concurrency_limit=8,
)

# Enable request queueing, then serve locally.
# To create a public link, set share=True in launch() -- but when running
# app.py locally, this machine must stay on for public users to reach the
# browser interface.
demo.queue()
demo.launch(share=False)