# ICS5110 / app.py
# Gangsterbra123's picture
# Upload 17 files
# 233eb38 verified
# history blame
# 15.1 kB
import gradio as gr
import pickle
import pandas as pd
import ast
import numpy as np
import os
import matplotlib.pyplot as plt
# Set the option to opt into future behavior
pd.set_option('future.no_silent_downcasting', True)
# List of options for the dropdown
[("SVM - Jerome Agius", 0), ("Logistic Regression - Isaac Muscat", 1), ("Random Forest - Kyle Demicoli", 2)]
workclass_options = [('State Government', 'State-gov'),
('Self Employed Not Incorporated', 'Self-emp-not-inc'),
'Private', ('Federal Government', 'Federal-gov'), ('Local Government', 'Local-gov'), ('Self Employed Incorporated', 'Self-emp-inc'), ('Without Pay', 'Without-pay')]
education_option = [('Pre-School', 'Preschool'), '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', ('High School Graduate', 'HS-grad'), ('Collage', 'Some-college'), ('Associate Degree - Vocational', 'Assoc-voc'), ('Associate Degree - Academic', 'Assoc-acdm'), 'Bachelors', 'Masters', ('Professional School', 'Prof-school'), 'Doctorate']
marital_status_option = [('Never Married','Never-married'), ('Married Civilian Spouse', 'Married-civ-spouse'), 'Divorced', 'Separated', ('Married Armed Forces Spouse', 'Married-AF-spouse'), 'Widowed', ('Married Spouse Absent', 'Married-spouse-absent')]
occupation_option = [('Administrative Clerical', 'Adm-clerical'), ('Executive Managerial', 'Exec-managerial'), ('Handlers and Cleaners', 'Handlers-cleaners'), ('Professional Specialty', 'Prof-specialty'), 'Sales', ('Farming and Fishing', 'Farming-fishing'), ('Machine Operator and Inspector', 'Machine-op-inspct'), ('Other Service', 'Other-service'), ('Transport and Moving', 'Transport-moving'), ('Technical Support', 'Tech-support'), ('Craft and Repair', 'Craft-repair'), ('Protective Services', 'Protective-serv'), ('Armed Forces', 'Armed-Forces'), ('Private Household Services' ,'Priv-house-serv')]
relationship_option = [('Not In Family', 'Not-in-family'), 'Husband', 'Wife', ('Biological Child', 'Own-child'), 'Unmarried', ('Other Relative', 'Other-relative')]
race_option = ['White', 'Black', 'Other', ('Asian', 'Asian-Pac-Islander'), ('Indian', 'Amer-Indian-Eskimo')]
sex_option = sorted(['Male', 'Female'])
age = [0, 100]
capital_gain = [0, 99999]
capital_loss = [0, 4356]
hours_per_week = [20, 60]
children_count = [0, 15]
bmi = [10, 100]
region_option = ['southwest', 'southeast', 'northwest', 'northeast']
smoker_option = ['yes', 'no']
# Mapping for education
education_mapping = "{'Preschool': 1, '1st-4th': 2, '5th-6th': 3, '7th-8th': 4, '9th': 5, '10th': 6, '11th': 7, '12th': 8, 'HS-grad': 9, 'Some-college': 10, 'Assoc-voc': 11, 'Assoc-acdm': 12, 'Bachelors': 13, 'Masters': 14, 'Prof-school': 15, 'Doctorate': 16}"
education_dict = ast.literal_eval(education_mapping)
# List of the columns present in dataframe used to train the model
salary_columns = ['age', 'education-num', 'sex', 'capital-gain', 'capital-loss',
'hours-per-week', 'workclass_Local-gov', 'workclass_Private',
'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
'workclass_State-gov', 'workclass_Without-pay',
'marital-status_Married-AF-spouse', 'marital-status_Married-civ-spouse',
'marital-status_Married-spouse-absent', 'marital-status_Never-married',
'marital-status_Separated', 'marital-status_Widowed',
'occupation_Armed-Forces', 'occupation_Craft-repair',
'occupation_Exec-managerial', 'occupation_Farming-fishing',
'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
'occupation_Other-service', 'occupation_Priv-house-serv',
'occupation_Prof-specialty', 'occupation_Protective-serv',
'occupation_Sales', 'occupation_Tech-support',
'occupation_Transport-moving', 'relationship_Not-in-family',
'relationship_Other-relative', 'relationship_Own-child',
'relationship_Unmarried', 'relationship_Wife', 'race_Asian-Pac-Islander',
'race_Black', 'race_Other', 'race_White']
health_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region_northwest', 'region_southeast', 'region_southwest']
# Code for the salary and health-charges classifiers (SVM, LR, Random Forest)

# Value sent by the "Model" dropdown -> name shown in the result text.
_MODEL_NAMES = {0: "SVM", 1: "Logistic Regression", 2: "Random Forest"}

# Value sent by the "Model" dropdown -> (model pickle, scaler pickle).
_SALARY_ARTIFACTS = {
    0: ('models/best_svm_OvM_Salary_Classification.pkl',
        'models/z-score_scaler_svm_salary_classification.pkl'),
    1: ('models/best_lr_Salary_Classification.pkl',
        'models/z-score_scaler_lr_salary_classification.pkl'),
    2: ('models/best_rf_Salary_Classification.pkl',
        'models/z-score_scaler_rf_salary_classification.pkl'),
}
_HEALTH_ARTIFACTS = {
    0: ('models/best_health_svm_OvM_Charges_Classification.pkl',
        'models/z-score_scaler_svm_charges_classification.pkl'),
    1: ('models/best_health_lr_Charges_Classification.pkl',
        'models/z-score_scaler_lr_charges_classification.pkl'),
    2: ('models/best_rf_Charges_Classification.pkl',
        'models/z-score_scaler_rf_charges_classification.pkl'),
}

# Predicted class index -> label shown to the user.
_SALARY_CLASSES = {
    0: '<=50K',
    1: '>50K',
}
_CHARGES_CLASSES = {
    0: 'Very Low (<= 5000)',
    1: 'Low (5001 - 10000)',
    2: 'Moderate (10001 - 15000)',
    3: 'High (15001 - 20000)',
    4: 'Very High (> 20001)',
}


def _require_selected(**selections):
    """Raise a user-visible error naming every input that is still None."""
    missing = [name for name, value in selections.items() if value is None]
    if missing:
        raise gr.Error("Please select a value for: " + ", ".join(missing))


def _load_artifacts(model, artifacts):
    """Load the pickled model and scaler registered for a dropdown value.

    Args:
        model: Value sent by the "Model" dropdown (0, 1 or 2).
        artifacts: Mapping from that value to (model path, scaler path).

    Returns:
        Tuple of (display name, unpickled model, unpickled scaler).

    Raises:
        gr.Error: If `model` is not a key of `artifacts`.
    """
    if model not in artifacts:
        raise gr.Error(f"Unknown model selection: {model!r}")
    model_path, scaler_path = artifacts[model]
    # SECURITY: pickle.load runs arbitrary code from the file. Only ever point
    # these paths at artifacts shipped with the app, never at uploaded data.
    with open(model_path, 'rb') as f:
        loaded_model = pickle.load(f)
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)
    return _MODEL_NAMES[model], loaded_model, scaler


def _plot_class_probabilities(probabilities, class_dict):
    """Draw a pie chart of one sample's class probabilities.

    Args:
        probabilities: Per-class probabilities for a single sample.
        class_dict: Mapping from class index (0..n-1) to legend label.

    Returns:
        The matplotlib figure holding the chart.
    """
    num_classes = len(probabilities)
    class_labels = [class_dict[i] for i in range(num_classes)]
    # Use a colormap for consistent colors
    colors = plt.cm.viridis(np.linspace(0, 1, num_classes))
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.pie(probabilities, colors=colors, autopct='%1.1f%%', startangle=140, pctdistance=1.1)
    # Create a legend with colored boxes
    legend_elements = [
        plt.Rectangle((0, 0), 1, 1, color=color, label=label)
        for color, label in zip(colors, class_labels)
    ]
    ax.legend(handles=legend_elements, loc='upper left')
    ax.set_title("Predicted Class Probabilities")
    # Drop the figure from pyplot's global registry so repeated requests do
    # not accumulate open figures; the returned object can still be rendered.
    plt.close(fig)
    return fig


def Salary(model, workclass, education, marital_status, occupation, relationship, race, sex, age, capital_gain, capital_loss, hours_per_week):
    """Predict the salary class for one person and chart the class probabilities.

    Args:
        model: Value sent by the "Model" dropdown (0 = SVM,
            1 = Logistic Regression, 2 = Random Forest).
        workclass, marital_status, occupation, relationship, race: Category
            values; each is one-hot encoded into the matching
            '<feature>_<value>' column of `salary_columns` when that column
            exists (otherwise all columns of the feature stay 0).
        education: Category value mapped to 'education-num' via `education_dict`.
        sex: 'Male' is encoded as 1, anything else as 0.
        age, capital_gain, capital_loss, hours_per_week: Numeric inputs.

    Returns:
        Tuple of (result text, matplotlib figure with the probability pie chart).

    Raises:
        gr.Error: If a dropdown was left unselected or `model` is unknown.
    """
    _require_selected(
        workclass=workclass, education=education, marital_status=marital_status,
        occupation=occupation, relationship=relationship, race=race, sex=sex,
    )
    model_used, loaded_model, scaler = _load_artifacts(model, _SALARY_ARTIFACTS)

    # Start from an all-zero row in the training column order.
    row = dict.fromkeys(salary_columns, 0)
    row['age'] = age
    # Unknown education values fall back to 0.
    row['education-num'] = education_dict.get(education, 0)
    row['sex'] = 1 if sex == 'Male' else 0
    row['capital-gain'] = capital_gain
    row['capital-loss'] = capital_loss
    row['hours-per-week'] = hours_per_week
    one_hot_columns = (
        f'workclass_{workclass}',
        f'marital-status_{marital_status}',
        f'occupation_{occupation}',
        f'relationship_{relationship}',
        f'race_{race}',
    )
    for column in one_hot_columns:
        # Values without a column in salary_columns are left as all zeros.
        if column in row:
            row[column] = 1
    formattedDF = pd.DataFrame([row], columns=salary_columns).astype(int)

    # Log-transform the two capital columns before scaling.
    for column in ['capital-gain', 'capital-loss']:
        formattedDF[column] = np.log1p(formattedDF[column])
    # Apply the scaler to the unseen data
    continuous_columns = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    formattedDF[continuous_columns] = scaler.transform(formattedDF[continuous_columns])

    # Make predictions with the loaded model
    prediction = loaded_model.predict(formattedDF)
    probability = loaded_model.predict_proba(formattedDF)
    # Only one sample is submitted, so chart the first row of probabilities.
    fig = _plot_class_probabilities(probability[0], _SALARY_CLASSES)
    salary_result = '<=50K' if prediction[0] == 0 else '>50K'
    return f"Predicted using {model_used} Salary Class: {salary_result}", fig


def Health(model, age, sex, bmi, children, smoker, region):
    """Predict the health-charges class for one person and chart the probabilities.

    Args:
        model: Value sent by the "Model" dropdown (0 = SVM,
            1 = Logistic Regression, 2 = Random Forest).
        age: Numeric input.
        sex: 'Male' is encoded as 1, anything else as 0.
        bmi: Numeric input.
        children: Numeric input.
        smoker: 'yes' (any letter case) is encoded as 1, anything else as 0.
        region: Category value, one-hot encoded into 'region_<value>' when
            that column exists in `health_columns`.

    Returns:
        Tuple of (result text, matplotlib figure with the probability pie chart).

    Raises:
        gr.Error: If a dropdown was left unselected or `model` is unknown.
    """
    _require_selected(sex=sex, smoker=smoker, region=region)
    model_used, loaded_model, scaler = _load_artifacts(model, _HEALTH_ARTIFACTS)

    # Start from an all-zero row in the training column order.
    row = dict.fromkeys(health_columns, 0)
    row['age'] = age
    row['sex'] = 1 if sex == 'Male' else 0
    row['bmi'] = bmi
    row['children'] = children
    # The form submits lowercase 'yes'/'no'; compare case-insensitively.
    row['smoker'] = 1 if str(smoker).lower() == 'yes' else 0
    region_column = f'region_{region}'
    if region_column in row:
        row[region_column] = 1
    # NOTE(review): astype(int) also truncates a fractional BMI before scaling.
    # Kept as in the original pipeline; confirm against how training
    # preprocessed 'bmi'.
    formattedDF = pd.DataFrame([row], columns=health_columns).astype(int)

    # Apply the scaler to the unseen data
    continuous_columns = ['age', 'bmi']
    formattedDF[continuous_columns] = scaler.transform(formattedDF[continuous_columns])

    # Make predictions with the loaded model
    prediction = _CHARGES_CLASSES[loaded_model.predict(formattedDF)[0]]
    probability = loaded_model.predict_proba(formattedDF)
    # Only one sample is submitted, so chart the first row of probabilities.
    fig = _plot_class_probabilities(probability[0], _CHARGES_CLASSES)
    return f"Predicted using {model_used} Charges Class: {prediction}", fig
def _model_dropdown():
    """Build the "Model" dropdown used at the top of each interface."""
    return gr.Dropdown(
        choices=[
            ("SVM - Jerome Agius", 0),
            ("Logistic Regression - Isaac Muscat", 1),
            ("Random Forest - Kyle Demicoli", 2),
        ],
        label="Model",
        value=0,
    )


# interface one: inputs are listed in the parameter order of Salary()
iface1 = gr.Interface(
    fn=Salary,
    inputs=[
        _model_dropdown(),
        gr.Dropdown(choices=workclass_options, label="Workclass"),
        gr.Dropdown(choices=education_option, label="Education"),
        gr.Dropdown(choices=marital_status_option, label="Marital Status"),
        gr.Dropdown(choices=occupation_option, label="Occupation"),
        gr.Dropdown(choices=relationship_option, label="Relationship"),
        gr.Dropdown(choices=race_option, label="Race"),
        gr.Dropdown(choices=sex_option, label="Sex"),
        gr.Slider(minimum=age[0], maximum=age[1], step=1, label="Age"),
        gr.Slider(minimum=capital_gain[0], maximum=capital_gain[1], step=1, label="Capital Gain"),
        gr.Slider(minimum=capital_loss[0], maximum=capital_loss[1], step=1, label="Capital Loss"),
        gr.Slider(minimum=hours_per_week[0], maximum=hours_per_week[1], step=1, label="Hours per Week"),
    ],
    outputs=[gr.Text(label="Predicted Label"), gr.Plot(label="Predicted Class Probabilities")],
    title="SVM - Salary",
)

# interface two: inputs are listed in the parameter order of Health()
iface2 = gr.Interface(
    fn=Health,
    inputs=[
        _model_dropdown(),
        gr.Slider(minimum=age[0], maximum=age[1], step=1, label="Age"),
        gr.Dropdown(choices=sex_option, label="Sex"),
        gr.Slider(minimum=bmi[0], maximum=bmi[1], step=0.1, label="BMI"),
        gr.Slider(minimum=children_count[0], maximum=children_count[1], step=1, label="No. of Children"),
        gr.Dropdown(choices=smoker_option, label="Is Smoker"),
        gr.Dropdown(choices=region_option, label="Region"),
    ],
    outputs=[gr.Text(label="Predicted Label"), gr.Plot(label="Predicted Class Probabilities")],
    title="SVM - Health",
)

# Show both interfaces as tabs of a single app.
demo = gr.TabbedInterface([iface1, iface2], ["Salary Prediction", "Health Charges Prediction"])
# Run the interface