# allantacuelwvsu's picture
# add dataset section
# cc17b16
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Load
df = pd.read_csv("datasets/Delhi.csv")
features = [
    "Location", "Area", "No. of Bedrooms", "Resale", "SwimmingPool", "CarParking",
    "AC", "Wifi", "Microwave", "TV", "DiningTable",
    "Sofa", "Wardrobe", "Refrigerator",
]
df = df[["Price"] + features]

# Preprocess
# NOTE(review): drop_first=True removes the first location level, so that
# location gets no "Location_*" column and cannot be picked in the predictor
# tab, which builds its choices from these columns. Confirm this is intended.
df = pd.get_dummies(df, columns=["Location"], drop_first=True)  # One-hot encode locations
X = df.drop("Price", axis=1)
y = df["Price"]
median_price = y.median()


# Train
@st.cache_resource
def _fit_model(X_train, y_train):
    """Fit the random-forest regressor once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction, so an
    uncached fit would retrain all 500 trees each time a slider or checkbox
    changes. The fit is seeded (random_state=1), so the cached model is the
    same one a fresh fit on the same data would produce.

    Args:
        X_train: Training feature frame.
        y_train: Training target series.

    Returns:
        The fitted RandomForestRegressor.
    """
    forest = RandomForestRegressor(n_estimators=500, random_state=1)
    forest.fit(X_train, y_train)
    return forest


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
model = _fit_model(X_train, y_train)
y_pred = model.predict(X_val)
# App: page title, dataset attribution, and the three top-level tabs.
tab_labels = ["Model Performance", "Dataset", "Price Predictor"]
dataset_credit = "dataset: https://www.kaggle.com/code/ruchi798/housing-prices-eda-and-prediction/input -> Housing Prices in Metropolitan Areas of India/Delhi.csv"

st.title("Regression: Delhi Housing Price Prediction")
st.caption(dataset_credit)
tab1, tab2, tab3 = st.tabs(tab_labels)
with tab1:
    # Model assessment on the held-out validation split (y_val vs. y_pred).
    st.header("Model Performance")

    # Compute regression evaluation metrics
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    st.caption("Sala sala ang evaluation dunno why, prolly irrelevant features or gasinala preprocessing ko. w/e")

    # Display regression metrics
    st.write(f"**Mean Absolute Error (MAE):** {mae:,.2f}")
    st.write(f"**Mean Squared Error (MSE):** {mse:,.2f}")
    st.write(f"**R² Score:** {r2:.2f}")
    st.divider()

    # Feature Importance
    st.subheader("Feature Importance")
    feature_importance = model.feature_importances_
    feature_names = X.columns

    # Aggregate one-hot encoded locations: every "Location_*" column is
    # relabelled "Location" and the importances are summed into one bar.
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df.loc[importance_df['Feature'].str.startswith("Location_"), 'Feature'] = "Location"
    importance_df = importance_df.groupby("Feature", as_index=False).sum()
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'], palette='coolwarm', ax=ax)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Feature Importance in Delhi Housing Price Prediction")
    st.pyplot(fig)
    # The script reruns on every interaction; without closing, each rerun
    # leaves another figure registered in pyplot's global state.
    plt.close(fig)
    st.caption("Well, I tried to make a practical predictor based on amenities but it turns out that location is the most important feature, amenities are irrelevant.")
with tab2:
    # Dataset
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Read the raw listings CSV; Streamlit caches the result across reruns."""
        return pd.read_csv("datasets/Delhi.csv")

    dataset = load()
    dataset_processed = df

    # Quick preprocess, just for display
    def preprocess(data):
        """Return ``data`` without its "Location" column (no error if absent)."""
        return data.drop(columns=["Location"], errors="ignore")

    def corr(data, title):
        """Render a correlation heatmap of the numeric columns of ``data``.

        Args:
            data: Frame whose numeric columns are correlated.
            title: Title drawn above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        # Reduce annotation size, lots of features.
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax, annot_kws={"size": 3}, cbar_kws={"shrink": .8})
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure; otherwise one accumulates per script rerun.
        plt.close(fig)

    corr(preprocess(dataset), "Correlation Matrix")
    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Location] feature is missing cos' it is string data, and was one-hotted.")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

    # Show both frames in full, either in file order or reversed.
    reverse_rows = view_type == "Bottom -> Top"
    raw_view = dataset.iloc[::-1] if reverse_rows else dataset
    processed_view = dataset_processed.iloc[::-1] if reverse_rows else dataset_processed

    st.caption("datasets/Delhi.csv")
    st.dataframe(raw_view)
    st.caption("df")
    st.dataframe(processed_view)
    st.caption("Lots of unique values = omega long onehot encoded feature list.")
    st.divider()
with tab3:
    # User Input
    st.header("Price Prediction")
    col1, col2 = st.columns(2)
    with col1:
        area = st.slider("Area (sq. ft)", 500, 5000, 1500)
        bedrooms = st.slider("Number of Bedrooms", 1, 6, 3)
        # True when the user picks "Resale", so the flag lines up with the
        # "Resale" feature it is written into below.
        # NOTE(review): assumes the CSV encodes resale listings as 1 — confirm.
        is_resale = st.radio("Resale or New?", ["Resale", "New"]) == "Resale"
        # Offer every one-hot location column, shown without its prefix.
        location_prefix = "Location_"
        location_options = [
            column[len(location_prefix):]
            for column in X.columns
            if column.startswith(location_prefix)
        ]
        location = st.selectbox("Location", location_options)
    with col2:
        is_swimming_pool = st.checkbox("Has Swimming Pool")
        is_car_parking = st.checkbox("Has Car Parking")
        is_ac = st.checkbox("Has Air Conditioning")
        is_wifi = st.checkbox("Has Wifi")
        is_microwave = st.checkbox("Has Microwave")
        is_tv = st.checkbox("Has TV")
        is_dining_table = st.checkbox("Has Dining Table")
        is_sofa = st.checkbox("Has Sofa")
        is_wardrobe = st.checkbox("Has Wardrobe")
        is_refrigerator = st.checkbox("Has Refrigerator")

    # Process Input Data
    # Fill the feature vector by column name rather than by hard-coded
    # position, so it stays aligned with X if the column order ever changes.
    feature_values = {
        "Area": area,
        "No. of Bedrooms": bedrooms,
        "Resale": int(is_resale),
        "SwimmingPool": int(is_swimming_pool),
        "CarParking": int(is_car_parking),
        "AC": int(is_ac),
        "Wifi": int(is_wifi),
        "Microwave": int(is_microwave),
        "TV": int(is_tv),
        "DiningTable": int(is_dining_table),
        "Sofa": int(is_sofa),
        "Wardrobe": int(is_wardrobe),
        "Refrigerator": int(is_refrigerator),
    }
    input_data = np.zeros(len(X.columns))  # Every unselected location stays 0
    for column, value in feature_values.items():
        input_data[X.columns.get_loc(column)] = value

    # Set the correct location column to 1
    loc_index = X.columns.get_loc(f"{location_prefix}{location}")
    input_data[loc_index] = 1

    # Predict & Output
    # The model was fitted on a DataFrame, so predict on a one-row DataFrame
    # with the same columns to keep scikit-learn's feature-name check quiet.
    input_frame = pd.DataFrame([input_data], columns=X.columns)
    predicted_price = model.predict(input_frame)[0]

    # Pick the highlight color and description from how far the prediction
    # sits from the dataset-wide median price.
    price_diff = (predicted_price - median_price) / median_price
    if price_diff < -0.2:
        color = "#ff4d4d"  # Below Median
        category = "Below Median Price"
        description = "This price is significantly lower than the median price in this area. The property may lack premium features and amenities or be in a less desirable location."
    elif price_diff < -0.05:
        color = "#ff944d"  # Slightly Below
        category = "Slightly Below Median Price"
        description = "This price is slightly below the median range, which could indicate a competitive offer for budget-conscious buyers."
    elif price_diff <= 0.05:
        color = "#ffff4d"  # In Median Range
        category = "In Median Price Range"
        description = "This price falls within the typical range for this area, making it a standard market price."
    elif price_diff <= 0.2:
        color = "#94ff4d"  # Slightly Above
        category = "Slightly Above Median Price"
        description = "This price is slightly higher than the median, possibly due to added features such as better amenities or a prime location."
    else:
        color = "#4dff4d"  # Above Median
        category = "Above Median Price"
        description = "This price is significantly above the median, suggesting a premium property with high-end features, amenities, and an excellent location."

    # Result
    st.subheader("Predicted House Price (INR)")
    st.write(f"₹{predicted_price:,.2f}")
    # color and category are fixed strings chosen above, not user input.
    st.markdown(f'<h3 style="color:{color};">{category}</h3>', unsafe_allow_html=True)
    st.write(description)
    st.caption("Dataset is weird so expect anomalous output (negative prices, omega-high prices, etc.).")
    st.divider()