Spaces:
Sleeping
Sleeping
File size: 8,093 Bytes
a9bdc1c bf6d3e0 a9bdc1c bf6d3e0 a9bdc1c bf6d3e0 a9bdc1c ff85e80 b2aabc5 cc17b16 a9bdc1c bf6d3e0 a9bdc1c 44176ca a9bdc1c cc17b16 a9bdc1c cc17b16 a9bdc1c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Load
DATA_PATH = "datasets/Delhi.csv"
# Columns kept from the raw CSV; "Location" is one-hot encoded during preprocessing.
features = [
    "Location", "Area", "No. of Bedrooms", "Resale", "SwimmingPool", "CarParking",
    "AC", "Wifi", "Microwave", "TV", "DiningTable",
    "Sofa", "Wardrobe", "Refrigerator"
]


@st.cache_resource
def _load_and_train(csv_path, feature_columns):
    """Load the CSV, preprocess it, and fit the random-forest regressor once.

    Streamlit re-executes this script on every widget interaction; without
    caching, the CSV would be re-read and the 500-tree forest refit on each
    rerun. ``st.cache_resource`` keeps the result keyed on the arguments.

    Args:
        csv_path: Path of the CSV file to read.
        feature_columns: Tuple of column names to keep alongside "Price".

    Returns:
        A tuple ``(frame, inputs, target, in_train, in_val, tgt_train,
        tgt_val, forest, val_pred)``: the one-hot encoded frame, the feature
        matrix, the price target, the train/validation splits, the fitted
        model, and its predictions on the validation split.

    Note:
        The returned objects are shared between reruns and are not copied,
        so callers must treat them as read-only. The cache is not invalidated
        when the CSV changes on disk.
    """
    frame = pd.read_csv(csv_path)
    frame = frame[["Price"] + list(feature_columns)]
    # Preprocess: one-hot encode locations. drop_first=True removes the first
    # location category, which becomes the implicit all-zeros baseline.
    frame = pd.get_dummies(frame, columns=["Location"], drop_first=True)
    inputs = frame.drop("Price", axis=1)
    target = frame["Price"]
    # Train on 80% of the rows; the remaining 20% is held out for validation.
    in_train, in_val, tgt_train, tgt_val = train_test_split(
        inputs, target, test_size=0.2, random_state=1
    )
    forest = RandomForestRegressor(n_estimators=500, random_state=1)
    forest.fit(in_train, tgt_train)
    val_pred = forest.predict(in_val)
    return frame, inputs, target, in_train, in_val, tgt_train, tgt_val, forest, val_pred


# The features list is passed as a tuple so it is a hashable cache key.
(df, X, y, X_train, X_val, y_train, y_val, model, y_pred) = _load_and_train(
    DATA_PATH, tuple(features)
)
# Reference point used by the predictor tab to categorise a predicted price.
median_price = y.median()
# App: page heading, dataset attribution, and the three top-level tabs.
st.title("Regression: Delhi Housing Price Prediction")
st.caption(
    "dataset: https://www.kaggle.com/code/ruchi798/housing-prices-eda-and-prediction/input"
    " -> Housing Prices in Metropolitan Areas of India/Delhi.csv"
)
tab_labels = ["Model Performance", "Dataset", "Price Predictor"]
tab1, tab2, tab3 = st.tabs(tab_labels)
with tab1:
    # Model Assessment
    st.header("Model Performance")
    # Compute regression evaluation metrics on the held-out validation split.
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    st.caption("Sala sala ang evaluation dunno why, prolly irrelevant features or gasinala preprocessing ko. w/e")
    # Display regression metrics
    st.write(f"**Mean Absolute Error (MAE):** {mae:,.2f}")
    st.write(f"**Mean Squared Error (MSE):** {mse:,.2f}")
    st.write(f"**R² Score:** {r2:.2f}")
    st.divider()
    # Feature Importance
    st.subheader("Feature Importance")
    feature_importance = model.feature_importances_
    feature_names = X.columns
    # Aggregate one-hot encoded locations: every "Location_*" column is
    # relabelled "Location" and the importances are summed, so the chart shows
    # a single bar for location instead of one bar per dummy column.
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df.loc[importance_df['Feature'].str.startswith("Location_"), 'Feature'] = "Location"
    importance_df = importance_df.groupby("Feature", as_index=False).sum()
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'], palette='coolwarm', ax=ax)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Feature Importance in Delhi Housing Price Prediction")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; this
    # script runs again on each interaction, so close the figure once rendered
    # to avoid accumulating open figures.
    plt.close(fig)
    st.caption("Well, I tried to make a practical predictor based on amenities but it turns out that location is the most important feature, amenities are irrelevant.")
with tab2:
    # Dataset
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Read the raw CSV; cached by Streamlit so reruns skip the disk read."""
        return pd.read_csv("datasets/Delhi.csv")

    dataset = load()
    dataset_processed = df

    def preprocess(data):
        """Return *data* without the "Location" column (display-only preprocessing)."""
        return data.drop(columns=["Location"], errors="ignore")

    def corr(data, title):
        """Render a correlation heatmap of the numeric columns of *data*.

        Args:
            data: DataFrame to correlate; non-numeric columns are ignored.
            title: Title drawn above the heatmap.
        """
        data = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(data.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax, annot_kws={"size": 3}, cbar_kws={"shrink": .8})  # Reduce size, lots of features.
        ax.set_title(title)
        st.pyplot(fig)
        # Close the figure after rendering so reruns do not accumulate open
        # matplotlib figures.
        plt.close(fig)

    corr(preprocess(dataset), "Correlation Matrix")
    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Location] feature is missing cos' it is string data, and was one-hotted.")
    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    # Show every row of both frames, reversed when "Bottom -> Top" is chosen.
    raw_view, processed_view = dataset, dataset_processed
    if view_type == "Bottom -> Top":
        raw_view, processed_view = raw_view.iloc[::-1], processed_view.iloc[::-1]
    st.caption("datasets/Delhi.csv")
    st.dataframe(raw_view)
    st.caption("df")
    st.dataframe(processed_view)
    st.caption("Lots of unique values = omega long onehot encoded feature list.")
    st.divider()
with tab3:
    # User Input
    st.header("Price Prediction")
    col1, col2 = st.columns(2)
    with col1:
        area = st.slider("Area (sq. ft)", 500, 5000, 1500)
        bedrooms = st.slider("Number of Bedrooms", 1, 6, 3)
        # True when the user picks "Resale", so the flag matches the "Resale"
        # feature it feeds (the comparison was previously against "New",
        # which inverted the feature).
        is_resale = st.radio("Resale or New?", ["Resale", "New"]) == "Resale"
        # NOTE(review): the dummies are built with drop_first=True, so the
        # first location category has no "Location_*" column and cannot be
        # selected here.
        location_prefix = "Location_"
        location_columns = df.columns[df.columns.str.startswith(location_prefix)]
        # Slice off only the leading prefix to recover the location names.
        location = st.selectbox("Location", location_columns.str[len(location_prefix):])
    with col2:
        is_swimming_pool = st.checkbox("Has Swimming Pool")
        is_car_parking = st.checkbox("Has Car Parking")
        is_ac = st.checkbox("Has Air Conditioning")
        is_wifi = st.checkbox("Has Wifi")
        is_microwave = st.checkbox("Has Microwave")
        is_tv = st.checkbox("Has TV")
        is_dining_table = st.checkbox("Has Dining Table")
        is_sofa = st.checkbox("Has Sofa")
        is_wardrobe = st.checkbox("Has Wardrobe")
        is_refrigerator = st.checkbox("Has Refrigerator")
    # Process Input Data
    # Values are keyed by training column name rather than by position, so the
    # row stays correct if the feature order ever changes.
    input_values = {
        "Area": area,
        "No. of Bedrooms": bedrooms,
        "Resale": int(is_resale),
        "SwimmingPool": int(is_swimming_pool),
        "CarParking": int(is_car_parking),
        "AC": int(is_ac),
        "Wifi": int(is_wifi),
        "Microwave": int(is_microwave),
        "TV": int(is_tv),
        "DiningTable": int(is_dining_table),
        "Sofa": int(is_sofa),
        "Wardrobe": int(is_wardrobe),
        "Refrigerator": int(is_refrigerator),
        # Set the selected location's dummy column to 1.
        f"{location_prefix}{location}": 1,
    }
    # Align to the training columns; every unselected location dummy becomes 0.
    input_data = pd.DataFrame([input_values]).reindex(columns=X.columns, fill_value=0)
    # Predict & Output
    # A DataFrame carrying the training column names is passed so the input
    # matches the feature names the model was fitted with.
    predicted_price = model.predict(input_data)[0]
    # Pick a colour, category and description from how far the prediction
    # sits from the dataset-wide median price (relative difference).
    price_diff = (predicted_price - median_price) / median_price
    if price_diff < -0.2:
        color = "#ff4d4d"  # Below Median
        category = "Below Median Price"
        description = "This price is significantly lower than the median price in this area. The property may lack premium features and amenities or be in a less desirable location."
    elif price_diff < -0.05:
        color = "#ff944d"  # Slightly Below
        category = "Slightly Below Median Price"
        description = "This price is slightly below the median range, which could indicate a competitive offer for budget-conscious buyers."
    elif price_diff <= 0.05:
        color = "#ffff4d"  # In Median Range
        category = "In Median Price Range"
        description = "This price falls within the typical range for this area, making it a standard market price."
    elif price_diff <= 0.2:
        color = "#94ff4d"  # Slightly Above
        category = "Slightly Above Median Price"
        description = "This price is slightly higher than the median, possibly due to added features such as better amenities or a prime location."
    else:
        color = "#4dff4d"  # Above Median
        category = "Above Median Price"
        description = "This price is significantly above the median, suggesting a premium property with high-end features, amenities, and an excellent location."
    # Result
    st.subheader("Predicted House Price (INR)")
    st.write(f"₹{predicted_price:,.2f}")
    # color and category come only from the constants above, so the raw HTML
    # contains no user-supplied text.
    st.markdown(f'<h3 style="color:{color};">{category}</h3>', unsafe_allow_html=True)
    st.write(description)
    st.caption("Dataset is weird so expect anomalous output (negative prices, omega-high prices, etc.).")
    st.divider()