# Hugging Face Spaces page header (Space status: Sleeping) - scrape residue, not part of the script.
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score | |
# Load
DATA_PATH = "datasets/Delhi.csv"
features = [
    "Location", "Area", "No. of Bedrooms", "Resale", "SwimmingPool", "CarParking",
    "AC", "Wifi", "Microwave", "TV", "DiningTable",
    "Sofa", "Wardrobe", "Refrigerator",
]


@st.cache_resource
def _load_and_train(path, feature_names):
    """Load the CSV, one-hot encode ``Location`` and fit the random forest.

    Streamlit re-executes this script from the top on every widget
    interaction. Without caching, each interaction would re-read the CSV and
    refit a 500-tree forest; with ``st.cache_resource`` the work is done once
    per distinct ``(path, feature_names)`` pair.

    Args:
        path: Location of the CSV file to read.
        feature_names: Predictor columns to keep alongside ``"Price"``;
            passed as a tuple so the selection is part of the cache key.

    Returns:
        A tuple ``(frame, X_train, X_val, y_train, y_val, model, y_pred)``
        where ``frame`` is the encoded DataFrame (``Price`` plus predictors)
        and ``y_pred`` holds the model's predictions for ``X_val``.
    """
    frame = pd.read_csv(path)
    frame = frame[["Price"] + list(feature_names)]
    # One-hot encode locations; drop_first removes the first category, which
    # then acts as the implicit baseline (all Location_* columns equal to 0).
    frame = pd.get_dummies(frame, columns=["Location"], drop_first=True)

    predictors = frame.drop("Price", axis=1)
    target = frame["Price"]
    X_train, X_val, y_train, y_val = train_test_split(
        predictors, target, test_size=0.2, random_state=1
    )
    forest = RandomForestRegressor(n_estimators=500, random_state=1)
    forest.fit(X_train, y_train)
    return frame, X_train, X_val, y_train, y_val, forest, forest.predict(X_val)


# NOTE: st.cache_resource hands back the very same objects on every rerun (no
# copy is made), so everything unpacked here must be treated as read-only.
df, X_train, X_val, y_train, y_val, model, y_pred = _load_and_train(
    DATA_PATH, tuple(features)
)

# Preprocess
X = df.drop("Price", axis=1)
y = df["Price"]
median_price = y.median()
# App: page heading, dataset attribution and the three top-level tabs.
st.title("Regression: Delhi Housing Price Prediction")
st.caption(
    "dataset: https://www.kaggle.com/code/ruchi798/housing-prices-eda-and-prediction/input"
    " -> Housing Prices in Metropolitan Areas of India/Delhi.csv"
)
tab_labels = ["Model Performance", "Dataset", "Price Predictor"]
tab1, tab2, tab3 = st.tabs(tab_labels)
with tab1:
    # Model Assessment: validation-split metrics followed by a feature
    # importance chart.
    st.header("Model Performance")

    # Compute regression evaluation metrics on the held-out validation split
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    st.caption("Sala sala ang evaluation dunno why, prolly irrelevant features or gasinala preprocessing ko. w/e")

    # Display regression metrics
    st.write(f"**Mean Absolute Error (MAE):** {mae:,.2f}")
    st.write(f"**Mean Squared Error (MSE):** {mse:,.2f}")
    st.write(f"**R² Score:** {r2:.2f}")
    st.divider()

    # Feature Importance
    st.subheader("Feature Importance")
    feature_importance = model.feature_importances_
    feature_names = X.columns

    # Aggregate one-hot encoded locations: every Location_* column is relabeled
    # "Location" and the importances are summed, so the chart shows a single
    # bar for the whole categorical feature.
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df.loc[importance_df['Feature'].str.startswith("Location_"), 'Feature'] = "Location"
    importance_df = (
        importance_df.groupby("Feature", as_index=False)
        .sum()
        .sort_values(by='Importance', ascending=False)
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    # NOTE(review): recent seaborn releases warn when `palette` is given
    # without `hue`; left unchanged so older seaborn versions keep working.
    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'], palette='coolwarm', ax=ax)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Feature Importance in Delhi Housing Price Prediction")
    st.pyplot(fig)
    # The script reruns on every interaction; without an explicit close each
    # rerun would leave another figure registered in pyplot.
    plt.close(fig)
    st.caption("Well, I tried to make a practical predictor based on amenities but it turns out that location is the most important feature, amenities are irrelevant.")
with tab2:
    # Dataset: correlation heatmap of the raw CSV plus browsable tables of the
    # raw and the encoded data.
    st.header("Dataset")

    @st.cache_data
    def load():
        """Return the raw CSV as a DataFrame; cached so reruns skip the disk read."""
        return pd.read_csv("datasets/Delhi.csv")

    dataset = load()
    dataset_processed = df

    # Quick preprocess, just for display
    def preprocess(data):
        """Return ``data`` without its ``Location`` column (no error if it is absent)."""
        return data.drop(columns=["Location"], errors="ignore")

    def corr(data, title):
        """Render an annotated correlation heatmap of the numeric columns of ``data``.

        Args:
            data: DataFrame to correlate; non-numeric columns are ignored.
            title: Title drawn above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        # Reduce annotation size, lots of features.
        sns.heatmap(
            numeric.corr(),
            annot=True,
            fmt=".2f",
            cmap="coolwarm",
            linewidths=0.5,
            ax=ax,
            annot_kws={"size": 3},
            cbar_kws={"shrink": .8},
        )
        ax.set_title(title)
        st.pyplot(fig)
        # Close the figure so reruns do not pile up open figures in pyplot.
        plt.close(fig)

    corr(preprocess(dataset), "Correlation Matrix")
    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Location] feature is missing cos' it is string data, and was one-hotted.")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    reverse_rows = view_type == "Bottom -> Top"

    # Show both tables in full, reversed row-wise when requested.
    raw_view = dataset.iloc[::-1] if reverse_rows else dataset
    processed_view = dataset_processed.iloc[::-1] if reverse_rows else dataset_processed
    st.caption("datasets/Delhi.csv")
    st.dataframe(raw_view)
    st.caption("df")
    st.dataframe(processed_view)
    st.caption("Lots of unique values = omega long onehot encoded feature list.")
    st.divider()
with tab3:
    # Price Predictor: collect property attributes, predict a price with the
    # trained model and label it relative to the dataset's median price.
    st.header("Price Prediction")

    def _price_band(price_diff):
        """Map a relative deviation from the median to ``(color, category, description)``.

        Args:
            price_diff: ``(predicted - median) / median``; e.g. ``-0.1`` means
                10% below the median.

        Returns:
            A tuple of a CSS hex color, a short band label and a sentence
            describing the band. Bands are cut at -20%, -5%, +5% and +20%.
        """
        if price_diff < -0.2:
            return (
                "#ff4d4d",  # Below Median
                "Below Median Price",
                "This price is significantly lower than the median price in this area. The property may lack premium features and amenities or be in a less desirable location.",
            )
        if price_diff < -0.05:
            return (
                "#ff944d",  # Slightly Below
                "Slightly Below Median Price",
                "This price is slightly below the median range, which could indicate a competitive offer for budget-conscious buyers.",
            )
        if price_diff <= 0.05:
            return (
                "#ffff4d",  # In Median Range
                "In Median Price Range",
                "This price falls within the typical range for this area, making it a standard market price.",
            )
        if price_diff <= 0.2:
            return (
                "#94ff4d",  # Slightly Above
                "Slightly Above Median Price",
                "This price is slightly higher than the median, possibly due to added features such as better amenities or a prime location.",
            )
        return (
            "#4dff4d",  # Above Median
            "Above Median Price",
            "This price is significantly above the median, suggesting a premium property with high-end features, amenities, and an excellent location.",
        )

    # Selectable locations are recovered from the one-hot column names by
    # slicing off the prefix (anchored to the start of the name).
    # NOTE(review): the encoding uses drop_first=True, so the first location
    # category has no Location_* column and therefore cannot be picked here.
    location_prefix = "Location_"
    location_options = [
        name[len(location_prefix):]
        for name in X.columns
        if name.startswith(location_prefix)
    ]

    # User Input
    col1, col2 = st.columns(2)
    with col1:
        area = st.slider("Area (sq. ft)", 500, 5000, 1500)
        bedrooms = st.slider("Number of Bedrooms", 1, 6, 3)
        # True when the user picks "Resale" (the original compared against
        # "New", which inverted the flag fed into the "Resale" feature).
        # Assumes the dataset encodes resale properties as Resale == 1 - TODO confirm.
        is_resale = st.radio("Resale or New?", ["Resale", "New"]) == "Resale"
        location = st.selectbox("Location", location_options)
    with col2:
        is_swimming_pool = st.checkbox("Has Swimming Pool")
        is_car_parking = st.checkbox("Has Car Parking")
        is_ac = st.checkbox("Has Air Conditioning")
        is_wifi = st.checkbox("Has Wifi")
        is_microwave = st.checkbox("Has Microwave")
        is_tv = st.checkbox("Has TV")
        is_dining_table = st.checkbox("Has Dining Table")
        is_sofa = st.checkbox("Has Sofa")
        is_wardrobe = st.checkbox("Has Wardrobe")
        is_refrigerator = st.checkbox("Has Refrigerator")

    # Process Input Data: map every value to its training column by name
    # rather than by position, so a reordered feature list cannot silently
    # shift values into the wrong column.
    feature_values = {
        "Area": area,
        "No. of Bedrooms": bedrooms,
        "Resale": int(is_resale),
        "SwimmingPool": int(is_swimming_pool),
        "CarParking": int(is_car_parking),
        "AC": int(is_ac),
        "Wifi": int(is_wifi),
        "Microwave": int(is_microwave),
        "TV": int(is_tv),
        "DiningTable": int(is_dining_table),
        "Sofa": int(is_sofa),
        "Wardrobe": int(is_wardrobe),
        "Refrigerator": int(is_refrigerator),
        # Set the selected location column to 1
        f"{location_prefix}{location}": 1,
    }
    # reindex lays the values out in the training column order and fills all
    # other Location_* columns with 0. Passing a DataFrame with the training
    # column names keeps predict() consistent with how the model was fitted.
    input_data = pd.DataFrame([feature_values]).reindex(columns=X.columns, fill_value=0)

    # Predict & Output
    predicted_price = model.predict(input_data)[0]

    # Pick the color, label and description for the predicted price band
    price_diff = (predicted_price - median_price) / median_price
    color, category, description = _price_band(price_diff)

    # Result
    st.subheader("Predicted House Price (INR)")
    st.write(f"₹{predicted_price:,.2f}")
    # color and category come from the fixed table in _price_band, never from
    # user input, so injecting them into raw HTML is safe here.
    st.markdown(f'<h3 style="color:{color};">{category}</h3>', unsafe_allow_html=True)
    st.write(description)
    st.caption("Dataset is weird so expect anomalous output (negative prices, omega-high prices, etc.).")
    st.divider()