Spaces:

allantacuelwvsu
/

delhi_housing_price

Sleeping

File size: 8,093 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load
# Predictor columns kept from the CSV; "Price" is the regression target.
features = [
    "Location",
    "Area",
    "No. of Bedrooms",
    "Resale",
    "SwimmingPool",
    "CarParking",
    "AC",
    "Wifi",
    "Microwave",
    "TV",
    "DiningTable",
    "Sofa",
    "Wardrobe",
    "Refrigerator",
]
# Read the raw file and keep only the target followed by the predictors.
df = pd.read_csv("datasets/Delhi.csv")[["Price", *features]]

# Preprocess
# One-hot encode "Location". drop_first=True omits one location's indicator
# column, so that location is represented by all indicators being 0.
df = pd.get_dummies(df, columns=["Location"], drop_first=True)
y = df["Price"]
X = df.drop(columns=["Price"])
# Median of the target over the whole dataset.
median_price = y.median()

# Train
@st.cache_resource
def _fit_model(feature_frame, target):
    """Split the data, fit the random forest and predict on the validation split.

    Streamlit re-executes the whole script on every widget interaction;
    caching this function keeps the 500-tree forest from being refit each
    time. The cache key is derived from the arguments, so a changed dataset
    triggers a refit.

    Args:
        feature_frame: Predictor columns (one row per property).
        target: Prices aligned with ``feature_frame``.

    Returns:
        Tuple ``(model, X_train, X_val, y_train, y_val, y_pred)``. The objects
        are shared between reruns and must be treated as read-only.
    """
    X_tr, X_va, y_tr, y_va = train_test_split(
        feature_frame, target, test_size=0.2, random_state=1
    )
    forest = RandomForestRegressor(n_estimators=500, random_state=1)
    forest.fit(X_tr, y_tr)
    return forest, X_tr, X_va, y_tr, y_va, forest.predict(X_va)


model, X_train, X_val, y_train, y_val, y_pred = _fit_model(X, y)

# App
# Page title, dataset attribution, and the three tabs the rest of the script fills in.
st.title("Regression: Delhi Housing Price Prediction")
st.caption(
    "dataset: https://www.kaggle.com/code/ruchi798/housing-prices-eda-and-prediction/input"
    " -> Housing Prices in Metropolitan Areas of India/Delhi.csv"
)
tab_labels = ["Model Performance", "Dataset", "Price Predictor"]
tab1, tab2, tab3 = st.tabs(tab_labels)

with tab1:
    # Model Assessment
    st.header("Model Performance")

    # Regression metrics on the held-out validation split
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    st.caption("Sala sala ang evaluation dunno why, prolly irrelevant features or gasinala preprocessing ko. w/e")

    # Display regression metrics
    st.write(f"**Mean Absolute Error (MAE):** {mae:,.2f}")
    st.write(f"**Mean Squared Error (MSE):** {mse:,.2f}")
    st.write(f"**R² Score:** {r2:.2f}")
    st.divider()

    # Feature Importance
    st.subheader("Feature Importance")
    feature_importance = model.feature_importances_
    feature_names = X.columns

    # Fold every one-hot "Location_*" column into a single "Location" entry
    # so the chart shows one bar per original feature.
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    is_location = importance_df['Feature'].str.startswith("Location_")
    importance_df.loc[is_location, 'Feature'] = "Location"
    importance_df = (
        importance_df.groupby("Feature", as_index=False)
        .sum()
        .sort_values(by='Importance', ascending=False)
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'], palette='coolwarm', ax=ax)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Feature Importance in Delhi Housing Price Prediction")
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until closed; the script is
    # re-executed on each interaction, so close it to avoid accumulating figures.
    plt.close(fig)
    st.caption("Well, I tried to make a practical predictor based on amenities but it turns out that location is the most important feature, amenities are irrelevant.")

with tab2:
    # Dataset
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Read the raw Delhi CSV; wrapped in st.cache_data so the result is cached."""
        return pd.read_csv("datasets/Delhi.csv")

    dataset = load()
    dataset_processed = df

    # Quick preprocess, just for display
    def preprocess(data):
        """Return ``data`` without its "Location" column (no error if absent)."""
        return data.drop(columns=["Location"], errors="ignore")

    def corr(data, title):
        """Render a correlation heatmap of the numeric columns of ``data``.

        Args:
            data: DataFrame to correlate; non-numeric columns are ignored.
            title: Title drawn above the heatmap.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        # Tiny annotation font: there are many features in the matrix.
        sns.heatmap(
            numeric.corr(),
            annot=True,
            fmt=".2f",
            cmap="coolwarm",
            linewidths=0.5,
            ax=ax,
            annot_kws={"size": 3},
            cbar_kws={"shrink": .8},
        )
        ax.set_title(title)
        st.pyplot(fig)
        # Release the figure; pyplot otherwise retains it across script reruns.
        plt.close(fig)

    corr(preprocess(dataset), "Correlation Matrix")
    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Location] feature is missing cos' it is string data, and was one-hotted.")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

    # Show both frames in full, reversed row order when requested.
    reverse_rows = view_type == "Bottom -> Top"
    raw_view = dataset.iloc[::-1] if reverse_rows else dataset
    processed_view = dataset_processed.iloc[::-1] if reverse_rows else dataset_processed

    st.caption("datasets/Delhi.csv")
    st.dataframe(raw_view)
    st.caption("df")
    st.dataframe(processed_view)
    st.caption("Lots of unique values = omega long onehot encoded feature list.")
    st.divider()
with tab3:
    # User Input
    st.header("Price Prediction")
    col1, col2 = st.columns(2)
    with col1:
        area = st.slider("Area (sq. ft)", 500, 5000, 1500)
        bedrooms = st.slider("Number of Bedrooms", 1, 6, 3)
        # True only when "Resale" is picked (the original compared against
        # "New", inverting the flag).
        # NOTE(review): assumes the CSV encodes Resale as 1 for resale
        # properties - confirm against the dataset.
        is_resale = st.radio("Resale or New?", ["Resale", "New"]) == "Resale"
        # Offer every location in the raw data. drop_first=True removed one
        # location's indicator column, so listing only the "Location_*"
        # columns would make that location impossible to select.
        location = st.selectbox(
            "Location",
            sorted(dataset["Location"].dropna().astype(str).unique()),
        )
    with col2:
        is_swimming_pool = st.checkbox("Has Swimming Pool")
        is_car_parking = st.checkbox("Has Car Parking")
        is_ac = st.checkbox("Has Air Conditioning")
        is_wifi = st.checkbox("Has Wifi")
        is_microwave = st.checkbox("Has Microwave")
        is_tv = st.checkbox("Has TV")
        is_dining_table = st.checkbox("Has Dining Table")
        is_sofa = st.checkbox("Has Sofa")
        is_wardrobe = st.checkbox("Has Wardrobe")
        is_refrigerator = st.checkbox("Has Refrigerator")

    # Process Input Data
    # Values are keyed by column name rather than position so the row stays
    # correct if the feature order changes.
    feature_values = {
        "Area": area,
        "No. of Bedrooms": bedrooms,
        "Resale": int(is_resale),
        "SwimmingPool": int(is_swimming_pool),
        "CarParking": int(is_car_parking),
        "AC": int(is_ac),
        "Wifi": int(is_wifi),
        "Microwave": int(is_microwave),
        "TV": int(is_tv),
        "DiningTable": int(is_dining_table),
        "Sofa": int(is_sofa),
        "Wardrobe": int(is_wardrobe),
        "Refrigerator": int(is_refrigerator),
    }

    # Set the chosen location's indicator to 1. The location dropped by
    # drop_first has no column and is encoded as all indicators being 0.
    location_column = f"Location_{location}"
    if location_column in X.columns:
        feature_values[location_column] = 1

    # One-row frame with exactly the training columns, in training order;
    # every column not set above is filled with 0.
    input_data = pd.DataFrame([feature_values]).reindex(columns=X.columns, fill_value=0)

    # Predict & Output
    predicted_price = model.predict(input_data)[0]

    # Pick the color, label and description for the price band, based on the
    # relative distance of the prediction from the dataset median.
    price_diff = (predicted_price - median_price) / median_price
    if price_diff < -0.2:
        color = "#ff4d4d"  # Below Median
        category = "Below Median Price"
        description = "This price is significantly lower than the median price in this area. The property may lack premium features and amenities or be in a less desirable location."
    elif price_diff < -0.05:
        color = "#ff944d"  # Slightly Below
        category = "Slightly Below Median Price"
        description = "This price is slightly below the median range, which could indicate a competitive offer for budget-conscious buyers."
    elif price_diff <= 0.05:
        color = "#ffff4d"  # In Median Range
        category = "In Median Price Range"
        description = "This price falls within the typical range for this area, making it a standard market price."
    elif price_diff <= 0.2:
        color = "#94ff4d"  # Slightly Above
        category = "Slightly Above Median Price"
        description = "This price is slightly higher than the median, possibly due to added features such as better amenities or a prime location."
    else:
        color = "#4dff4d"  # Above Median
        category = "Above Median Price"
        description = "This price is significantly above the median, suggesting a premium property with high-end features, amenities, and an excellent location."

    # Result
    st.subheader("Predicted House Price (INR)")
    st.write(f"₹{predicted_price:,.2f}")
    st.markdown(f'<h3 style="color:{color};">{category}</h3>', unsafe_allow_html=True)
    st.write(description)
    st.caption("Dataset is weird so expect anomalous output (negative prices, omega-high prices, etc.).")
    st.divider()