File size: 8,093 Bytes
a9bdc1c
 
 
 
 
bf6d3e0
a9bdc1c
 
 
 
 
bf6d3e0
 
 
 
 
 
a9bdc1c
 
 
 
 
 
 
 
 
bf6d3e0
a9bdc1c
 
 
 
ff85e80
b2aabc5
cc17b16
a9bdc1c
 
 
 
 
 
 
 
 
bf6d3e0
a9bdc1c
 
 
 
 
 
44176ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9bdc1c
 
cc17b16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a9bdc1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc17b16
a9bdc1c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load
# Streamlit re-executes this script from the top on every widget interaction,
# so the CSV parse and the 500-tree fit live in cached helpers: they only run
# again when their arguments change, not on every slider move.
DATA_PATH = "datasets/Delhi.csv"
features = [
    "Location", "Area", "No. of Bedrooms", "Resale", "SwimmingPool", "CarParking",
    "AC", "Wifi", "Microwave", "TV", "DiningTable",
    "Sofa", "Wardrobe", "Refrigerator"
]


@st.cache_data
def _load_encoded_frame(path, columns):
    """Read the CSV at *path* and return the frame used for modelling.

    Keeps "Price" plus *columns* and one-hot encodes "Location" with the
    first category dropped. The result is cached per (path, columns), so a
    change to the file on disk is only picked up after the cache is cleared.
    """
    frame = pd.read_csv(path)[["Price"] + list(columns)]
    # Preprocess: one-hot encode locations
    return pd.get_dummies(frame, columns=["Location"], drop_first=True)


@st.cache_resource
def _fit_forest(train_X, train_y):
    """Fit and return the random forest regressor on the training split.

    Cached as a shared resource so the fitted model is reused across reruns
    as long as the training data hashes to the same value.
    """
    forest = RandomForestRegressor(n_estimators=500, random_state=1)
    forest.fit(train_X, train_y)
    return forest


df = _load_encoded_frame(DATA_PATH, tuple(features))
X = df.drop("Price", axis=1)
y = df["Price"]
median_price = y.median()

# Train
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
model = _fit_forest(X_train, y_train)
y_pred = model.predict(X_val)

# App: page heading, dataset attribution, then the three top-level tabs.
st.title("Regression: Delhi Housing Price Prediction")
st.caption(
    "dataset: https://www.kaggle.com/code/ruchi798/housing-prices-eda-and-prediction/input"
    " -> Housing Prices in Metropolitan Areas of India/Delhi.csv"
)
tab_labels = ["Model Performance", "Dataset", "Price Predictor"]
tab1, tab2, tab3 = st.tabs(tab_labels)

with tab1:
    # Model Assessment: validation metrics followed by a feature-importance chart.
    st.header("Model Performance")

    # Compute regression evaluation metrics on the held-out validation split
    mae = mean_absolute_error(y_val, y_pred)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    st.caption("Sala sala ang evaluation dunno why, prolly irrelevant features or gasinala preprocessing ko. w/e")

    # Display regression metrics
    st.write(f"**Mean Absolute Error (MAE):** {mae:,.2f}")
    st.write(f"**Mean Squared Error (MSE):** {mse:,.2f}")
    st.write(f"**R² Score:** {r2:.2f}")
    st.divider()

    # Feature Importance
    st.subheader("Feature Importance")
    feature_importance = model.feature_importances_
    feature_names = X.columns

    # Aggregate one-hot encoded locations: every "Location_*" column is
    # relabelled "Location" and the importances are summed per label, so the
    # chart shows one bar for location as a whole.
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    importance_df.loc[importance_df['Feature'].str.startswith("Location_"), 'Feature'] = "Location"
    importance_df = importance_df.groupby("Feature", as_index=False).sum()
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'], palette='coolwarm', ax=ax)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Feature Importance in Delhi Housing Price Prediction")
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this each
    # script rerun would leave another figure behind in memory.
    plt.close(fig)
    st.caption("Well, I tried to make a practical predictor based on amenities but it turns out that location is the most important feature, amenities are irrelevant.")

with tab2:
    # Dataset: correlation heatmap of the raw CSV, then the raw and the
    # processed frames in the order chosen by the user.
    st.header("Dataset")

    @st.cache_data()
    def load():
        """Read the raw CSV; the result is cached by Streamlit across reruns."""
        return pd.read_csv("datasets/Delhi.csv")
    dataset = load()
    dataset_processed = df

    # Quick preprocess, just for display
    def preprocess(data):
        """Return *data* without its "Location" column (no error if absent)."""
        return data.drop(columns=["Location"], errors="ignore")

    def corr(data, title):
        """Draw a correlation heatmap of the numeric columns of *data*.

        The figure is rendered into the Streamlit page under *title* and then
        closed so repeated reruns do not accumulate open figures.
        """
        numeric = data.select_dtypes(include=["number"])
        fig, ax = plt.subplots(figsize=(8, 6))
        # Tiny annotation font: there are many features, so the cells are small.
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax, annot_kws={"size": 3}, cbar_kws={"shrink": .8})
        ax.set_title(title)
        st.pyplot(fig)
        plt.close(fig)
    corr(preprocess(dataset), "Correlation Matrix")
    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Location] feature is missing cos' it is string data, and was one-hotted.")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
    reverse_rows = view_type == "Bottom -> Top"

    # Show both frames in full, reversed row-wise when "Bottom -> Top" is chosen.
    st.caption("datasets/Delhi.csv")
    st.dataframe(dataset.iloc[::-1] if reverse_rows else dataset)
    st.caption("df")
    st.dataframe(dataset_processed.iloc[::-1] if reverse_rows else dataset_processed)
    st.caption("Lots of unique values = omega long onehot encoded feature list.")
    st.divider()
with tab3:
    # User Input: collect the property description, predict its price and
    # classify the prediction relative to the dataset's median price.
    st.header("Price Prediction")
    location_prefix = "Location_"
    col1, col2 = st.columns(2)
    with col1:
        area = st.slider("Area (sq. ft)", 500, 5000, 1500)
        bedrooms = st.slider("Number of Bedrooms", 1, 6, 3)
        # The flag feeds the "Resale" feature, so it must be true when the
        # user picks "Resale" (presumably 1 marks a resale listing in the
        # dataset, per the column name — confirm against the data).
        is_resale = st.radio("Resale or New?", ["Resale", "New"]) == "Resale"
        # Offer every location that has a one-hot column; strip only the
        # leading prefix so names containing the same text are left intact.
        location = st.selectbox(
            "Location",
            [name[len(location_prefix):] for name in X.columns if name.startswith(location_prefix)],
        )
    with col2:
        is_swimming_pool = st.checkbox("Has Swimming Pool")
        is_car_parking = st.checkbox("Has Car Parking")
        is_ac = st.checkbox("Has Air Conditioning")
        is_wifi = st.checkbox("Has Wifi")
        is_microwave = st.checkbox("Has Microwave")
        is_tv = st.checkbox("Has TV")
        is_dining_table = st.checkbox("Has Dining Table")
        is_sofa = st.checkbox("Has Sofa")
        is_wardrobe = st.checkbox("Has Wardrobe")
        is_refrigerator = st.checkbox("Has Refrigerator")

    # Process Input Data
    # Values are keyed by training column name rather than by position, so a
    # change in feature order cannot silently shift them into the wrong slot.
    feature_values = {
        "Area": area,
        "No. of Bedrooms": bedrooms,
        "Resale": int(is_resale),
        "SwimmingPool": int(is_swimming_pool),
        "CarParking": int(is_car_parking),
        "AC": int(is_ac),
        "Wifi": int(is_wifi),
        "Microwave": int(is_microwave),
        "TV": int(is_tv),
        "DiningTable": int(is_dining_table),
        "Sofa": int(is_sofa),
        "Wardrobe": int(is_wardrobe),
        "Refrigerator": int(is_refrigerator),
        # Set the correct location column to 1; all other locations stay 0.
        f"{location_prefix}{location}": 1,
    }
    input_data = np.zeros(len(X.columns))  # Zero array matching feature length
    for column_name, value in feature_values.items():
        # get_loc raises KeyError for an unknown column instead of guessing.
        input_data[X.columns.get_loc(column_name)] = value

    # Predict & Output
    # The model was fitted on a DataFrame, so predict on one with the same
    # column names to keep scikit-learn's feature-name check satisfied.
    input_frame = pd.DataFrame([input_data], columns=X.columns)
    predicted_price = model.predict(input_frame)[0]

    # Pick a color, label and description from the relative distance between
    # the prediction and the median price. Bands: below -20%, -20% to -5%,
    # within ±5%, +5% to +20%, above +20%.
    price_diff = (predicted_price - median_price) / median_price
    if price_diff < -0.2:
        color = "#ff4d4d"  # Below Median
        category = "Below Median Price"
        description = "This price is significantly lower than the median price in this area. The property may lack premium features and amenities or be in a less desirable location."
    elif price_diff < -0.05:
        color = "#ff944d"  # Slightly Below
        category = "Slightly Below Median Price"
        description = "This price is slightly below the median range, which could indicate a competitive offer for budget-conscious buyers."
    elif price_diff <= 0.05:
        color = "#ffff4d"  # In Median Range
        category = "In Median Price Range"
        description = "This price falls within the typical range for this area, making it a standard market price."
    elif price_diff <= 0.2:
        color = "#94ff4d"  # Slightly Above
        category = "Slightly Above Median Price"
        description = "This price is slightly higher than the median, possibly due to added features such as better amenities or a prime location."
    else:
        color = "#4dff4d"  # Above Median
        category = "Above Median Price"
        description = "This price is significantly above the median, suggesting a premium property with high-end features, amenities, and an excellent location."

    # Result
    st.subheader("Predicted House Price (INR)")
    st.write(f"₹{predicted_price:,.2f}")
    st.markdown(f'<h3 style="color:{color};">{category}</h3>', unsafe_allow_html=True)
    st.write(description)
    st.caption("Dataset is weird so expect anomalous output (negative prices, omega-high prices, etc.).")
    st.divider()