Spaces:

allantacuelwvsu
/

delhi_housing_price

Sleeping

App Files Files Community

delhi_housing_price / app.py

allantacuelwvsu

add dataset section

cc17b16 18 days ago

raw

history blame contribute delete

8.09 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

	# Load the raw listings and keep only the target plus the modelled features.
	df = pd.read_csv("datasets/Delhi.csv")
	features = [
	    "Location", "Area", "No. of Bedrooms", "Resale", "SwimmingPool", "CarParking",
	    "AC", "Wifi", "Microwave", "TV", "DiningTable",
	    "Sofa", "Wardrobe", "Refrigerator",
	]
	df = df[["Price"] + features]

	# Preprocess
	# One-hot encode Location. drop_first=True removes the first category, so that
	# location is represented by every Location_* column being zero.
	df = pd.get_dummies(df, columns=["Location"], drop_first=True)
	X = df.drop("Price", axis=1)
	y = df["Price"]
	median_price = y.median()


	@st.cache_resource
	def _fit_model(train_features, train_target):
	    """Fit the random-forest regressor once and reuse it across reruns.

	    Streamlit re-executes this script on every widget interaction; without
	    caching, the 500-tree forest would be retrained each time. The cache key
	    is derived from the arguments, so changed training data triggers a refit.

	    Args:
	        train_features: Feature matrix used for fitting.
	        train_target: Target prices aligned with ``train_features``.

	    Returns:
	        The fitted ``RandomForestRegressor``.
	    """
	    forest = RandomForestRegressor(n_estimators=500, random_state=1)
	    forest.fit(train_features, train_target)
	    return forest


	# Train (fixed random_state keeps the split and the forest reproducible)
	X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
	model = _fit_model(X_train, y_train)
	y_pred = model.predict(X_val)

	# App shell: page title, dataset attribution, and the three top-level tabs.
	st.title("Regression: Delhi Housing Price Prediction")
	st.caption(
	    "dataset: https://www.kaggle.com/code/ruchi798/housing-prices-eda-and-prediction/input -> Housing Prices in Metropolitan Areas of India/Delhi.csv"
	)
	tab_labels = ["Model Performance", "Dataset", "Price Predictor"]
	tab1, tab2, tab3 = st.tabs(tab_labels)

	with tab1:
	    # Model Assessment
	    st.header("Model Performance")

	    # Compute regression evaluation metrics on the validation split
	    mae = mean_absolute_error(y_val, y_pred)
	    mse = mean_squared_error(y_val, y_pred)
	    r2 = r2_score(y_val, y_pred)
	    st.caption("Sala sala ang evaluation dunno why, prolly irrelevant features or gasinala preprocessing ko. w/e")

	    # Display regression metrics
	    st.write(f"Mean Absolute Error (MAE): {mae:,.2f}")
	    st.write(f"Mean Squared Error (MSE): {mse:,.2f}")
	    st.write(f"R² Score: {r2:.2f}")
	    st.divider()

	    # Feature Importance
	    st.subheader("Feature Importance")
	    feature_importance = model.feature_importances_
	    feature_names = X.columns

	    # Collapse every one-hot "Location_*" column into a single "Location" row
	    # by relabelling them and summing their importances.
	    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
	    is_location = importance_df['Feature'].str.startswith("Location_")
	    importance_df.loc[is_location, 'Feature'] = "Location"
	    importance_df = importance_df.groupby("Feature", as_index=False).sum()
	    importance_df = importance_df.sort_values(by='Importance', ascending=False)

	    fig, ax = plt.subplots(figsize=(8, 5))
	    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'], palette='coolwarm', ax=ax)
	    ax.set_xlabel("Importance")
	    ax.set_ylabel("Feature")
	    ax.set_title("Feature Importance in Delhi Housing Price Prediction")
	    st.pyplot(fig)
	    # Figures made via plt.subplots stay registered with pyplot until closed;
	    # release this one so reruns of the script do not accumulate open figures.
	    plt.close(fig)
	    st.caption("Well, I tried to make a practical predictor based on amenities but it turns out that location is the most important feature, amenities are irrelevant.")

	with tab2:
	    # Dataset
	    st.header("Dataset")

	    @st.cache_data()
	    def load():
	        """Read the raw CSV (result is cached by Streamlit across reruns)."""
	        return pd.read_csv("datasets/Delhi.csv")

	    dataset = load()
	    dataset_processed = df

	    # Quick preprocess, just for display
	    def preprocess(data):
	        """Return ``data`` without the "Location" column, if it has one."""
	        return data.drop(columns=["Location"], errors="ignore")

	    def corr(data, title):
	        """Render a correlation heatmap of the numeric columns of ``data``.

	        Args:
	            data: DataFrame to correlate; non-numeric columns are ignored.
	            title: Title shown above the heatmap.
	        """
	        numeric = data.select_dtypes(include=["number"])
	        fig, ax = plt.subplots(figsize=(8, 6))
	        # Small annotation font and shrunken colour bar: lots of features.
	        sns.heatmap(
	            numeric.corr(),
	            annot=True,
	            fmt=".2f",
	            cmap="coolwarm",
	            linewidths=0.5,
	            ax=ax,
	            annot_kws={"size": 3},
	            cbar_kws={"shrink": .8},
	        )
	        ax.set_title(title)
	        st.pyplot(fig)
	        # Close the figure so reruns of the script do not leave it open.
	        plt.close(fig)

	    corr(preprocess(dataset), "Correlation Matrix")
	    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Location] feature is missing cos' it is string data, and was one-hotted.")

	    # Toggle order
	    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])
	    reverse_rows = view_type == "Bottom -> Top"

	    def ordered(frame):
	        """Return every row of ``frame``, reversed when "Bottom -> Top" is chosen."""
	        return frame.iloc[::-1] if reverse_rows else frame

	    # Show the raw CSV and the processed frame in the selected order
	    st.caption("datasets/Delhi.csv")
	    st.dataframe(ordered(dataset))
	    st.caption("df")
	    st.dataframe(ordered(dataset_processed))
	    st.caption("Lots of unique values = omega long onehot encoded feature list.")

	    # NOTE(review): the divider is assumed to close the tab for both orderings;
	    # confirm against the author's intended layout.
	    st.divider()
	with tab3:
	    # User Input
	    st.header("Price Prediction")
	    location_prefix = "Location_"
	    # Only locations that kept a one-hot column are selectable here.
	    location_options = [
	        name[len(location_prefix):]
	        for name in df.columns
	        if name.startswith(location_prefix)
	    ]
	    col1, col2 = st.columns(2)
	    with col1:
	        area = st.slider("Area (sq. ft)", 500, 5000, 1500)
	        bedrooms = st.slider("Number of Bedrooms", 1, 6, 3)
	        # True when the listing is a resale, so it matches the "Resale" feature
	        # (presumably 1 for resale listings in the CSV; verify against the data).
	        is_resale = st.radio("Resale or New?", ["Resale", "New"]) == "Resale"
	        location = st.selectbox("Location", location_options)
	    with col2:
	        is_swimming_pool = st.checkbox("Has Swimming Pool")
	        is_car_parking = st.checkbox("Has Car Parking")
	        is_ac = st.checkbox("Has Air Conditioning")
	        is_wifi = st.checkbox("Has Wifi")
	        is_microwave = st.checkbox("Has Microwave")
	        is_tv = st.checkbox("Has TV")
	        is_dining_table = st.checkbox("Has Dining Table")
	        is_sofa = st.checkbox("Has Sofa")
	        is_wardrobe = st.checkbox("Has Wardrobe")
	        is_refrigerator = st.checkbox("Has Refrigerator")

	    # Process Input Data
	    # Address features by column name instead of position, so the row stays
	    # correct if the feature order changes, and so the model receives the same
	    # column names it was fitted with.
	    feature_values = {
	        "Area": area,
	        "No. of Bedrooms": bedrooms,
	        "Resale": int(is_resale),
	        "SwimmingPool": int(is_swimming_pool),
	        "CarParking": int(is_car_parking),
	        "AC": int(is_ac),
	        "Wifi": int(is_wifi),
	        "Microwave": int(is_microwave),
	        "TV": int(is_tv),
	        "DiningTable": int(is_dining_table),
	        "Sofa": int(is_sofa),
	        "Wardrobe": int(is_wardrobe),
	        "Refrigerator": int(is_refrigerator),
	    }
	    input_data = pd.DataFrame(0.0, index=[0], columns=X.columns)
	    for column_name, value in feature_values.items():
	        input_data.at[0, column_name] = value

	    # Set the chosen location's one-hot column to 1 (all others stay 0)
	    location_column = f"{location_prefix}{location}"
	    if location_column in input_data.columns:
	        input_data.at[0, location_column] = 1

	    # Predict & Output
	    predicted_price = model.predict(input_data)[0]

	    # Set colors and descriptions from the relative distance to the median price
	    price_diff = (predicted_price - median_price) / median_price
	    if price_diff < -0.2:
	        color = "#ff4d4d" # Below Median
	        category = "Below Median Price"
	        description = "This price is significantly lower than the median price in this area. The property may lack premium features and amenities or be in a less desirable location."
	    elif price_diff < -0.05:
	        color = "#ff944d" # Slightly Below
	        category = "Slightly Below Median Price"
	        description = "This price is slightly below the median range, which could indicate a competitive offer for budget-conscious buyers."
	    elif price_diff <= 0.05:
	        color = "#ffff4d" # In Median Range
	        category = "In Median Price Range"
	        description = "This price falls within the typical range for this area, making it a standard market price."
	    elif price_diff <= 0.2:
	        color = "#94ff4d" # Slightly Above
	        category = "Slightly Above Median Price"
	        description = "This price is slightly higher than the median, possibly due to added features such as better amenities or a prime location."
	    else:
	        color = "#4dff4d" # Above Median
	        category = "Above Median Price"
	        description = "This price is significantly above the median, suggesting a premium property with high-end features, amenities, and an excellent location."

	    # Result
	    st.subheader("Predicted House Price (INR)")
	    st.write(f"₹{predicted_price:,.2f}")
	    # color and category are fixed constants chosen above, never user-supplied text.
	    st.markdown(f'<h3 style="color:{color};">{category}</h3>', unsafe_allow_html=True)
	    st.write(description)
	    st.caption("Dataset is weird so expect anomalous output (negative prices, omega-high prices, etc.).")
	    st.divider()