# Streamlit app: Titanic survival classification with a random forest.
#
# Trains a RandomForestClassifier on datasets/train.csv and renders three tabs:
# validation metrics plus feature importance, the raw and processed data, and
# an interactive survival-chance predictor.

import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split

DATA_PATH = "datasets/train.csv"
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_PREFIX = "Embarked_"

# Widget option lists; the predictor maps these labels back to model inputs.
PCLASS_OPTIONS = ["First Class", "Second Class", "Third Class"]
SEX_OPTIONS = ["Male", "Female"]
PORT_OPTIONS = ["Cherbourg", "Queenstown", "South Hampton"]


@st.cache_data()
def load():
    """Read the training CSV from DATA_PATH and return it as a DataFrame."""
    return pd.read_csv(DATA_PATH)


def preprocess(data):
    """Return a copy of *data* with ``Sex`` encoded and ``FamilySize`` added.

    Quick preprocess, just for display. Works on a copy so the caller's frame
    (shown to the user as the raw CSV) is left untouched.
    """
    data = data.copy()
    data["Sex"] = data["Sex"].map(SEX_CODES)
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    return data


def corr(data, title):
    """Render a correlation heatmap of the numeric columns of *data*."""
    numeric = data.select_dtypes(include=["number"])
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5, ax=ax)
    ax.set_title(title)
    st.pyplot(fig)
    # pyplot keeps every figure alive until closed; the script reruns often.
    plt.close(fig)


def set_character(pclass, sex, age, sibsp, parch, fare, embarked):
    """Store a preset passenger in session state for the input widgets."""
    st.session_state.pclass = pclass
    st.session_state.sex = sex
    st.session_state.age = age
    st.session_state.sibsp = sibsp
    st.session_state.parch = parch
    st.session_state.fare = fare
    st.session_state.embarked = embarked


# Load
train_df = load()

# Preprocess
# Plain assignment instead of fillna(inplace=True) on a column selection:
# the chained in-place form is deprecated and does nothing under copy-on-write.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())
train_df["Embarked"] = train_df["Embarked"].fillna(train_df["Embarked"].mode()[0])
train_df["Sex"] = train_df["Sex"].map(SEX_CODES)
train_df = pd.get_dummies(train_df, columns=["Embarked"])
train_df["FamilySize"] = train_df["SibSp"] + train_df["Parch"] + 1
train_df = train_df.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1)

# Train
X = train_df.drop("Survived", axis=1)
y = train_df["Survived"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# App
st.title("Classification: Titanic Survival Prediction")
st.caption("dataset: https://www.kaggle.com/code/mrisdal/exploring-survival-on-the-titanic/input -> test.csv || train.csv")

tab1, tab2, tab3 = st.tabs(["Model Performance", "Dataset", "Survival Chance Predictor"])

with tab1:
    # Model Assessment
    st.header("Model Performance")
    st.write(f"**Accuracy:** {accuracy_score(y_val, y_pred):.2f}")
    st.write(f"**Precision:** {precision_score(y_val, y_pred):.2f}")
    st.write(f"**Recall:** {recall_score(y_val, y_pred):.2f}")
    st.write(f"**F1 Score:** {f1_score(y_val, y_pred):.2f}")

    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_val, y_pred), annot=True, fmt="d", cmap="Blues", ax=ax)
    st.pyplot(fig)
    plt.close(fig)

    # Feature Importance
    st.header("Feature Importance")
    feature_importances = pd.Series(model.feature_importances_, index=X.columns)

    # get_dummies split Embarked into one column per port; fold those back
    # into a single "Embarked" entry so the chart shows one bar per feature.
    port_labels = [name for name in feature_importances.index if name.startswith(EMBARKED_PREFIX)]
    embarked_importance = feature_importances[port_labels].sum()
    feature_importances = feature_importances.drop(port_labels)
    feature_importances["Embarked"] = embarked_importance
    feature_importances = feature_importances.sort_values(ascending=False)

    fig, ax = plt.subplots()
    sns.barplot(x=feature_importances, y=feature_importances.index, ax=ax)
    ax.set_xlabel("Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Feature Importance in Titanic Survival Prediction")
    st.pyplot(fig)
    plt.close(fig)

    st.divider()

with tab2:
    # Dataset
    st.header("Dataset")

    dataset = load()
    dataset_processed = train_df

    corr(preprocess(dataset), "Correlation Matrix")
    st.caption("'tis all the correlatable data, minus the other irrelevant, non-correlatable data. [Embarked] feature is missing cos' it is string data, and was one-hotted.")

    # Toggle order
    view_type = st.radio("Order:", ["Top -> Bottom", "Bottom -> Top"])

    # Show both frames in file order or reversed
    if view_type == "Top -> Bottom":
        raw_view = dataset
        processed_view = dataset_processed
    else:
        raw_view = dataset.iloc[::-1]
        processed_view = dataset_processed.iloc[::-1]

    st.caption("datasets/train.csv")
    st.dataframe(raw_view)
    st.caption("df")
    st.dataframe(processed_view)

    st.caption("There's a test.csv file in my datasets/ folder, but it's just an artifact from my Kaggle download. Can't be bothered to organize.")

    st.divider()

with tab3:
    # Predictor
    st.header("Survival Chance Prediction")

    # Preset Character Data
    with st.expander("Movie Characters"):
        col1, col2 = st.columns(2)
        with col1:
            # Rose was 17, embarked from South Hampton, was in first class and
            # paid $300 as an estimate of the average 1st class fare (maybe a
            # slightly less luxurious cabin). She was with her mother.
            st.button("Rose DeWitt Bukater", on_click=set_character, args=("First Class", "Female", 17, 0, 1, 300.0, "South Hampton"))
            # Jack was 20, embarked from South Hampton, was in third class and
            # "paid" a rough estimate of $10: the bet in the poker game where
            # he won his 3rd class ticket. Of course, he was alone.
            st.button("Jack Dawson", on_click=set_character, args=("Third Class", "Male", 20, 0, 0, 10.0, "South Hampton"))
        with col2:
            # Caledon was 30, embarked from Cherbourg, was in first class and
            # paid $500 as a high estimate of the 1st class fare. He was alone.
            st.button("Caledon Hockley", on_click=set_character, args=("First Class", "Male", 30, 0, 0, 500.0, "Cherbourg"))
            # Ruth was 45, embarked from South Hampton, was in first class and
            # paid $300 as an estimate of the average 1st class fare. She was
            # with her daughter.
            st.button("Ruth DeWitt Bukater", on_click=set_character, args=("First Class", "Female", 45, 0, 1, 300.0, "South Hampton"))

    # Each widget defaults to the last preset stored by set_character, if any.
    pclass = st.radio("Passenger Class", PCLASS_OPTIONS, index=PCLASS_OPTIONS.index(st.session_state.get("pclass", "Second Class")))
    sex = st.radio("Sex", SEX_OPTIONS, index=SEX_OPTIONS.index(st.session_state.get("sex", "Male")))
    age = st.slider("Age", 0, 100, st.session_state.get("age", 30))
    sibsp = st.slider("Siblings/Spouses Aboard", 0, 8, st.session_state.get("sibsp", 0))
    parch = st.slider("Parents/Children Aboard", 0, 6, st.session_state.get("parch", 0))
    fare = st.slider("Fare", 0.0, 500.0, st.session_state.get("fare", 30.0), step=5.0)
    embarked = st.radio("Port of Embarkation", PORT_OPTIONS, index=PORT_OPTIONS.index(st.session_state.get("embarked", "South Hampton")))

    # Process Input
    features = {
        "Pclass": PCLASS_OPTIONS.index(pclass) + 1,
        "Sex": 1 if sex == "Female" else 0,
        "Age": age,
        "SibSp": sibsp,
        "Parch": parch,
        "Fare": fare,
        "FamilySize": sibsp + parch + 1,
        "Embarked_C": 1 if embarked == "Cherbourg" else 0,
        "Embarked_Q": 1 if embarked == "Queenstown" else 0,
        "Embarked_S": 1 if embarked == "South Hampton" else 0,
    }
    # Select columns by name in the exact training order. In X the Embarked_*
    # dummies come BEFORE FamilySize, so a hand-ordered positional array would
    # feed family size into the Embarked_C slot and shift the port flags.
    input_data = pd.DataFrame([features])[list(X.columns)]

    # Predict and Output
    survival_probability = model.predict_proba(input_data)[0][1]
    st.subheader("Survival Probability")

    # Thresholds to add color probability
    if survival_probability < 0.25:
        color = "red"
    elif survival_probability < 0.50:
        color = "orange"
    elif survival_probability < 0.75:
        color = "yellow"
    elif survival_probability < 0.90:
        color = "#FFFF99"
    else:
        color = "green"

    # NOTE(review): the original HTML wrapper was lost from this file; this
    # heading is a reconstruction that applies the threshold colour. Confirm
    # the intended tag/size.
    st.markdown(f"<h2 style='color: {color};'>{survival_probability:.2%}</h2>", unsafe_allow_html=True)

    st.divider()