import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report


def visualize_classifier(classifier, X, y, title=''):
    """Plot the decision boundary of a fitted classifier over a 2D feature space."""
    min_x, max_x = X[:, 0].min() - 1.0, X[:, 0].max() + 1.0
    min_y, max_y = X[:, 1].min() - 1.0, X[:, 1].max() + 1.0
    mesh_step_size = 0.01
    x_vals, y_vals = np.meshgrid(np.arange(min_x, max_x, mesh_step_size),
                                 np.arange(min_y, max_y, mesh_step_size))
    # Predict over the mesh grid so each region is shaded by its predicted class
    output = classifier.predict(np.c_[x_vals.ravel(), y_vals.ravel()])
    output = output.reshape(x_vals.shape)
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.pcolormesh(x_vals, y_vals, output, cmap=plt.cm.gray, shading='auto')
    ax.scatter(X[:, 0], X[:, 1], c=y, s=75, edgecolors='black',
               linewidth=1, cmap=plt.cm.Paired)
    ax.set_xlim(x_vals.min(), x_vals.max())
    ax.set_ylim(y_vals.min(), y_vals.max())
    ax.set_xticks(np.arange(int(X[:, 0].min() - 1), int(X[:, 0].max() + 1), 1.0))
    ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
    st.pyplot(fig)


def main():
    st.title("SVM Kernel Performance Comparison")

    about = """
# 🧠 SVM Kernel Comparison: Understanding the Impact on Overlapped Data

In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM depends heavily on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial; a short code sketch after the breakdown below shows how each kernel is selected.

## 📊 Kernel Performance Breakdown

### 1️⃣ **Linear Kernel**
- Assumes the data is **linearly separable** (i.e., can be divided by a straight line).
- ✅ Works well when classes are well separated.
- ❌ Struggles with highly overlapped data, leading to **poor generalization**.
- **Best for:** High-dimensional sparse data (e.g., text classification).

### 2️⃣ **Polynomial Kernel**
- Expands the feature space by computing polynomial combinations of features.
- ✅ Can model more complex decision boundaries.
- ❌ **High-degree polynomials** can lead to **overfitting**.
- **Best for:** Medium-complexity patterns where interactions between features matter.

### 3️⃣ **Radial Basis Function (RBF) Kernel**
- Uses **Gaussian similarity** to map data into a higher-dimensional space.
- ✅ Excels at handling **highly non-linear** and **overlapped** data.
- ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.
- **Best for:** Complex, non-linear relationships (e.g., image classification).
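
The snippet below is a minimal sketch of how each kernel is selected with scikit-learn's `SVC`; the variable names and hyperparameter values shown (`C`, `degree`, `gamma`) are illustrative, not tuned settings:

```python
from sklearn.svm import SVC

linear_svm = SVC(kernel='linear', C=1.0)          # straight-line boundary
poly_svm   = SVC(kernel='poly', degree=3, C=1.0)  # degree controls boundary curvature
rbf_svm    = SVC(kernel='rbf', gamma='scale')     # gamma controls locality of influence
```

This app reuses the same `SVC(kernel=...)` call with default hyperparameters in each of the three tabs below.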

## 🎯 Choosing the Right Kernel
- If the data is **linearly separable**, a **linear kernel** is efficient and interpretable.
- If the data has **moderate overlap**, a **polynomial kernel** provides flexibility.
- If the data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.

### 🤔 Key Takeaway
The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.

*"There is no one-size-fits-all kernel; understanding your data is the key to unlocking SVM's full potential!"*

Created by: Louie F. Cervantes, M.Eng. (Information Engineering)
"""
    with st.expander("About SVM Kernels"):
        st.markdown(about)

    # Load the dataset (the app reads a bundled CSV rather than a user upload)
    data_path = './data/overlapped.csv'
    df = pd.read_csv(data_path)
    st.write("### Data Preview")
    st.dataframe(df)

    # The last column is assumed to be the target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Plot the overlapped clusters
    st.write("### Cluster Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y, palette='coolwarm', alpha=0.6, ax=ax)
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    ax.set_title("Overlapped Clusters")
    st.pyplot(fig)

    # Train an SVM with the given kernel and return the model and its metrics
    def evaluate_svm(kernel_type):
        model = SVC(kernel=kernel_type)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cr = classification_report(y_test, y_pred, output_dict=True)
        return model, cm, cr

    explanation = {
        "linear": "The linear kernel performs well when the data is linearly separable.",
        "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
        "rbf": "The RBF kernel is effective at capturing non-linear relationships in the data but requires careful parameter tuning."
    }

    # One tab per kernel
    tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
    for tab, kernel in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"]):
        with tab:
            st.write(f"## SVM with {kernel.capitalize()} Kernel")
            model, cm, cr = evaluate_svm(kernel)

            # Confusion matrix
            st.write("### Confusion Matrix")
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            st.pyplot(fig)

            # Classification report
            st.write("### Classification Report")
            st.dataframe(pd.DataFrame(cr).transpose())

            # Decision boundary
            st.write("### Decision Boundary")
            visualize_classifier(model, X.to_numpy(), y.to_numpy(),
                                 title=f"Decision Boundary - {kernel.capitalize()} Kernel")

            st.markdown(f"**Performance Analysis:** {explanation[kernel]}")


if __name__ == "__main__":
    main()
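
# To launch the app locally (assuming this file is saved as app.py and
# ./data/overlapped.csv exists alongside it), run:
#
#     streamlit run app.py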