Spaces:
Sleeping
Sleeping
Added the app description
Browse files
app.py
CHANGED
@@ -26,69 +26,113 @@ def visualize_classifier(classifier, X, y, title=''):
|
|
26 |
ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
|
27 |
st.pyplot(fig)
|
28 |
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
|
27 |
st.pyplot(fig)
|
28 |
|
29 |
+
def main():
|
30 |
+
# Load the dataset
|
31 |
+
st.title("SVM Kernel Performance Comparison")
|
32 |
|
33 |
+
about = """
|
34 |
+
# π§ SVM Kernel Comparison: Understanding the Impact on Overlapped Data
|
35 |
+
|
36 |
+
In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM heavily depends on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.
|
37 |
+
|
38 |
+
## π Kernel Performance Breakdown
|
39 |
+
|
40 |
+
### 1οΈβ£ **Linear Kernel** π’
|
41 |
+
- π Assumes the data is **linearly separable** (i.e., can be divided by a straight line).
|
42 |
+
- β
Works well when classes are well-separated.
|
43 |
+
- β Struggles with highly overlapped data, leading to **poor generalization**.
|
44 |
+
- π **Best for:** High-dimensional sparse data (e.g., text classification).
|
45 |
+
|
46 |
+
### 2οΈβ£ **Polynomial Kernel** π
|
47 |
+
- π Expands feature space by computing polynomial combinations of features.
|
48 |
+
- β
Can model more complex decision boundaries.
|
49 |
+
- β **High-degree polynomials** can lead to **overfitting**.
|
50 |
+
- π **Best for:** Medium-complexity patterns where interactions between features matter.
|
51 |
+
|
52 |
+
### 3οΈβ£ **Radial Basis Function (RBF) Kernel** π΅
|
53 |
+
- π₯ Uses **Gaussian similarity** to map data into a higher-dimensional space.
|
54 |
+
- β
Excels in handling **highly non-linear** and **overlapped** data.
|
55 |
+
- β Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.
|
56 |
+
- π **Best for:** Complex, non-linear relationships (e.g., image classification).
|
57 |
+
|
58 |
+
## π― Choosing the Right Kernel
|
59 |
+
- If data is **linearly separable**, a **linear kernel** is efficient and interpretable.
|
60 |
+
- If data has **moderate overlap**, a **polynomial kernel** provides flexibility.
|
61 |
+
- If data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.
|
62 |
+
|
63 |
+
### π€ Key Takeaway
|
64 |
+
The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.
|
65 |
+
|
66 |
+
π *βThere is no one-size-fits-all kernel β understanding your data is the key to unlocking SVMβs full potential!β*
|
67 |
|
68 |
+
π Created by: Louie F.Cervantes, M.Eng. (Information Engineering)
|
69 |
+
"""
|
70 |
+
with st.expander("About SVM Kernels"):
|
71 |
+
st.markdown(about)
|
72 |
+
|
73 |
+
uploaded_file = './data/overlapped.csv'
|
74 |
+
if uploaded_file:
|
75 |
+
df = pd.read_csv(uploaded_file)
|
76 |
+
st.write("### Data Preview")
|
77 |
+
st.dataframe(df)
|
78 |
+
|
79 |
+
# Assuming the last column is the target
|
80 |
+
X = df.iloc[:, :-1]
|
81 |
+
y = df.iloc[:, -1]
|
82 |
+
|
83 |
+
# Splitting dataset
|
84 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
|
85 |
+
|
86 |
+
# Plot overlapped clusters
|
87 |
+
st.write("### Cluster Visualization")
|
88 |
+
fig, ax = plt.subplots()
|
89 |
+
scatter = sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y, palette='coolwarm', alpha=0.6)
|
90 |
+
plt.xlabel("Feature 1")
|
91 |
+
plt.ylabel("Feature 2")
|
92 |
+
plt.title("Overlapped Clusters")
|
93 |
+
st.pyplot(fig)
|
94 |
+
|
95 |
+
# Function to train SVM and get performance metrics
|
96 |
+
def evaluate_svm(kernel_type):
|
97 |
+
model = SVC(kernel=kernel_type)
|
98 |
+
model.fit(X_train, y_train)
|
99 |
+
y_pred = model.predict(X_test)
|
100 |
+
cm = confusion_matrix(y_test, y_pred)
|
101 |
+
cr = classification_report(y_test, y_pred, output_dict=True)
|
102 |
+
return model, cm, cr
|
103 |
+
|
104 |
+
# Streamlit tabs
|
105 |
+
tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
|
106 |
+
|
107 |
+
for tab, kernel in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"]):
|
108 |
+
with tab:
|
109 |
+
st.write(f"## SVM with {kernel.capitalize()} Kernel")
|
110 |
+
model, cm, cr = evaluate_svm(kernel)
|
111 |
+
|
112 |
+
# Confusion matrix
|
113 |
+
st.write("### Confusion Matrix")
|
114 |
+
fig, ax = plt.subplots()
|
115 |
+
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
|
116 |
+
plt.xlabel("Predicted")
|
117 |
+
plt.ylabel("Actual")
|
118 |
+
plt.title("Confusion Matrix")
|
119 |
+
st.pyplot(fig)
|
120 |
+
|
121 |
+
# Classification report
|
122 |
+
st.write("### Classification Report")
|
123 |
+
st.dataframe(pd.DataFrame(cr).transpose())
|
124 |
+
|
125 |
+
# Decision boundary
|
126 |
+
st.write("### Decision Boundary")
|
127 |
+
visualize_classifier(model, X.to_numpy(), y.to_numpy(), title=f"Decision Boundary - {kernel.capitalize()} Kernel")
|
128 |
+
|
129 |
+
# Explanation
|
130 |
+
explanation = {
|
131 |
+
"linear": "The linear kernel performs well when the data is linearly separable.",
|
132 |
+
"poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
|
133 |
+
"rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters."
|
134 |
+
}
|
135 |
+
st.markdown(f"**Performance Analysis:** {explanation[kernel]}")
|
136 |
+
|
137 |
+
if __name__ == "__main__":
|
138 |
+
main()
|