louiecerv committed on
Commit
1989436
·
1 Parent(s): cc2adcb

Added the app description

Browse files
Files changed (1) hide show
  1. app.py +108 -64
app.py CHANGED
@@ -26,69 +26,113 @@ def visualize_classifier(classifier, X, y, title=''):
26
  ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
27
  st.pyplot(fig)
28
 
29
- # Load the dataset
30
- st.title("SVM Kernel Performance Comparison")
 
31
 
32
- uploaded_file = './data/overlapped.csv'
33
- if uploaded_file:
34
- df = pd.read_csv(uploaded_file)
35
- st.write("### Data Preview")
36
- st.dataframe(df)
37
-
38
- # Assuming the last column is the target
39
- X = df.iloc[:, :-1]
40
- y = df.iloc[:, -1]
41
-
42
- # Splitting dataset
43
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
44
-
45
- # Plot overlapped clusters
46
- st.write("### Cluster Visualization")
47
- fig, ax = plt.subplots()
48
- scatter = sns.scatterplot(x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y, palette='coolwarm', alpha=0.6)
49
- plt.xlabel("Feature 1")
50
- plt.ylabel("Feature 2")
51
- plt.title("Overlapped Clusters")
52
- st.pyplot(fig)
53
-
54
- # Function to train SVM and get performance metrics
55
- def evaluate_svm(kernel_type):
56
- model = SVC(kernel=kernel_type)
57
- model.fit(X_train, y_train)
58
- y_pred = model.predict(X_test)
59
- cm = confusion_matrix(y_test, y_pred)
60
- cr = classification_report(y_test, y_pred, output_dict=True)
61
- return model, cm, cr
62
-
63
- # Streamlit tabs
64
- tab1, tab2, tab3 = st.tabs(["Linear Kernel", "Polynomial Kernel", "RBF Kernel"])
 
65
 
66
- for tab, kernel in zip([tab1, tab2, tab3], ["linear", "poly", "rbf"]):
67
- with tab:
68
- st.write(f"## SVM with {kernel.capitalize()} Kernel")
69
- model, cm, cr = evaluate_svm(kernel)
70
-
71
- # Confusion matrix
72
- st.write("### Confusion Matrix")
73
- fig, ax = plt.subplots()
74
- sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
75
- plt.xlabel("Predicted")
76
- plt.ylabel("Actual")
77
- plt.title("Confusion Matrix")
78
- st.pyplot(fig)
79
-
80
- # Classification report
81
- st.write("### Classification Report")
82
- st.dataframe(pd.DataFrame(cr).transpose())
83
-
84
- # Decision boundary
85
- st.write("### Decision Boundary")
86
- visualize_classifier(model, X.to_numpy(), y.to_numpy(), title=f"Decision Boundary - {kernel.capitalize()} Kernel")
87
-
88
- # Explanation
89
- explanation = {
90
- "linear": "The linear kernel performs well when the data is linearly separable.",
91
- "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
92
- "rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters."
93
- }
94
- st.markdown(f"**Performance Analysis:** {explanation[kernel]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  ax.set_yticks(np.arange(int(X[:, 1].min() - 1), int(X[:, 1].max() + 1), 1.0))
27
  st.pyplot(fig)
28
 
29
# Location of the bundled dataset; the last column is treated as the target.
_DATA_PATH = './data/overlapped.csv'

# (tab label, scikit-learn kernel name) pairs, in display order.
_KERNEL_TABS = (
    ("Linear Kernel", "linear"),
    ("Polynomial Kernel", "poly"),
    ("RBF Kernel", "rbf"),
)

# One-line commentary shown under each kernel's results.
_KERNEL_NOTES = {
    "linear": "The linear kernel performs well when the data is linearly separable.",
    "poly": "The polynomial kernel captures more complex relationships but may overfit with high-degree polynomials.",
    "rbf": "The RBF kernel is effective in capturing non-linear relationships in the data but requires careful tuning of parameters.",
}

# Markdown rendered inside the "About SVM Kernels" expander.
# NOTE(review): the emoji were restored from mis-encoded bytes; the ruler (📏)
# and green circle (🟢) could not be decoded unambiguously and were inferred
# from context -- confirm against the author's original text.
_ABOUT_MARKDOWN = """
# 🧠 SVM Kernel Comparison: Understanding the Impact on Overlapped Data

In machine learning, **Support Vector Machines (SVMs)** are powerful classifiers that work well for both linear and non-linear decision boundaries. However, the performance of an SVM heavily depends on the **choice of kernel function**. Let's analyze how different kernels handle **overlapped data** and why choosing the right kernel is crucial.

## 🔍 Kernel Performance Breakdown

### 1️⃣ **Linear Kernel** 🟢
- 📏 Assumes the data is **linearly separable** (i.e., can be divided by a straight line).
- ✅ Works well when classes are well-separated.
- ❌ Struggles with highly overlapped data, leading to **poor generalization**.
- 🚀 **Best for:** High-dimensional sparse data (e.g., text classification).

### 2️⃣ **Polynomial Kernel** 📈
- 🔄 Expands feature space by computing polynomial combinations of features.
- ✅ Can model more complex decision boundaries.
- ❌ **High-degree polynomials** can lead to **overfitting**.
- 🚀 **Best for:** Medium-complexity patterns where interactions between features matter.

### 3️⃣ **Radial Basis Function (RBF) Kernel** 🔵
- 🔥 Uses **Gaussian similarity** to map data into a higher-dimensional space.
- ✅ Excels in handling **highly non-linear** and **overlapped** data.
- ❌ Requires careful tuning of the **gamma** parameter to avoid underfitting or overfitting.
- 🚀 **Best for:** Complex, non-linear relationships (e.g., image classification).

## 🎯 Choosing the Right Kernel
- If data is **linearly separable**, a **linear kernel** is efficient and interpretable.
- If data has **moderate overlap**, a **polynomial kernel** provides flexibility.
- If data is **highly overlapped and non-linear**, the **RBF kernel** is often the best choice.

### 🤖 Key Takeaway
The **right kernel choice** significantly impacts classification accuracy. While RBF is a strong default for **complex overlapped data**, simpler kernels should be preferred when appropriate to reduce computation cost and improve interpretability. **Experimentation and hyperparameter tuning are essential** to achieving the best results.

🔎 *“There is no one-size-fits-all kernel – understanding your data is the key to unlocking SVM’s full potential!”*

🚀 Created by: Louie F.Cervantes, M.Eng. (Information Engineering)
"""


def _evaluate_svm(kernel_type, X_train, X_test, y_train, y_test):
    """Fit an ``SVC`` with ``kernel_type`` and score it on the test split.

    Returns:
        A ``(model, confusion_matrix, classification_report_dict)`` tuple.
    """
    model = SVC(kernel=kernel_type)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    return model, matrix, report


def _show_figure(fig):
    """Render ``fig`` in the app, then close it.

    pyplot keeps every figure it created registered until it is explicitly
    closed, so figures are released as soon as they have been displayed.
    """
    st.pyplot(fig)
    plt.close(fig)


def main() -> None:
    """Build the Streamlit page comparing linear, polynomial and RBF SVMs.

    The page shows an explanatory expander, a preview and scatter plot of the
    dataset at ``_DATA_PATH``, and one tab per kernel with its confusion
    matrix, classification report, decision boundary and a short commentary.
    If the dataset cannot be loaded, or has fewer than two feature columns,
    an error message is shown instead of the analysis.
    """
    st.title("SVM Kernel Performance Comparison")

    with st.expander("About SVM Kernels"):
        st.markdown(_ABOUT_MARKDOWN)

    # The path is a fixed literal, so a truthiness check on it can never fail;
    # the real failure mode is the file being absent or empty.
    try:
        df = pd.read_csv(_DATA_PATH)
    except (FileNotFoundError, pd.errors.EmptyDataError) as exc:
        st.error(f"Could not load dataset '{_DATA_PATH}': {exc}")
        return

    st.write("### Data Preview")
    st.dataframe(df)

    # Two feature columns are plotted below and the last column is the target.
    if df.shape[1] < 3:
        st.error("The dataset must contain at least two feature columns and a target column.")
        return

    # Assuming the last column is the target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Splitting dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Plot overlapped clusters
    st.write("### Cluster Visualization")
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=X.iloc[:, 0], y=X.iloc[:, 1], hue=y,
        palette='coolwarm', alpha=0.6, ax=ax,
    )
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    ax.set_title("Overlapped Clusters")
    _show_figure(fig)

    # Streamlit tabs, one per kernel
    tabs = st.tabs([label for label, _ in _KERNEL_TABS])
    for tab, (_, kernel) in zip(tabs, _KERNEL_TABS):
        with tab:
            st.write(f"## SVM with {kernel.capitalize()} Kernel")
            model, cm, cr = _evaluate_svm(kernel, X_train, X_test, y_train, y_test)

            # Confusion matrix
            st.write("### Confusion Matrix")
            fig, ax = plt.subplots()
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title("Confusion Matrix")
            _show_figure(fig)

            # Classification report
            st.write("### Classification Report")
            st.dataframe(pd.DataFrame(cr).transpose())

            # Decision boundary (visualize_classifier renders its own figure)
            st.write("### Decision Boundary")
            visualize_classifier(
                model, X.to_numpy(), y.to_numpy(),
                title=f"Decision Boundary - {kernel.capitalize()} Kernel",
            )

            # Explanation
            st.markdown(f"**Performance Analysis:** {_KERNEL_NOTES[kernel]}")
137
# Invoke main() only when this module is run as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()