Upload 15 files

- Ada_boost.py +130 -0
- Agglomerative_clustering.py +50 -0
- Decision_tree.py +108 -0
- ICA.py +80 -0
- KNN.py +79 -0
- LDA.py +94 -0
- Linear_regression.py +82 -0
- Logit.py +106 -0
- Naive_bayes.py +97 -0
- PCA.py +78 -0
- Perceptron.py +73 -0
- Random_forest.py +98 -0
- SVC.py +143 -0
- SVR.py +139 -0
- k_mean_clustering.py +79 -0
Ada_boost.py
ADDED
@@ -0,0 +1,130 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



class ada_boost_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **AdaBoost**

This algorithm combines other classification algorithms so that, together, they produce a prediction.

Unlike Random Forest, the **vote** of each estimator does not count the same: every estimator carries an importance (**weight**), and the algorithm's prediction comes from weighting the estimators' votes.

**Weak Learner (Decision Stump)**

A simple algorithm that classifies the data according to a threshold (similar to one of the steps of the Decision Tree algorithm).

**Error**

- First iteration

$$
\epsilon_{1} = \frac{misclassified}{N}
$$

- From the second iteration onwards

$$
\epsilon_{t} = \sum weights
$$

Note: if the error is greater than 0.5, the classification is flipped and the error becomes $error = 1 - error$

**Weights**

- At the start
$$
w_{0} = \frac{1}{N} \quad \text{for each sample}
$$

- Afterwards

$$
w = \frac{w \cdot e^{-\alpha y h(X)}}{\sum w}
$$

**Performance**

$$
\alpha = 0.5 \cdot log(\frac{1-\epsilon_{t}}{\epsilon_{t}})
$$

**Prediction**

$$
y = sign(\sum_{t}^{T} \alpha_{t} \cdot h(X))
$$

**Training**

Initialize the weight of every sample to $\frac{1}{N}$

- Train a weak classifier (search for the best feature and threshold to split on)
- Compute the error $\epsilon_{t} = \sum_{misclassified} weights$
- Flip the error and the polarity if the error is greater than 0.5
- Compute $\alpha = 0.5 \cdot log(\frac{1 - \epsilon_{t}}{\epsilon_{t}})$
- Update the weights: $w = \frac{w \cdot e^{-\alpha h(X)}}{Z}$

        '''
        self.n_clf = 5

    def params(self):
        self.n_clf = st.slider('Number of estimators',
                               min_value=1,
                               max_value=15,
                               value=5)

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = ABC(n_estimators=self.n_clf, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = ABC(n_estimators=self.n_clf, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the decision-boundary grid
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
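The training loop in the description above can be made concrete with a small NumPy sketch. It is not part of the uploaded files; it assumes one-feature decision stumps as weak learners and labels in {-1, +1}.

# Editorial sketch (not part of the upload): AdaBoost weight-update loop with decision stumps.
import numpy as np

def adaboost_train(X, y, n_clf=5):
    n_samples, n_features = X.shape
    w = np.full(n_samples, 1 / n_samples)            # w0 = 1/N for every sample
    stumps = []
    for _ in range(n_clf):
        best = None
        # weak learner: pick the feature/threshold/polarity with the lowest weighted error
        for feat in range(n_features):
            for thr in np.unique(X[:, feat]):
                for polarity in (1, -1):
                    pred = np.where(polarity * (X[:, feat] - thr) >= 0, 1, -1)
                    err = np.sum(w[pred != y])
                    if best is None or err < best[0]:
                        best = (err, feat, thr, polarity, pred)
        err, feat, thr, polarity, pred = best
        alpha = 0.5 * np.log((1 - err) / (err + 1e-10))  # performance term
        w = w * np.exp(-alpha * y * pred)                # weight update
        w = w / np.sum(w)                                # normalize (the Z in the description)
        stumps.append((alpha, feat, thr, polarity))
    return stumps

def adaboost_predict(stumps, X):
    # y = sign(sum_t alpha_t * h_t(X))
    agg = sum(a * np.where(p * (X[:, f] - t) >= 0, 1, -1) for a, f, t, p in stumps)
    return np.sign(agg)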
Agglomerative_clustering.py
ADDED
@@ -0,0 +1,50 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt


class agglomerative_clustering_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Agglomerative Clustering**

Agglomerative Clustering is a hierarchical clustering algorithm. Each observation starts as its own cluster, and the most similar clusters are merged step by step. This is repeated until the desired number of clusters is reached.

**Method**
- Initialize every point as a cluster.
- Take two clusters that are close to each other and merge them into a single cluster.
- Repeat the previous step until the desired number of clusters is reached.

**Criteria for measuring similarity between clusters**

- Distance between the **closest** points of two different clusters.
- Distance between the **farthest** points of two different clusters.
- Distance between the means of each cluster.

        '''
        self.x_feature = 1
        self.y_feature = 2
        self.n_clusters = 3

    def params(self):
        n_targets = len(set(self.database.target))
        self.n_clusters = st.slider('Number of clusters', 1, n_targets, 1)

    def solve(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)
        X = self.database.data
        sklearn_clus = AgglomerativeClustering(self.n_clusters, linkage='single')
        pred = sklearn_clus.fit_predict(X)
        fig, ax = plt.subplots(figsize=(12, 8))
        ax.scatter(X[:, self.x_feature-1], X[:, self.y_feature-1], c=pred)
        plt.title(f'{self.n_clusters} Clusters')
        return fig
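The merge procedure described above can also be sketched in a few lines of NumPy. The sketch below is illustrative only (single linkage, brute-force nearest-pair search), not the sklearn implementation used by the class.

# Editorial sketch (not part of the upload): single-linkage agglomerative clustering.
import numpy as np

def single_linkage(X, n_clusters):
    clusters = [[i] for i in range(len(X))]          # start: every point is its own cluster
    while len(clusters) > n_clusters:
        best = (np.inf, None, None)
        for a in range(len(clusters)):
            for b in range(a + 1, len(clusters)):
                # single linkage: distance between the closest pair of points
                d = min(np.linalg.norm(X[i] - X[j])
                        for i in clusters[a] for j in clusters[b])
                if d < best[0]:
                    best = (d, a, b)
        _, a, b = best
        clusters[a].extend(clusters[b])              # merge the two closest clusters
        del clusters[b]
    labels = np.empty(len(X), dtype=int)
    for k, idxs in enumerate(clusters):
        labels[idxs] = k
    return labels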
Decision_tree.py
ADDED
@@ -0,0 +1,108 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



class Decision_tree_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Decision Tree**

**Entropy**

$$
E = - \sum p(X) \cdot log_{2}(p(X))
$$

$$
p(X) = \frac{len(x)}{n}
$$

**Information gain**

$$
IG = E(parent) - [weighted \quad average] \cdot E(children)
$$

**Method (to build the tree)**

- Start from the root node; for each node, select the best split based on the information gain.
- From the highest information gain, keep the feature and the threshold.
- Apply that split to each node, using the feature and threshold found.
- Iterate over these steps until some stopping criterion is met:
    - **maximum depth**: maximum number of nodes at the end
    - **minimum samples**: minimum number of samples a node may contain
    - **no more class distribution**: there are no more samples to split

**Approximation (prediction)**

- Follow the splits in the order of the tree (top to bottom).
- When a leaf node is reached, predict the most common value among that node's samples.


        '''
        self.max_depth = 100
        self.min_samples_split = 2
        self.stop_criterion = 'max_depth'

    def params(self):
        self.stop_criterion = st.radio('Stopping criterion:', options=['max_depth', 'min_samples_split'])
        if self.stop_criterion == 'max_depth': self.max_depth = st.slider('max_depth value:', 1, 100, 10)
        elif self.stop_criterion == 'min_samples_split': self.min_samples_split = st.slider('min_samples_split value:', 2, 1000, 5)


    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        if self.stop_criterion == 'max_depth': self.sklearn_clf = DecisionTreeClassifier(max_depth=self.max_depth, random_state=1234)
        elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = DecisionTreeClassifier(min_samples_split=self.min_samples_split, random_state=1234)

        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        if self.stop_criterion == 'max_depth': self.sklearn_clf = DecisionTreeClassifier(max_depth=self.max_depth, random_state=1234)
        elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = DecisionTreeClassifier(min_samples_split=self.min_samples_split, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the decision-boundary grid
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
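The entropy and information-gain formulas above translate almost directly into NumPy; the following sketch (illustrative, not part of the upload) shows how a single candidate split would be scored.

# Editorial sketch (not part of the upload): scoring a split by information gain.
import numpy as np

def entropy(y):
    # E = -sum p(X) * log2(p(X)), with p(X) = count/n
    ps = np.bincount(y) / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

def information_gain(y, feature_column, threshold):
    left = y[feature_column <= threshold]
    right = y[feature_column > threshold]
    if len(left) == 0 or len(right) == 0:
        return 0.0
    # IG = E(parent) - weighted average of E(children)
    children = (len(left) * entropy(left) + len(right) * entropy(right)) / len(y)
    return entropy(y) - children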
ICA.py
ADDED
@@ -0,0 +1,80 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.decomposition import FastICA
import matplotlib.pyplot as plt

class ICA_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **ICA (Independent Component Analysis)**

ICA is a method used to identify the components of a multivariate signal, which lets us extract a component that is mixed in with others.

- Subtract the mean $\bar{X}$ from $X$
- Transform $X$ so that any correlation between components is removed and the variance of every component equals 1 (make the covariance matrix close to the identity matrix)

$$
\hat{x} = E \cdot D^{-1/2} \cdot E^{T} \cdot x
$$

- $D$: diagonal matrix of eigenvalues (of the covariance matrix)
- $E$: matrix of eigenvectors (of the covariance matrix)

- Choose random values to build the matrix $W$.
- Compute the new values of $W$

$$
w_{p} = \frac{1}{n} \sum X \cdot tanh(w_{p}^{T} \cdot X) - \frac{1}{n} \sum (1 - tanh^{2}(w_{p}^{T} \cdot X)) \cdot w_{p}
$$

$$
w_{p} = w_{p} - \sum_{j=1}^{p-1} (w_{p}^{T}w_{j})w_{j}
$$

- Normalize $w_{p}$

$$
w_{p} = \frac{w_{p}}{||w_{p}||}
$$

- Check the stopping condition. If it is not met, compute the new values of $w$ again


$$
w_{p}^{T}w_{p+1} - 1 < Tolerance
$$

- Compute the independent sources as $S = W \cdot X$'''

        self.x_feature = 1
        self.y_feature = 2
        self.n_components = 2

    def params(self):
        n_features = int(self.database.data.shape[1])
        self.n_components = st.slider('Number of components', 1, n_features, 2)

    def solve(self):
        self.x_feature = st.slider('Component on x axis', 1, self.n_components, 1)
        self.y_feature = st.slider('Component on y axis', 1, self.n_components, 2)
        X = self.database.data
        y = self.database.target
        sklearn_clus = FastICA(n_components=self.n_components)
        X_proyected_sk = sklearn_clus.fit_transform(X)

        x1 = X_proyected_sk[:, self.x_feature-1]
        x2 = X_proyected_sk[:, self.y_feature-1]

        plt.figure(1, figsize=(12, 8))
        plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8, cmap=plt.cm.get_cmap('viridis', len(y)))
        plt.xlabel(f'Component {self.x_feature}')
        plt.ylabel(f'Component {self.y_feature}')
        plt.colorbar()

        return plt.gcf()
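The whitening and fixed-point steps listed above can be sketched as follows. This is an illustrative FastICA-style implementation under simplifying assumptions (tanh non-linearity, deflation, no convergence safeguards), not sklearn's FastICA.

# Editorial sketch (not part of the upload): FastICA-style extraction of independent components.
import numpy as np

def fast_ica(X, n_components, n_iter=200, tol=1e-5):
    X = X - X.mean(axis=0)                                   # center
    d, E = np.linalg.eigh(np.cov(X, rowvar=False))
    X_white = X @ E @ np.diag(1 / np.sqrt(d + 1e-12)) @ E.T  # whiten: unit variance, no correlation
    W = np.zeros((n_components, X.shape[1]))
    for p in range(n_components):
        w = np.random.rand(X.shape[1])
        for _ in range(n_iter):
            wx = X_white @ w
            # fixed-point update with g = tanh, g' = 1 - tanh^2
            w_new = (X_white * np.tanh(wx)[:, None]).mean(axis=0) \
                    - (1 - np.tanh(wx) ** 2).mean() * w
            w_new -= W[:p].T @ (W[:p] @ w_new)               # deflation against earlier components
            w_new /= np.linalg.norm(w_new)
            converged = np.abs(np.abs(w_new @ w) - 1) < tol  # stopping condition
            w = w_new
            if converged:
                break
        W[p] = w
    return X_white @ W.T                                     # independent sources S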
KNN.py
ADDED
@@ -0,0 +1,79 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



class KNN_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **KNN Algorithm (K Nearest Neighbors)**

To predict the class of a new data point, this algorithm first computes the **Euclidean distance** to the rest of the data, then **selects the k points with the smallest distance** (the closest ones), and finally **assigns the class according to the mode** (the most frequent category) of those k selected points.

**Euclidean distance**

$$
Dist = \sqrt{\sum_{i=1}^n (Xtest_{i} - Xtrain_{i})^2}
$$

        '''
        self.neighbors = 5

    def params(self):
        self.neighbors = st.slider('Number of neighbors',
                                   min_value=1,
                                   max_value=15,
                                   value=5)

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = KNeighborsClassifier(self.neighbors)
        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = KNeighborsClassifier(self.neighbors)
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the decision-boundary grid
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
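The prediction rule described above (Euclidean distance, k closest points, majority vote) fits in a few lines of NumPy; the sketch below is illustrative only, not the sklearn classifier used by the class.

# Editorial sketch (not part of the upload): predicting one point with KNN.
import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_new, k=5):
    dists = np.sqrt(np.sum((X_train - x_new) ** 2, axis=1))  # Euclidean distances to all training points
    nearest = np.argsort(dists)[:k]                          # indices of the k closest points
    return Counter(y_train[nearest]).most_common(1)[0][0]    # majority vote (mode)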
LDA.py
ADDED
@@ -0,0 +1,94 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt

class LDA_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **LDA (Linear Discriminant Analysis)**

**Goal**

Reduce the number of variables (**features**).
The goal is to project a data set onto a lower-dimensional space (similar to what is done with **PCA**).

**PCA vs LDA**

- **PCA**: find the axes that maximize the variance of the data.
- **LDA**: the interest lies in the axes that maximize the separation between classes.
- **LDA** is a form of **supervised learning** (it uses the class labels of the data to train the algorithm), whereas **PCA** is a form of **unsupervised learning** (no labels).

**Within-class scatter matrix**

$$
S_{w} = \sum_{c} S_{c}
$$

$$
S_{c} = \sum_{i \in c} (x_{i} - \bar{x_{c}}) \cdot (x_{i} - \bar{x_{c}})^{T}
$$

**Between-class scatter matrix**

$$
S_{B} = \sum_{c} n_{c} \cdot (\bar{x_{c}} - \bar{x}) \cdot (\bar{x_{c}} - \bar{x})^{T}
$$

**Eigenvectors and eigenvalues**

Compute the eigenvectors and eigenvalues of the following matrix:

$$
S_{W}^{-1} S_{B}
$$


**Method**

- Compute $S_{B}$
- Compute $S_{W}$
- Compute the eigenvectors and eigenvalues of $S_{W}^{-1} S_{B}$
- Sort the eigenvectors by their eigenvalues in decreasing order
- Keep the first k eigenvectors, which become the new k dimensions
- Transform the data onto the new dimensions (**done with a dot product**) '''

        self.x_feature = 1
        self.y_feature = 2
        self.n_components = 2

    def params(self):
        self.n_clases = len(set(self.database.target))
        self.n_features = int(self.database.data.shape[1])
        self.min = int(np.min([self.n_clases-1, self.n_features]))
        if self.min == 1: pass
        elif self.min == 2: self.n_components = 2
        else: self.n_components = st.slider('Number of components', 2, self.min, 2)

    def solve(self):
        if self.min == 1: pass
        else:
            self.x_feature = st.slider('Component on x axis', 1, self.n_components, 1)
            self.y_feature = st.slider('Component on y axis', 1, self.n_components, 2)
            X = self.database.data
            y = self.database.target
            sklearn_clus = LinearDiscriminantAnalysis(n_components=self.n_components)
            sklearn_clus.fit(X, y)
            X_proyected_sk = sklearn_clus.transform(X)

            x1 = X_proyected_sk[:, self.x_feature-1]
            x2 = X_proyected_sk[:, self.y_feature-1]

            plt.figure(figsize=(12, 8))
            plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8, cmap=plt.cm.get_cmap('viridis', len(y)))
            plt.xlabel(f'Component {self.x_feature}')
            plt.ylabel(f'Component {self.y_feature}')
            plt.colorbar()

            return plt.gcf()
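The scatter-matrix construction and projection described above can be sketched as follows; this is an illustrative implementation, not the sklearn LinearDiscriminantAnalysis used by the class.

# Editorial sketch (not part of the upload): LDA projection via Sw^-1 Sb eigendecomposition.
import numpy as np

def lda_project(X, y, k):
    mean_all = X.mean(axis=0)
    n_features = X.shape[1]
    Sw = np.zeros((n_features, n_features))
    Sb = np.zeros((n_features, n_features))
    for c in np.unique(y):
        Xc = X[y == c]
        mean_c = Xc.mean(axis=0)
        Sw += (Xc - mean_c).T @ (Xc - mean_c)                # within-class scatter
        diff = (mean_c - mean_all).reshape(-1, 1)
        Sb += len(Xc) * diff @ diff.T                        # between-class scatter (weighted by n_c)
    eigvals, eigvecs = np.linalg.eig(np.linalg.inv(Sw) @ Sb)
    order = np.argsort(eigvals.real)[::-1]                   # sort by eigenvalue, descending
    W = eigvecs.real[:, order[:k]]                           # top-k discriminant directions
    return X @ W                                             # project (dot product)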
Linear_regression.py
ADDED
@@ -0,0 +1,82 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

class linear_regression_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Linear Regression**


**Prediction (approximation)**
$$
\hat{y} = wx + b
$$

**Cost function**

$$
Loss = MSE = \frac{1}{N} \sum_{i=1}^n (y_{i} - \hat{y_{i}})^2
$$

**Gradient computation**


$$
\left[\begin{array}{ll}\frac{d_{loss}}{dw} \\ \frac{d_{loss}}{db} \end{array} \right] = \left[\begin{array}{ll} \frac{1}{N} \sum -2x_{i}(y_{i} - (wx_{i} + b)) \\ \frac{1}{N} \sum -2(y_{i} - (wx_{i} + b)) \end{array} \right]
$$

**Gradient Descent method**

- Initialize the weights ($w$) and the bias ($b$)
- Iterate
- Compute the gradient
- Update the parameters (lr = learning rate)

$$
w = w - lr*dw
$$

$$
b = b - lr*db
$$

- Stop iterating
        '''

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_regr = LinearRegression()
        self.sklearn_regr.fit(X_train, y_train)
        y_pred = self.sklearn_regr.predict(X_test)
        acc = mean_squared_error(y_pred, y_test)
        st.metric('MSE (Mean Squared Error)', value=f'{np.round(acc, 2)}')

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)

        self.X = self.database.data[:, self.x_feature-1:self.x_feature]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_regr = LinearRegression()
        self.sklearn_regr.fit(X_train, y_train)

        x1_min = self.X.min()
        x1_max = self.X.max()

        x_pred = np.linspace(x1_min, x1_max, 100).reshape([100, 1])
        y_pred = self.sklearn_regr.predict(x_pred)

        plt.figure(1, figsize=(12, 8))
        plt.scatter(self.X, self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.plot(x_pred, y_pred)
        return plt.gcf()
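The gradient-descent update rules listed above can be sketched directly; note that sklearn's LinearRegression actually solves the least-squares problem in closed form, so this loop is illustrative only.

# Editorial sketch (not part of the upload): linear regression trained by gradient descent on the MSE loss.
import numpy as np

def linear_regression_gd(X, y, lr=0.01, n_iters=1000):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0.0
    for _ in range(n_iters):
        y_hat = X @ w + b
        dw = (1 / n_samples) * (-2 * X.T @ (y - y_hat))   # d(loss)/dw
        db = (1 / n_samples) * (-2 * np.sum(y - y_hat))   # d(loss)/db
        w -= lr * dw                                      # w = w - lr*dw
        b -= lr * db                                      # b = b - lr*db
    return w, b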
Logit.py
ADDED
@@ -0,0 +1,106 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR_sk
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



class Logit_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Logistic Regression**

**Prediction (approximation)**

$$
z = wx + b
$$

$$
\hat{y} = \frac{1}{1+e^{-z}}
$$

**Loss function (cross entropy)**

$$
loss = -\frac{1}{N} \sum_{i=1}^{n} [y^{i}log(\hat{y(x^{i})}) + (1-y^{i})log(1 - \hat{y(x^{i})})]
$$

**Gradients**

$$
\left[\begin{array}{ll} \frac{d_{loss}}{dw} \\ \frac{d_{loss}}{db} \end{array}\right] = \left[\begin{array}{ll} \frac{1}{N} \sum 2x_{i}(\hat{y} - y_{i}) \\ \frac{1}{N} \sum 2(\hat{y} - y_{i}) \end{array}\right]
$$

**Gradient Descent method**
- Initialize the parameters
- Iterate
- Compute the error (loss)
- Update the weights ($lr$ = learning rate)

$$
w = w - lr*dw
$$

$$
b = b - lr*db
$$

- Stop iterating
        '''
        self.x_feature = 1
        self.y_feature = 2

    def params(self):
        pass

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = LR_sk(max_iter=1000, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = LR_sk(max_iter=1000, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the decision-boundary grid
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
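The sigmoid prediction and gradient updates above can be sketched as follows for binary labels in {0, 1}. Illustrative only; the constant factor 2 in the description is dropped, since it only rescales the learning rate.

# Editorial sketch (not part of the upload): logistic regression trained by gradient descent.
import numpy as np

def logistic_regression_gd(X, y, lr=0.01, n_iters=1000):
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0.0
    for _ in range(n_iters):
        y_hat = 1 / (1 + np.exp(-(X @ w + b)))        # sigmoid of the linear model
        dw = (1 / n_samples) * X.T @ (y_hat - y)      # gradient w.r.t. w
        db = (1 / n_samples) * np.sum(y_hat - y)      # gradient w.r.t. b
        w -= lr * dw
        b -= lr * db
    return w, b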
Naive_bayes.py
ADDED
@@ -0,0 +1,97 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



class naive_bayes_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Naive Bayes**

I was not familiar with this particular algorithm; from what I have seen so far, it works as a **classifier** based mainly on **Bayes' theorem**.

**Bayes' theorem**

$$
P(A/B) = \frac{P(B/A) \cdot P(A)}{P(B)}
$$

That said, to take advantage of this theorem the attributes, i.e. the **components of the vector X, must be independent of each other (the events are assumed to be independent)**.

$$
P(y/X) = \frac{P(X/y) \cdot P(y)}{P(X)} = \frac{P(x_{1}/y) \quad ... \quad P(x_{n}/y) \cdot P(y)}{P(X)}
$$

Then, the way to choose which class the vector X belongs to is to compute all the conditional probabilities (**note**: $P(X)$ can be omitted because it appears in every equation).


$$
y = argmax_{y} \quad P(x_{1}/y) \quad ... \quad P(x_{n}/y) \cdot P(y)
$$

$$
y = argmax_{y} \quad log(P(x_{1}/y)) + \quad ... \quad + log(P(x_{n}/y)) + log(P(y))
$$


**Finally, we still need to define:**

$P(y)$: frequency (number of times class y appears in the data)

$$
P(x_{i}/y) = \frac{1}{\sqrt{2\pi \sigma_{y}^{2}}} \cdot e^{(-\frac{(x_{i} - \mu_{y})^2}{2\sigma_{y}^{2}})}
$$
        '''

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = GaussianNB()
        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = GaussianNB()
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the decision-boundary grid
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
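The argmax-of-log-probabilities rule above can be sketched as follows, assuming Gaussian likelihoods per feature as in GaussianNB; illustrative only.

# Editorial sketch (not part of the upload): Gaussian Naive Bayes prediction for a single point.
import numpy as np

def gaussian_nb_predict(X_train, y_train, x_new):
    best_class, best_log_post = None, -np.inf
    for c in np.unique(y_train):
        Xc = X_train[y_train == c]
        prior = len(Xc) / len(X_train)                       # P(y): class frequency
        mean, var = Xc.mean(axis=0), Xc.var(axis=0) + 1e-9
        # sum over features of log P(x_i | y) for a Gaussian
        log_likelihood = np.sum(-0.5 * np.log(2 * np.pi * var)
                                - (x_new - mean) ** 2 / (2 * var))
        log_post = log_likelihood + np.log(prior)            # log P(x_1|y) + ... + log P(y)
        if log_post > best_log_post:
            best_class, best_log_post = c, log_post
    return best_class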
PCA.py
ADDED
@@ -0,0 +1,78 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.decomposition import PCA as PCA_sk
import matplotlib.pyplot as plt

class PCA_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **PCA (Principal Component Analysis)**

The main goal of this method is to define a new set of dimensions for the data set (these new dimensions are orthogonal and therefore independent).

**Variance**

$$
var(X) = \frac{1}{n} \sum (X_{i} - \bar{X})^2
$$

**Covariance matrix**

$$
Cov(X, Y) = \frac{1}{n} \sum (X_{i} - \bar{X})(Y_{i} - \bar{Y})^T
$$

$$
Cov(X, X) = \frac{1}{n} \sum (X_{i} - \bar{X})(X_{i} - \bar{X})^T
$$

**Eigenvalues and eigenvectors**

The eigenvectors point in the direction of maximum variance, and the corresponding eigenvalue indicates the importance of that vector.

$$
A \vec{v} = \lambda \vec{v}
$$

**Method**
- Subtract the mean from the vector X.
- Compute Cov(X, X).
- Compute the eigenvectors and eigenvalues of the covariance matrix.
- Sort the eigenvectors by importance (based on their eigenvalues) in decreasing order.
- Keep the first k eigenvectors; these become the new k dimensions.
- Finally, transform (project) the data onto the new dimensions (done with a dot product).'''

        self.x_feature = 1
        self.y_feature = 2
        self.n_components = 2

    def params(self):
        n_features = int(self.database.data.shape[1])
        self.n_components = st.slider('Number of components', 1, n_features, 2)

    def solve(self):
        self.x_feature = st.slider('Component on x axis', 1, self.n_components, 1)
        self.y_feature = st.slider('Component on y axis', 1, self.n_components, 2)
        X = self.database.data
        y = self.database.target
        sklearn_clus = PCA_sk(n_components=self.n_components)
        sklearn_clus.fit(X)
        X_proyected_sk = sklearn_clus.transform(X)

        x1 = X_proyected_sk[:, self.x_feature-1]
        x2 = X_proyected_sk[:, self.y_feature-1]

        plt.figure(1, figsize=(12, 8))
        plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8, cmap=plt.cm.get_cmap('viridis', len(y)))
        plt.xlabel(f'Component {self.x_feature}')
        plt.ylabel(f'Component {self.y_feature}')
        plt.colorbar()

        #fig = plt.show().get_fig()
        return plt.gcf()
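The method listed above maps step by step onto a short NumPy sketch (illustrative only, not sklearn's PCA used by the class):

# Editorial sketch (not part of the upload): PCA projection via covariance eigendecomposition.
import numpy as np

def pca_project(X, k):
    X_centered = X - X.mean(axis=0)                  # subtract the mean
    cov = np.cov(X_centered, rowvar=False)           # Cov(X, X)
    eigvals, eigvecs = np.linalg.eigh(cov)           # eigenvalues / eigenvectors
    order = np.argsort(eigvals)[::-1]                # sort by eigenvalue, decreasing
    components = eigvecs[:, order[:k]]               # keep the top-k principal directions
    return X_centered @ components                   # project (dot product)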
Perceptron.py
ADDED
@@ -0,0 +1,73 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron as P_sk
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


class perceptron_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Perceptron**

This is the simplest model and serves as an introduction to neural network models. Its behaviour is quite similar to the linear regression model, with the difference that it applies an activation function at the output (a **non-linear function**).

**Linear model**

$$
f(w, b) = w^{t}x + b
$$

**Activation function (unit step)**

$$
z(x) = \begin{cases} 1 & if \quad x \geq 0 \\ 0 & otherwise \end{cases}
$$

**Approximation (prediction)**

$$
\hat{y} = z(w^{t}x + b)
$$

**Update rule (the bias is included here)**

$$
w = w + \Delta w = w + lr(y_{i} - \hat{y_{i}})x_{i}
$$
        '''

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_regr = P_sk(random_state=1234)
        self.sklearn_regr.fit(X_train, y_train)
        y_pred = self.sklearn_regr.predict(X_test)
        acc = mean_squared_error(y_pred, y_test)
        st.metric('MSE (Mean Squared Error)', value=f'{np.round(acc, 2)}')

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)

        self.X = self.database.data[:, self.x_feature-1:self.x_feature]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_regr = P_sk(random_state=1234)
        self.sklearn_regr.fit(X_train, y_train)

        x1_min = self.X.min()
        x1_max = self.X.max()

        x_pred = np.linspace(x1_min, x1_max, 100).reshape([100, 1])
        y_pred = self.sklearn_regr.predict(x_pred)

        plt.figure(1, figsize=(12, 8))
        plt.scatter(self.X, self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.plot(x_pred, y_pred)
        return plt.gcf()
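The step activation and update rule above can be sketched as follows, assuming labels in {0, 1} and the bias folded into the weight vector; illustrative only, not sklearn's Perceptron used by the class.

# Editorial sketch (not part of the upload): perceptron training with the classic update rule.
import numpy as np

def perceptron_train(X, y, lr=0.01, n_iters=100):
    Xb = np.c_[X, np.ones(len(X))]                   # append a constant 1 feature for the bias
    w = np.zeros(Xb.shape[1])
    for _ in range(n_iters):
        for xi, yi in zip(Xb, y):
            y_hat = 1 if xi @ w >= 0 else 0          # unit-step activation
            w += lr * (yi - y_hat) * xi              # w = w + lr*(y - y_hat)*x
    return w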
Random_forest.py
ADDED
@@ -0,0 +1,98 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix



class random_forest_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = '''
# **Random Forest**

This algorithm is built on top of the **Decision Tree** algorithm. The procedure is:

- Define the number of estimators (**Decision Trees**).
- Train each estimator on a sample of the training set, so that the number of features and the amount of data used to train each estimator varies.
- Then, to produce the algorithm's prediction, each estimator is asked for its prediction and the most "**voted**" option is chosen "**democratically**".
        '''
        self.n_trees = 100
        self.min_samples_split = 2
        self.max_depth = 100
        self.n_feats = None
        self.stop_criterion = 'max_depth'


    def params(self):
        self.stop_criterion = st.radio('Stopping criterion:', options=['max_depth', 'min_samples_split'])
        if self.stop_criterion == 'max_depth': self.max_depth = st.slider('max_depth value:', 1, 100, 10)
        elif self.stop_criterion == 'min_samples_split': self.min_samples_split = st.slider('min_samples_split value:', 2, 1000, 5)
        self.n_trees = st.slider('Number of estimators: ', 1, 100, 3)
        self.n_feats = st.slider('Fraction of features used to build each estimator: ', 0.0, 1.0, 0.5)

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        if self.stop_criterion == 'max_depth': self.sklearn_clf = rf(n_estimators=self.n_trees,
                                                                     max_depth=self.max_depth,
                                                                     max_features=self.n_feats,
                                                                     random_state=1234)
        elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = rf(n_estimators=self.n_trees,
                                                                               min_samples_split=self.min_samples_split,
                                                                               max_features=self.n_feats,
                                                                               random_state=1234)

        #self.sklearn_clf = rf(n_estimators=self.n_trees)
        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Feature on x axis', 1, n_features, 1)
        self.y_feature = st.slider('Feature on y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        if self.stop_criterion == 'max_depth': self.sklearn_clf = rf(n_estimators=self.n_trees,
                                                                     max_depth=self.max_depth,
                                                                     max_features=self.n_feats,
                                                                     random_state=1234)
        elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = rf(n_estimators=self.n_trees,
                                                                               min_samples_split=self.min_samples_split,
                                                                               max_features=self.n_feats,
                                                                               random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the decision-boundary grid
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
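The bagging-and-vote procedure described above can be sketched on top of sklearn's DecisionTreeClassifier. The sketch below is illustrative, assumes integer class labels, and is not how RandomForestClassifier is implemented internally.

# Editorial sketch (not part of the upload): bootstrap samples of trees plus a majority vote.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def random_forest_fit_predict(X_train, y_train, X_test, n_trees=10, max_features=0.5):
    rng = np.random.default_rng(0)
    trees = []
    for _ in range(n_trees):
        idx = rng.integers(0, len(X_train), len(X_train))    # bootstrap sample of the training data
        tree = DecisionTreeClassifier(max_features=max_features)
        tree.fit(X_train[idx], y_train[idx])
        trees.append(tree)
    votes = np.array([t.predict(X_test) for t in trees])     # each estimator votes
    # majority vote per test sample
    return np.array([np.bincount(col.astype(int)).argmax() for col in votes.T])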
SVC.py
ADDED
@@ -0,0 +1,143 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


class SVC_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **Support Vector Machine**

This algorithm searches for a hyperplane that separates the data, subject to these two conditions:

$$
wx - b = 0
$$

$$
\max \quad \frac{2}{||w||}
$$

**Linear model (2 classes: 1 and -1)**

$$
wx - b = 0
$$

$$
wx_{i} - b \geq 1 \quad \text{if} \quad y_{i} = 1
$$

$$
wx_{i} - b \leq -1 \quad \text{if} \quad y_{i} = -1
$$

**These three equations can be summarized as:**

$$
y_{i}(wx_{i} - b) \geq 1
$$

**Cost function (loss)**

$$
loss = \lambda||w||^2 + \frac{1}{n} \sum_{i=1}^{n} \max(0, 1-y_{i}(wx_{i}-b))
$$

The **derivatives** with respect to the parameters therefore follow these rules:

- if $y_{i}(xw - b) \geq 1$:

$$
\left[\begin{array}{ll} \frac{d_{loss}}{d_{w_{k}}} \\ \frac{d_{loss}}{db} \end{array} \right] = \left[\begin{array}{ll} 2 \lambda w_{k} \\ 0 \end{array} \right]
$$

- if $y_{i}(xw - b) < 1$:

$$
\left[\begin{array}{ll}\frac{d_{loss}}{d_{w_{k}}} \\ \frac{d_{loss}}{db} \end{array} \right] = \left[\begin{array}{ll} 2\lambda w_{k} - y_{i} \cdot x_{i} \\ y_{i} \end{array} \right]
$$

**Update rules (Gradient Descent)**

- Initialize the parameters
- Iterate
    - Compute the loss
    - Compute the gradient
    - Update the parameters

$$
w = w - lr \cdot dw
$$

$$
b = b - lr \cdot db
$$

- Stop iterating
'''
        self.kernel = 'linear'
        self.gamma = 2
        self.degree = 3

    def params(self):
        tipo = st.selectbox('Kernel type', options=['linear',
                                                    'poly',
                                                    'rbf'])
        self.kernel = tipo
        self.gamma = st.slider('Gamma parameter', 1, 10, 2)
        if tipo == 'poly': self.degree = st.slider('Polynomial degree', 1, 10, 3)

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        # degree is only used by the 'poly' kernel; scikit-learn ignores it otherwise
        self.sklearn_clf = svm.SVC(kernel=self.kernel, gamma=self.gamma, degree=self.degree, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)
        y_pred = self.sklearn_clf.predict(X_test)
        acc = accuracy_score(y_pred, y_test)

        c1, c2 = st.columns([4, 1])
        c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
        df = pd.DataFrame(confusion_matrix(y_pred, y_test))
        labels = self.database.target_names
        df.columns = labels
        df.index = labels
        c1.write('**Confusion Matrix**')
        c1.dataframe(df)

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Variable on the x axis', 1, n_features, 1)
        self.y_feature = st.slider('Variable on the y axis', 1, n_features, 2)

        self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        self.sklearn_clf = svm.SVC(kernel=self.kernel, gamma=self.gamma, degree=self.degree, random_state=1234)
        self.sklearn_clf.fit(X_train, y_train)

        x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
        x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
        h = 0.02  # step size of the mesh
        x1_i = np.arange(x1_min, x1_max, h)
        x2_i = np.arange(x2_min, x2_max, h)
        x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
        y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
        y_pred = y_pred.reshape(x1_x1.shape)

        plt.figure(1, figsize=(12, 8))
        plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
        plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.xlim(x1_x1.min(), x1_x1.max())
        plt.ylim(x2_x2.min(), x2_x2.max())
        return plt.gcf()
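The `desc` above spells out the hinge-loss gradients and update rules, while `SVC.py` itself delegates the fitting to scikit-learn's `svm.SVC`. A minimal NumPy sketch of the described training loop could look like the one below; it is illustrative only and not part of the uploaded files, and the function names and hyperparameter defaults are made up.

```python
import numpy as np

def fit_linear_svm(X, y, lr=0.001, lambda_param=0.01, n_iters=1000):
    """Gradient-descent training of a linear SVM with hinge loss.

    y must contain the labels -1 and 1, as assumed in the desc above.
    """
    n_samples, n_features = X.shape
    w = np.zeros(n_features)
    b = 0.0
    for _ in range(n_iters):
        for x_i, y_i in zip(X, y):
            # margin condition y_i * (w.x_i - b) >= 1
            if y_i * (np.dot(x_i, w) - b) >= 1:
                # only the regularizer contributes: dw = 2*lambda*w, db = 0
                w -= lr * (2 * lambda_param * w)
            else:
                # dw = 2*lambda*w - y_i*x_i, db = y_i
                w -= lr * (2 * lambda_param * w - y_i * x_i)
                b -= lr * y_i
    return w, b

def predict_linear_svm(X, w, b):
    # sign of the signed distance to the hyperplane w.x - b = 0
    return np.sign(np.dot(X, w) - b)
```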
SVR.py
ADDED
@@ -0,0 +1,139 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

class SVR_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **SVR (Support Vector Regression)**

The goal is to find a function $f(x)$ whose output deviates from the target value $y$ by no more than $\epsilon$ for each training point $x$.

**Linear SVM Regression: Primal Formula**

Suppose we are working with a (multivariate) data set X and a dependent variable y.

The linear function is then:

$$
f(X) = X^{T}\beta + b
$$

To keep the parameters $\beta$ as small (flat) as possible, we minimize:

$$
J(\beta) = \frac{1}{2}\beta^{T}\beta
$$

subject to the following constraints:

$$
|y_{n} - (X_{n}^{T}\beta + b)| \leq \epsilon \quad \forall n \in N
$$

Since a function $f(x)$ satisfying these constraints for every point may not exist, the slack variables $\ell_{n}$ and $\ell_{n}^{*}$ are introduced.

The objective function then becomes:

$$
J(\beta) = \frac{1}{2}\beta^{T}\beta + C\sum_{n=1}^{N} (\ell_{n} + \ell_{n}^{*})
$$

subject to:

$$
y_{n} - (X_{n}^{T} \beta + b) \leq \epsilon + \ell_{n} \quad \forall n \in N
$$

$$
(X_{n}^{T} \beta + b) - y_{n} \leq \epsilon + \ell_{n}^{*} \quad \forall n \in N
$$

$$
\ell_{n}^{*} \geq 0 \quad \forall n \in N
$$

$$
\ell_{n} \geq 0 \quad \forall n \in N
$$

**Note**: $C$ is a positive constant that penalizes the observations falling outside the $\epsilon$ margin.

**Nonlinear SVM Regression**

If a linear model does not fit the problem well, the whole formulation can be adapted by replacing the dot product $X_{i}^{T}X_{j}$ with a kernel function $G(X_{i}, X_{j})$.

| Kernel Name | Kernel Function |
|-------------|-----------------|
|Linear (dot product)| $G(X_{i}, X_{j}) = X_{i}^{T}X_{j}$|
|Gaussian|$G(X_{i}, X_{j}) = e^{-\lVert X_{i} - X_{j} \rVert^{2}}$|
|Polynomial|$G(X_{i}, X_{j}) = (1 + X_{i}^{T}X_{j})^{q}$|

**Note:** $q$ is the degree of the polynomial

**Source**: https://www.mathworks.com/help/stats/understanding-support-vector-machine-regression.html

'''

    def params(self):
        self.selected_kernel = st.selectbox('Kernel type:', options=['linear', 'poly', 'rbf', 'sigmoid'])
        if self.selected_kernel == 'poly': self.degree = st.slider('Polynomial degree', 1, 6, 3)
        y_min = float(np.min([0, np.min(self.database.target)]))/2
        y_max = float(np.max(self.database.target))/2
        y_mean = float(np.mean(self.database.target))/2
        self.C = st.slider('Penalty parameter C:', 1.0, 4*y_max, 4*y_mean)
        self.epsilon = st.slider('Epsilon: ', y_min, y_max, y_mean)

    def solve(self):
        self.X, self.y = self.database.data, self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        if self.selected_kernel == 'poly': self.sklearn_regr = SVR(kernel=self.selected_kernel,
                                                                   degree=self.degree,
                                                                   C=self.C,
                                                                   epsilon=self.epsilon)
        else: self.sklearn_regr = SVR(kernel=self.selected_kernel,
                                      C=self.C,
                                      epsilon=self.epsilon)
        self.sklearn_regr.fit(X_train, y_train)
        y_pred = self.sklearn_regr.predict(X_test)
        acc = mean_squared_error(y_pred, y_test)
        st.metric('MSE (Mean Squared Error)', value=f'{np.round(acc, 2)}')

    def visualization(self):
        n_features = int(self.database.data.shape[1])
        self.x_feature = st.slider('Variable on the x axis', 1, n_features, 1)

        self.X = self.database.data[:, self.x_feature-1:self.x_feature]
        self.y = self.database.target
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
        if self.selected_kernel == 'poly': self.sklearn_regr = SVR(kernel=self.selected_kernel,
                                                                   degree=self.degree,
                                                                   C=self.C,
                                                                   epsilon=self.epsilon)
        else: self.sklearn_regr = SVR(kernel=self.selected_kernel,
                                      C=self.C,
                                      epsilon=self.epsilon)
        self.sklearn_regr.fit(X_train, y_train)

        x1_min = self.X.min()
        x1_max = self.X.max()

        x_pred = np.linspace(x1_min, x1_max, 100).reshape([100, 1])
        y_pred = self.sklearn_regr.predict(x_pred)
        y_pred_up = [i+self.epsilon for i in y_pred]
        y_pred_down = [i-self.epsilon for i in y_pred]

        plt.figure(1, figsize=(12, 8))
        plt.scatter(self.X, self.y, edgecolors='k', cmap=plt.cm.Paired)
        plt.plot(x_pred, y_pred, color='red')  # prediction line
        plt.plot(x_pred, y_pred_up, linestyle='--', color='green')    # upper edge of the epsilon tube
        plt.plot(x_pred, y_pred_down, linestyle='--', color='green')  # lower edge of the epsilon tube
        return plt.gcf()
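The `desc` above introduces the $\epsilon$-insensitive tube and the slack variables $\ell_{n}$, $\ell_{n}^{*}$, which `SVR.py` never computes explicitly. The short sketch below is illustrative only; the toy dataset and parameter values are made up. It fits scikit-learn's `SVR` and measures how far each training point falls outside the tube, which is exactly what the slack terms penalize.

```python
import numpy as np
from sklearn.svm import SVR

# Toy 1-D data: y = 2x + noise (made-up example, not one of the app's datasets)
rng = np.random.RandomState(0)
X = np.linspace(0, 10, 50).reshape(-1, 1)
y = 2 * X.ravel() + rng.normal(scale=1.0, size=50)

epsilon = 1.5
regr = SVR(kernel='linear', C=10.0, epsilon=epsilon)
regr.fit(X, y)

# epsilon-insensitive residuals: zero inside the tube, positive outside,
# which is what the slack variables l_n / l_n* measure
residuals = np.abs(y - regr.predict(X))
slack = np.maximum(0.0, residuals - epsilon)
print(f'{np.sum(slack > 0)} of {len(y)} points fall outside the epsilon tube')
```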
k_mean_clustering.py
ADDED
@@ -0,0 +1,79 @@
import streamlit as st
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.cluster import KMeans as KM
import matplotlib.pyplot as plt


def plot(X, clusters, centroids, x_feature, y_feature):
    fig, ax = plt.subplots(figsize=(12, 8))

    # samples of each cluster
    for i, index in enumerate(clusters):
        x = X[index].T[x_feature-1]
        y = X[index].T[y_feature-1]
        point = np.array([x, y])
        ax.scatter(*point)

    # cluster centroids
    for point in centroids:
        x = point[x_feature-1]
        y = point[y_feature-1]
        points = np.array([x, y])
        ax.scatter(*points, marker="o", linewidth=15)

    return fig

class k_mean_clustering_st:
    def __init__(self, database, test_size=0.2):
        self.database = database
        self.test_size = test_size
        self.desc = r'''
# **K-Means Clustering**

The goal this time is to segment unlabeled data (**unsupervised learning**).

The method assigns each sample to a cluster based on its distance to the cluster centroids.

**Iterative Optimization**

- Initialize the centroids at random
- Iterate until convergence
    - Update the cluster assignment of each sample using the closest centroid.
    - Update each centroid (the mean position of the samples assigned to that cluster).


**Distance between vectors**

$$
d(p, q) = \sqrt{\sum (p_{i} - q_{i})^{2}}
$$
'''
        self.x_feature = 1
        self.y_feature = 2
        self.n_clusters = 3
        self.max_iter = 150

    def params(self):
        self.n_features = int(self.database.data.shape[1])
        self.n_clusters = st.slider('Number of clusters', 1, 10, 3)
        self.max_iter = st.slider('Maximum number of iterations', 100, 200, 150)

    def solve(self):
        self.x_feature = st.slider('Variable on the x axis', 1, self.n_features, 1)
        self.y_feature = st.slider('Variable on the y axis', 1, self.n_features, 2)
        X = self.database.data
        sklearn_clus = KM(n_clusters=self.n_clusters, max_iter=self.max_iter)
        sklearn_clus.fit(X)
        pred = sklearn_clus.predict(X)
        classes = np.unique(pred)
        clusters = [[] for i in classes]
        for idx, value in enumerate(pred):
            clusters[value].append(idx)

        return plot(X=X,
                    clusters=clusters,
                    centroids=sklearn_clus.cluster_centers_,
                    x_feature=self.x_feature,
                    y_feature=self.y_feature)