raaraya committed on
Commit
42d0bac
1 Parent(s): 8b2613f

Upload 15 files

Files changed (15)
  1. Ada_boost.py +130 -0
  2. Agglomerative_clustering.py +50 -0
  3. Decision_tree.py +108 -0
  4. ICA.py +80 -0
  5. KNN.py +79 -0
  6. LDA.py +94 -0
  7. Linear_regression.py +82 -0
  8. Logit.py +106 -0
  9. Naive_bayes.py +97 -0
  10. PCA.py +78 -0
  11. Perceptron.py +73 -0
  12. Random_forest.py +98 -0
  13. SVC.py +143 -0
  14. SVR.py +139 -0
  15. k_mean_clustering.py +79 -0
Ada_boost.py ADDED
@@ -0,0 +1,130 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import AdaBoostClassifier as ABC
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class ada_boost_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **AdaBoost**
+
+ This algorithm works by combining several other classifiers so that, as an ensemble, they produce a single prediction.
+
+ Unlike Random Forest, the **vote** of each estimator does not carry the same weight: each estimator has a degree of importance (**weight**), and the algorithm's prediction is produced by weighting the estimators' votes.
+
+ **Weak Learner (Decision Stump)**
+
+ An algorithm that simply classifies the data according to a threshold (similar to a single step of the Decision Tree algorithm).
+
+ **Error**
+
+ - First iteration
+
+ $$
+ \epsilon_{1} = \frac{misclassified}{N}
+ $$
+
+ - From the second iteration onwards
+
+ $$
+ \epsilon_{t} = \sum weights
+ $$
+
+ Note: if the error is greater than 0.5, the classification is flipped and the error becomes $error = 1 - error$.
+
+ **Weights**
+
+ - At the start
+ $$
+ w_{0} = \frac{1}{N} \quad \text{for each sample}
+ $$
+
+ - Afterwards
+
+ $$
+ w = \frac{w \cdot e^{- \alpha y h(X)}}{\sum w}
+ $$
+
+ **Performance**
+
+ $$
+ \alpha = 0.5 \cdot log(\frac{1-\epsilon_{t}}{\epsilon_{t}})
+ $$
+
+ **Prediction**
+
+ $$
+ y = sign(\sum_{t}^{T} \alpha_{t} \cdot h(X))
+ $$
+
+ **Training**
+
+ The weight of every sample is initialized to $\frac{1}{N}$
+
+ - Train a weak classifier (search for the best feature and threshold to split on)
+ - Compute the error $\epsilon_{t} = \sum_{misclassified} weights$
+ - Flip the error and the polarity if the error is greater than 0.5
+ - Compute $\alpha = 0.5 \cdot log(\frac{1 - \epsilon_{t}}{\epsilon_{t}})$
+ - Update the weights: $w = \frac{w \cdot e^{- \alpha h(X)}}{Z}$
+
+ '''
+         self.n_clf = 5
+
+     def params(self):
+         self.n_clf = st.slider('Number of estimators',
+                                min_value=1,
+                                max_value=15,
+                                value=5)
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = ABC(n_estimators=self.n_clf, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = ABC(n_estimators=self.n_clf, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
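The update rules summarized in the description above are easy to check numerically. Below is a minimal, self-contained NumPy sketch of one boosting round with a fixed decision stump on a toy 1-D dataset; it is illustrative only (the stump threshold is assumed, and the uploaded file delegates training to sklearn's `AdaBoostClassifier`).

```python
import numpy as np

# Toy 1-D data with labels in {-1, +1}
X = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
y = np.array([1, 1, 1, -1, -1, 1])

w = np.ones(len(X)) / len(X)           # w0 = 1/N for every sample

# Decision stump assumed here: predict +1 if x < threshold, else -1
threshold = 3.5
h = np.where(X < threshold, 1, -1)

eps = w[h != y].sum()                  # error = sum of misclassified weights
alpha = 0.5 * np.log((1 - eps) / eps)  # performance of the stump

w = w * np.exp(-alpha * y * h)         # re-weight: mistakes gain weight
w = w / w.sum()                        # normalize (the Z in the description)

print(eps, alpha, w)
```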
Agglomerative_clustering.py ADDED
@@ -0,0 +1,50 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVR
+ from sklearn.cluster import AgglomerativeClustering
+ import matplotlib.pyplot as plt
+
+
+ class agglomerative_clustering_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Agglomerative Clustering**
+
+ Agglomerative Clustering is an algorithm that groups data hierarchically: every observation starts as its own cluster, and the most similar clusters are merged repeatedly until the desired number of clusters is reached.
+
+ **Method**
+ - Initialize every point as a cluster.
+ - Take the two closest clusters and merge them into a single cluster.
+ - Repeat the previous step until the desired number of clusters is reached.
+
+ **Criteria for measuring similarity between clusters** (see the sketch after this listing)
+
+ - Distance between the **closest** points of two different clusters.
+ - Distance between the **farthest** points of two different clusters.
+ - Distance between the means of the clusters.
+
+ '''
+         self.x_feature = 1
+         self.y_feature = 2
+         self.n_clusters = 3
+
+     def params(self):
+         n_targets = len(set(self.database.target))
+         self.n_clusters = st.slider('Number of clusters', 1, n_targets, 1)
+
+     def solve(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+         X = self.database.data
+         sklearn_clus = AgglomerativeClustering(self.n_clusters, linkage='single')
+         pred = sklearn_clus.fit_predict(X)
+         fig, ax = plt.subplots(figsize=(12, 8))
+         ax.scatter(X[:, self.x_feature-1], X[:, self.y_feature-1], c=pred)
+         plt.title(f'{self.n_clusters} Clusters')
+         return fig
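The three similarity criteria listed in the description differ only in how the distance between two groups of points is aggregated. Here is a tiny NumPy sketch of that comparison on two assumed toy clusters (illustrative only; the uploaded file uses sklearn's `AgglomerativeClustering` with single linkage).

```python
import numpy as np

cluster_a = np.array([[0.0, 0.0], [0.0, 1.0]])
cluster_b = np.array([[3.0, 0.0], [4.0, 1.0]])

# all pairwise distances between points of the two clusters
pairwise = np.linalg.norm(cluster_a[:, None, :] - cluster_b[None, :, :], axis=2)

single = pairwise.min()      # distance between the closest points
complete = pairwise.max()    # distance between the farthest points
centroid_dist = np.linalg.norm(cluster_a.mean(axis=0) - cluster_b.mean(axis=0))  # distance between the means

print(single, complete, centroid_dist)
```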
Decision_tree.py ADDED
@@ -0,0 +1,108 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class Decision_tree_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Decision Tree**
+
+ **Entropy**
+
+ $$
+ E = - \sum p(X) \cdot log_{2}(p(X))
+ $$
+
+ $$
+ p(X) = \frac{len(x)}{n}
+ $$
+
+ **Information gain**
+
+ $$
+ IG = E(parent) - [\text{weighted average}] \cdot E(children)
+ $$
+
+ **Method (to build the tree)**
+
+ - Start at the root node and, for each node, select the best split based on information gain.
+ - From the highest information gain, keep the feature and the threshold.
+ - Apply that split to each node, using the feature and threshold found.
+ - Iterate over these steps until some stopping criterion is met:
+     - **maximum depth**: maximum depth of the tree
+     - **minimum samples**: minimum number of samples a node may hold
+     - **no more class distribution**: there are no more samples to split
+
+ **Approximation (prediction)**
+
+ - Follow the splits in the order of the tree (top to bottom)
+ - When a leaf node is reached, predict the most common value among its samples.
+
+
+ '''
+         self.max_depth = 100
+         self.min_samples_split = 2
+         self.stop_criterion = 'max_depth'
+
+     def params(self):
+         self.stop_criterion = st.radio('Stopping criterion:', options=['max_depth', 'min_samples_split'])
+         if self.stop_criterion == 'max_depth': self.max_depth = st.slider('max_depth value:', 1, 100, 10)
+         elif self.stop_criterion == 'min_samples_split': self.min_samples_split = st.slider('min_samples_split value:', 2, 1000, 5)
+
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         if self.stop_criterion == 'max_depth': self.sklearn_clf = DecisionTreeClassifier(max_depth=self.max_depth, random_state=1234)
+         elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = DecisionTreeClassifier(min_samples_split=self.min_samples_split, random_state=1234)
+
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         if self.stop_criterion == 'max_depth': self.sklearn_clf = DecisionTreeClassifier(max_depth=self.max_depth, random_state=1234)
+         elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = DecisionTreeClassifier(min_samples_split=self.min_samples_split, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
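As a companion to the entropy and information-gain formulas in the description, here is a small NumPy sketch that scores one candidate split on assumed toy data (illustrative; the uploaded file uses sklearn's `DecisionTreeClassifier` rather than this computation).

```python
import numpy as np

def entropy(y):
    # E = -sum p * log2(p), with p = count / n
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(y, mask):
    # IG = E(parent) - weighted average of the children entropies
    n = len(y)
    left, right = y[mask], y[~mask]
    weighted = (len(left) / n) * entropy(left) + (len(right) / n) * entropy(right)
    return entropy(y) - weighted

y = np.array([0, 0, 0, 1, 1, 1, 1, 0])
x = np.array([1.0, 1.2, 1.4, 3.0, 3.2, 3.4, 3.6, 1.1])
print(information_gain(y, x < 2.0))   # score of the split "x < 2.0"
```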
ICA.py ADDED
@@ -0,0 +1,80 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVR
+ from sklearn.decomposition import FastICA
+ import matplotlib.pyplot as plt
+
+ class ICA_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **ICA (Independent Component Analysis)**
+
+ ICA is a method used to identify the components of a multivariate signal. It lets us extract a component that is mixed together with others.
+
+ - Subtract the mean $\bar{X}$ from $X$
+ - Whiten $X$ so that potential correlations between the components are removed and the variance of each component equals 1 (make the covariance matrix look like the identity matrix)
+
+ $$
+ \hat{x} = E \cdot D^{-1/2} \cdot E^{T} \cdot x
+ $$
+
+ - $D$: diagonal matrix of eigenvalues (of the covariance matrix)
+ - $E$: matrix of eigenvectors (of the covariance matrix)
+
+ - Pick random values to build the matrix $W$.
+ - Compute the new values of $W$
+
+ $$
+ w_{i} = \frac{1}{n} \sum X \cdot tanh(W^{T} \cdot X) - \frac{1}{n} \sum (1 - tanh^{2}(W^{T} \cdot X)) \cdot W
+ $$
+
+ $$
+ w_{i} = w_{i} - \sum_{j=1}^{p-1} (w_{p}^{T}w_{j})w_{j}
+ $$
+
+ - Normalize $w_{p}$
+
+ $$
+ w_{p} = \frac{w_{p}}{||w_{p}||}
+ $$
+
+ - Check the stopping condition; if it is not met, recompute the new values of $w$
+
+
+ $$
+ |w_{p}^{T}w_{p+1}| - 1 < Tolerance
+ $$
+
+ - Compute the independent sources as $S = W \cdot X$'''
+
+         self.x_feature = 1
+         self.y_feature = 2
+         self.n_components = 2
+
+     def params(self):
+         n_features = int(self.database.data.shape[1])
+         self.n_components = st.slider('Number of components', 1, n_features, 2)
+
+     def solve(self):
+         self.x_feature = st.slider('Component on the x axis', 1, self.n_components, 1)
+         self.y_feature = st.slider('Component on the y axis', 1, self.n_components, 2)
+         X = self.database.data
+         y = self.database.target
+         sklearn_clus = FastICA(n_components=self.n_components)
+         X_proyected_sk = sklearn_clus.fit_transform(X)
+
+         x1 = X_proyected_sk[:, self.x_feature-1]
+         x2 = X_proyected_sk[:, self.y_feature-1]
+
+         plt.figure(1, figsize=(12, 8))
+         plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8, cmap=plt.cm.get_cmap('viridis', len(y)))
+         plt.xlabel(f'Component {self.x_feature}')
+         plt.ylabel(f'Component {self.y_feature}')
+         plt.colorbar()
+
+         return plt.gcf()
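A minimal sketch of the whitening step described above, on assumed correlated toy data (illustrative only; the uploaded file relies on sklearn's `FastICA` for the full algorithm).

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3)) @ rng.normal(size=(3, 3))  # correlated toy data

Xc = X - X.mean(axis=0)                  # subtract the mean
cov = np.cov(Xc, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)   # D (eigenvalues), E (eigenvectors)

# Whitening: E * D^(-1/2) * E^T applied to every centered sample
W_whiten = eigvecs @ np.diag(eigvals ** -0.5) @ eigvecs.T
X_white = Xc @ W_whiten.T

# Covariance of the whitened data is (approximately) the identity matrix
print(np.round(np.cov(X_white, rowvar=False), 2))
```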
KNN.py ADDED
@@ -0,0 +1,79 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class KNN_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **KNN algorithm (K Nearest Neighbors)**
+
+ To predict the class of a new data point, this algorithm first computes the **Euclidean distance** to all the other points, then **selects the k points with the smallest distance** (the closest ones), and finally **assigns the class according to the mode** (the most frequent category) of those k selected points.
+
+ **Euclidean distance**
+
+ $$
+ Dist = \sqrt{\sum_{i=1}^{n} (Xtest_{i} - Xtrain_{i})^2}
+ $$
+
+ '''
+         self.neighbors = 5
+
+     def params(self):
+         self.neighbors = st.slider('Number of neighbors',
+                                    min_value=1,
+                                    max_value=15,
+                                    value=5)
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = KNeighborsClassifier(self.neighbors)
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = KNeighborsClassifier(self.neighbors)
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
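The distance-then-vote rule in the description can be written directly in a few lines of NumPy; this sketch on an assumed toy set is illustrative (the uploaded file uses sklearn's `KNeighborsClassifier`).

```python
import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_new, k=5):
    # Euclidean distance from x_new to every training point
    dists = np.sqrt(((X_train - x_new) ** 2).sum(axis=1))
    nearest = np.argsort(dists)[:k]        # indices of the k closest points
    votes = Counter(y_train[nearest])      # majority vote among the neighbors
    return votes.most_common(1)[0][0]

X_train = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.1], [0.9, 1.0], [1.2, 0.9]])
y_train = np.array([0, 0, 1, 1, 1])
print(knn_predict(X_train, y_train, np.array([1.0, 1.0]), k=3))   # -> 1
```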
LDA.py ADDED
@@ -0,0 +1,94 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVR
+ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+ import matplotlib.pyplot as plt
+
+ class LDA_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **LDA (Linear Discriminant Analysis)**
+
+ **Goal**
+
+ Reduce the number of variables (**features**).
+ The goal is to project a dataset onto a lower-dimensional space (similar to what is done with **PCA**).
+
+ **PCA vs LDA**
+
+ - **PCA**: find the axes that maximize the variance of the data.
+ - **LDA**: the interest is in the axes that maximize the separation between the data classes.
+ - **LDA** is a form of **supervised learning** (it uses the class labels of the data to train the algorithm), whereas **PCA** is a form of **unsupervised learning** (no labels).
+
+ **Within-class scatter matrix**
+
+ $$
+ S_{W} = \sum_{c} S_{c}
+ $$
+
+ $$
+ S_{c} = \sum_{i \in c} (x_{i} - \bar{x_{c}}) \cdot (x_{i} - \bar{x_{c}})^{T}
+ $$
+
+ **Between-class scatter matrix**
+
+ $$
+ S_{B} = \sum_{c} n_{c} \cdot (\bar{x_{c}} - \bar{x}) \cdot (\bar{x_{c}} - \bar{x})^{T}
+ $$
+
+ **Eigenvectors and eigenvalues**
+
+ Compute the eigenvectors and eigenvalues of the following matrix:
+
+ $$
+ S_{W}^{-1} S_{B}
+ $$
+
+
+ **Method**
+
+ - Compute $S_{B}$
+ - Compute $S_{W}$
+ - Compute the eigenvectors and eigenvalues of $S_{W}^{-1} S_{B}$
+ - Sort the eigenvectors by their eigenvalues in decreasing order
+ - Pick the first k eigenvectors, which will represent the new k dimensions
+ - Transform the data into the new dimensions (**done with a dot product**) '''
+
+         self.x_feature = 1
+         self.y_feature = 2
+         self.n_components = 2
+
+     def params(self):
+         self.n_clases = len(set(self.database.target))
+         self.n_features = int(self.database.data.shape[1])
+         self.min = int(np.min([self.n_clases-1, self.n_features]))
+         if self.min == 1: pass
+         elif self.min == 2: self.n_components = 2
+         else: self.n_components = st.slider('Number of components', 2, self.min, 2)
+
+     def solve(self):
+         if self.min == 1: pass
+         else:
+             self.x_feature = st.slider('Component on the x axis', 1, self.n_components, 1)
+             self.y_feature = st.slider('Component on the y axis', 1, self.n_components, 2)
+         X = self.database.data
+         y = self.database.target
+         sklearn_clus = LinearDiscriminantAnalysis(n_components=self.n_components)
+         sklearn_clus.fit(X, y)
+         X_proyected_sk = sklearn_clus.transform(X)
+
+         x1 = X_proyected_sk[:, self.x_feature-1]
+         x2 = X_proyected_sk[:, self.y_feature-1]
+
+         plt.figure(figsize=(12, 8))
+         plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8, cmap=plt.cm.get_cmap('viridis', len(y)))
+         plt.xlabel(f'Component {self.x_feature}')
+         plt.ylabel(f'Component {self.y_feature}')
+         plt.colorbar()
+
+         return plt.gcf()
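A short NumPy sketch of the scatter matrices and the eigen-decomposition listed in the method above, applied to the iris data (illustrative; the uploaded file uses sklearn's `LinearDiscriminantAnalysis`).

```python
import numpy as np
from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)
overall_mean = X.mean(axis=0)
n_feat = X.shape[1]

S_W = np.zeros((n_feat, n_feat))   # within-class scatter
S_B = np.zeros((n_feat, n_feat))   # between-class scatter
for c in np.unique(y):
    Xc = X[y == c]
    mean_c = Xc.mean(axis=0)
    S_W += (Xc - mean_c).T @ (Xc - mean_c)
    diff = (mean_c - overall_mean).reshape(-1, 1)
    S_B += len(Xc) * diff @ diff.T

eigvals, eigvecs = np.linalg.eig(np.linalg.inv(S_W) @ S_B)
order = np.argsort(eigvals.real)[::-1]
W = eigvecs.real[:, order[:2]]     # first 2 discriminant directions
X_proj = X @ W                     # project with a dot product
print(X_proj.shape)                # (150, 2)
```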
Linear_regression.py ADDED
@@ -0,0 +1,82 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import mean_squared_error
+ import matplotlib.pyplot as plt
+
+ class linear_regression_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Linear Regression**
+
+
+ **Prediction (approximation)**
+ $$
+ \hat{y} = wx + b
+ $$
+
+ **Cost function**
+
+ $$
+ Loss = MSE = \frac{1}{N} \sum_{i=1}^n (y_{i} - \hat{y_{i}})^2
+ $$
+
+ **Gradient**
+
+
+ $$
+ \left[\begin{array}{ll}\frac{d_{loss}}{dw} \\ \frac{d_{loss}}{db} \end{array} \right] = \left[\begin{array}{ll} \frac{1}{N} \sum -2x_{i}(y_{i} - (wx_{i} + b)) \\ \frac{1}{N} \sum -2(y_{i} - (wx_{i} + b)) \end{array} \right]
+ $$
+
+ **Gradient Descent method**
+
+ - Initialize the weights ($w$) and the bias ($b$)
+ - Iterate
+     - Compute the gradient
+     - Update the parameters (lr = learning rate)
+
+ $$
+ w = w - lr*dw
+ $$
+
+ $$
+ b = b - lr*db
+ $$
+
+ - Stop iterating
+ '''
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_regr = LinearRegression()
+         self.sklearn_regr.fit(X_train, y_train)
+         y_pred = self.sklearn_regr.predict(X_test)
+         acc = mean_squared_error(y_pred, y_test)
+         st.metric('MSE (Mean Squared Error)', value=f'{np.round(acc, 2)}')
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+
+         self.X = self.database.data[:, self.x_feature-1:self.x_feature]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_regr = LinearRegression()
+         self.sklearn_regr.fit(X_train, y_train)
+
+         x1_min = self.X.min()
+         x1_max = self.X.max()
+
+         x_pred = np.linspace(x1_min, x1_max, 100).reshape([100, 1])
+         y_pred = self.sklearn_regr.predict(x_pred)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.scatter(self.X, self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.plot(x_pred, y_pred)
+         return plt.gcf()
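A compact sketch of the gradient-descent loop described above, fitting one weight and one bias on assumed toy data (illustrative only; the uploaded file fits with sklearn's `LinearRegression`, which solves the problem directly).

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=100)
y = 3.0 * x + 2.0 + rng.normal(scale=0.5, size=100)   # ground truth: w=3, b=2

w, b, lr = 0.0, 0.0, 0.01
for _ in range(2000):
    y_hat = w * x + b
    dw = (-2 * x * (y - y_hat)).mean()   # d(loss)/dw
    db = (-2 * (y - y_hat)).mean()       # d(loss)/db
    w -= lr * dw                         # w = w - lr*dw
    b -= lr * db                         # b = b - lr*db

print(round(w, 2), round(b, 2))          # close to 3.0 and 2.0
```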
Logit.py ADDED
@@ -0,0 +1,106 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LogisticRegression as LR_sk
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class Logit_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Logistic Regression**
+
+ **Prediction (approximation)**
+
+ $$
+ z = wx + b
+ $$
+
+ $$
+ \hat{y} = \frac{1}{1+e^{-z}}
+ $$
+
+ **Loss function (cross entropy)**
+
+ $$
+ loss = -\frac{1}{N} \sum_{i=1}^{n} [y^{i}log(\hat{y}(x^{i})) + (1-y^{i})log(1 - \hat{y}(x^{i}))]
+ $$
+
+ **Gradients**
+
+ $$
+ \left[\begin{array}{ll} \frac{d_{loss}}{dw} \\ \frac{d_{loss}}{db} \end{array}\right] = \left[\begin{array}{ll} \frac{1}{N} \sum x_{i}(\hat{y} - y_{i}) \\ \frac{1}{N} \sum (\hat{y} - y_{i}) \end{array}\right]
+ $$
+
+ **Gradient Descent method**
+ - Initialize the parameters
+ - Iterate
+     - Compute the loss
+     - Update the weights ($lr$ = learning rate)
+
+ $$
+ w = w - lr*dw
+ $$
+
+ $$
+ b = b - lr*db
+ $$
+
+ - Stop iterating
+ '''
+         self.x_feature = 1
+         self.y_feature = 2
+
+     def params(self):
+         pass
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = LR_sk(max_iter=1000, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = LR_sk(max_iter=1000, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
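The sigmoid and gradient step above translate almost verbatim into NumPy; here is a small binary-classification sketch on assumed toy data (illustrative; the uploaded file uses sklearn's `LogisticRegression`).

```python
import numpy as np

rng = np.random.default_rng(1)
x = np.concatenate([rng.normal(-2, 1, 50), rng.normal(2, 1, 50)])
y = np.concatenate([np.zeros(50), np.ones(50)])

w, b, lr = 0.0, 0.0, 0.1
for _ in range(500):
    z = w * x + b
    y_hat = 1.0 / (1.0 + np.exp(-z))     # sigmoid
    dw = (x * (y_hat - y)).mean()        # d(loss)/dw
    db = (y_hat - y).mean()              # d(loss)/db
    w -= lr * dw
    b -= lr * db

acc = ((1.0 / (1.0 + np.exp(-(w * x + b))) > 0.5) == y).mean()
print(round(w, 2), round(b, 2), acc)
```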
Naive_bayes.py ADDED
@@ -0,0 +1,97 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class naive_bayes_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Naive Bayes**
+
+ I did not know this algorithm before; from what I have seen so far, it works as a **classifier** built mainly on **Bayes' theorem**.
+
+ **Bayes' theorem**
+
+ $$
+ P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)}
+ $$
+
+ To take advantage of this theorem, the attributes or **components of the vector X must be independent of each other (the events are assumed to be independent)**.
+
+ $$
+ P(y|X) = \frac{P(X|y) \cdot P(y)}{P(X)} = \frac{P(x_{1}|y) \quad ... \quad P(x_{n}|y) \cdot P(y)}{P(X)}
+ $$
+
+ The class assigned to the vector X is then chosen by computing all the conditional probabilities (**note**: $P(X)$ can be dropped, since it appears in every equation):
+
+
+ $$
+ y = argmax_{y} \quad P(x_{1}|y) \quad ... \quad P(x_{n}|y) \cdot P(y)
+ $$
+
+ $$
+ y = argmax_{y} \quad log(P(x_{1}|y)) + \quad ... \quad + log(P(x_{n}|y)) + log(P(y))
+ $$
+
+
+ **Finally, we still need to define:**
+
+ $P(y)$: frequency (how often the class y appears in the data)
+
+ $$
+ P(x_{i}|y) = \frac{1}{\sqrt{2\pi \sigma_{y}^{2}}} \cdot e^{(-\frac{(x_{i} - \mu_{y})^2}{2\sigma_{y}^{2}})}
+ $$
+ '''
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = GaussianNB()
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = GaussianNB()
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
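The log-argmax rule and the Gaussian likelihood above can be checked with a few lines of NumPy on iris; this is an illustrative sketch (the uploaded file uses sklearn's `GaussianNB`).

```python
import numpy as np
from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)
classes = np.unique(y)

# Per-class priors, means and variances estimated from the data
priors = np.array([(y == c).mean() for c in classes])
means = np.array([X[y == c].mean(axis=0) for c in classes])
variances = np.array([X[y == c].var(axis=0) for c in classes])

def predict(x):
    # log P(y) + sum_i log P(x_i | y), with a Gaussian likelihood per feature
    log_post = np.log(priors) + np.sum(
        -0.5 * np.log(2 * np.pi * variances)
        - (x - means) ** 2 / (2 * variances), axis=1)
    return classes[np.argmax(log_post)]

preds = np.array([predict(x) for x in X])
print((preds == y).mean())   # training accuracy, around 0.96
```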
PCA.py ADDED
@@ -0,0 +1,78 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVR
+ from sklearn.decomposition import PCA as PCA_sk
+ import matplotlib.pyplot as plt
+
+ class PCA_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **PCA (Principal Component Analysis)**
+
+ The main goal of this method is to define a new set of dimensions for the dataset (these new dimensions are orthogonal and therefore independent).
+
+ **Variance**
+
+ $$
+ var(X) = \frac{1}{n} \sum (X_{i} - \bar{X})^2
+ $$
+
+ **Covariance matrix**
+
+ $$
+ Cov(X, Y) = \frac{1}{n} \sum (X_{i} - \bar{X})(Y_{i} - \bar{Y})^T
+ $$
+
+ $$
+ Cov(X, X) = \frac{1}{n} \sum (X_{i} - \bar{X})(X_{i} - \bar{X})^T
+ $$
+
+ **Eigenvalues and eigenvectors**
+
+ The eigenvectors point in the directions of maximum variance, and the corresponding eigenvalue indicates the importance of each vector.
+
+ $$
+ A \vec{v} = \lambda \vec{v}
+ $$
+
+ **Method**
+ - Subtract the mean from the vector X.
+ - Compute Cov(X, X).
+ - Compute the eigenvectors and eigenvalues of the covariance matrix.
+ - Sort the eigenvectors by importance (by their eigenvalues) in decreasing order.
+ - Pick the first k eigenvectors; these become the new k dimensions.
+ - Finally, transform (project) the data onto the new dimensions (this is done with a dot product).'''
+
+         self.x_feature = 1
+         self.y_feature = 2
+         self.n_components = 2
+
+     def params(self):
+         n_features = int(self.database.data.shape[1])
+         self.n_components = st.slider('Number of components', 1, n_features, 2)
+
+     def solve(self):
+         self.x_feature = st.slider('Component on the x axis', 1, self.n_components, 1)
+         self.y_feature = st.slider('Component on the y axis', 1, self.n_components, 2)
+         X = self.database.data
+         y = self.database.target
+         sklearn_clus = PCA_sk(n_components=self.n_components)
+         sklearn_clus.fit(X)
+         X_proyected_sk = sklearn_clus.transform(X)
+
+         x1 = X_proyected_sk[:, self.x_feature-1]
+         x2 = X_proyected_sk[:, self.y_feature-1]
+
+         plt.figure(1, figsize=(12, 8))
+         plt.scatter(x1, x2, c=y, edgecolors='none', alpha=0.8, cmap=plt.cm.get_cmap('viridis', len(y)))
+         plt.xlabel(f'Component {self.x_feature}')
+         plt.ylabel(f'Component {self.y_feature}')
+         plt.colorbar()
+
+         return plt.gcf()
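A direct NumPy translation of the PCA method listed above, applied to iris (illustrative; the uploaded file uses sklearn's `PCA`).

```python
import numpy as np
from sklearn import datasets

X, _ = datasets.load_iris(return_X_y=True)

Xc = X - X.mean(axis=0)                  # subtract the mean
cov = np.cov(Xc, rowvar=False)           # Cov(X, X)
eigvals, eigvecs = np.linalg.eigh(cov)   # eigenvalues / eigenvectors

order = np.argsort(eigvals)[::-1]        # sort by importance, decreasing
components = eigvecs[:, order[:2]]       # keep the first k=2 eigenvectors
X_proj = Xc @ components                 # project with a dot product

print(X_proj.shape)                      # (150, 2)
print(eigvals[order] / eigvals.sum())    # explained variance ratio
```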
Perceptron.py ADDED
@@ -0,0 +1,73 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import Perceptron as P_sk
+ from sklearn.metrics import mean_squared_error
+ import matplotlib.pyplot as plt
+
+
+ class perceptron_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Perceptron**
+
+ This is the simplest model and serves as an introduction to neural-network models. Its behaviour is quite similar to the linear regression model, with the difference that it applies an activation function to the output (a **non-linear function**).
+
+ **Linear model**
+
+ $$
+ f(w, b) = w^{t}x + b
+ $$
+
+ **Activation function (unit step)**
+
+ $$
+ z(x) = \begin{cases} 1 & \text{if } x \geq 0 \\ 0 & \text{otherwise} \end{cases}
+ $$
+
+ **Approximation (prediction)**
+
+ $$
+ \hat{y} = z(w^{t}x + b)
+ $$
+
+ **Update rule (the bias is included here)**
+
+ $$
+ w = w + \Delta w = w + lr(y_{i} - \hat{y_{i}})x_{i}
+ $$
+ '''
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_regr = P_sk(random_state=1234)
+         self.sklearn_regr.fit(X_train, y_train)
+         y_pred = self.sklearn_regr.predict(X_test)
+         acc = mean_squared_error(y_pred, y_test)
+         st.metric('MSE (Mean Squared Error)', value=f'{np.round(acc, 2)}')
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+
+         self.X = self.database.data[:, self.x_feature-1:self.x_feature]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_regr = P_sk(random_state=1234)
+         self.sklearn_regr.fit(X_train, y_train)
+
+         x1_min = self.X.min()
+         x1_max = self.X.max()
+
+         x_pred = np.linspace(x1_min, x1_max, 100).reshape([100, 1])
+         y_pred = self.sklearn_regr.predict(x_pred)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.scatter(self.X, self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.plot(x_pred, y_pred)
+         return plt.gcf()
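The update rule above in a few lines of NumPy, on an assumed linearly separable toy problem (illustrative; the uploaded file uses sklearn's `Perceptron`). The bias is folded into the weight vector by appending a constant 1 to each sample, which is one common way to "include the bias" as the description notes.

```python
import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-2, 0.5, (20, 2)), rng.normal(2, 0.5, (20, 2))])
y = np.concatenate([np.zeros(20), np.ones(20)])

Xb = np.hstack([X, np.ones((len(X), 1))])    # append 1 so the bias lives in w
w = np.zeros(3)
lr = 0.1

for _ in range(20):                          # a few epochs are enough here
    for xi, yi in zip(Xb, y):
        y_hat = 1.0 if xi @ w >= 0 else 0.0  # unit-step activation
        w += lr * (yi - y_hat) * xi          # w = w + lr * (y - y_hat) * x

preds = (Xb @ w >= 0).astype(float)
print(w, (preds == y).mean())                # perfect separation on this toy set
```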
Random_forest.py ADDED
@@ -0,0 +1,98 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import RandomForestClassifier as rf
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class random_forest_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = '''
+ # **Random Forest**
+
+ This algorithm is built on top of the **Decision Tree** algorithm. What it does is:
+
+ - Define a number of estimators (**Decision Trees**).
+ - Train each estimator on a sample of the training set, varying both the features and the amount of data used to train it.
+ - Then, to produce the algorithm's prediction, every estimator is asked for its prediction and the most "**voted**" option is chosen "**democratically**".
+ '''
+         self.n_trees = 100
+         self.min_samples_split = 2
+         self.max_depth = 100
+         self.n_feats = None
+         self.stop_criterion = 'max_depth'
+
+
+     def params(self):
+         self.stop_criterion = st.radio('Stopping criterion:', options=['max_depth', 'min_samples_split'])
+         if self.stop_criterion == 'max_depth': self.max_depth = st.slider('max_depth value:', 1, 100, 10)
+         elif self.stop_criterion == 'min_samples_split': self.min_samples_split = st.slider('min_samples_split value:', 2, 1000, 5)
+         self.n_trees = st.slider('Number of estimators: ', 1, 100, 3)
+         self.n_feats = st.slider('Fraction of features used to build each estimator: ', 0.0, 1.0, 0.5)
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         if self.stop_criterion == 'max_depth': self.sklearn_clf = rf(n_estimators=self.n_trees,
+                                                                      max_depth=self.max_depth,
+                                                                      max_features=self.n_feats,
+                                                                      random_state=1234)
+         elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = rf(n_estimators=self.n_trees,
+                                                                                min_samples_split=self.min_samples_split,
+                                                                                max_features=self.n_feats,
+                                                                                random_state=1234)
+
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         if self.stop_criterion == 'max_depth': self.sklearn_clf = rf(n_estimators=self.n_trees,
+                                                                      max_depth=self.max_depth,
+                                                                      max_features=self.n_feats,
+                                                                      random_state=1234)
+         elif self.stop_criterion == 'min_samples_split': self.sklearn_clf = rf(n_estimators=self.n_trees,
+                                                                                min_samples_split=self.min_samples_split,
+                                                                                max_features=self.n_feats,
+                                                                                random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
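A brief sketch of the bootstrap-and-vote idea described above, using sklearn decision trees as the base estimators on iris (illustrative; the uploaded file uses sklearn's `RandomForestClassifier` directly).

```python
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

X, y = datasets.load_iris(return_X_y=True)
rng = np.random.default_rng(0)

# train each estimator on a bootstrap sample, using a fraction of the features per split
trees = []
for i in range(25):
    idx = rng.choice(len(X), len(X), replace=True)
    trees.append(DecisionTreeClassifier(max_features=0.5, random_state=i).fit(X[idx], y[idx]))

# every tree votes; the most voted class wins
votes = np.array([t.predict(X) for t in trees])
majority = np.array([np.bincount(v).argmax() for v in votes.T])
print((majority == y).mean())
```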
SVC.py ADDED
@@ -0,0 +1,143 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn import svm
+ from sklearn.metrics import accuracy_score
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+
+
+ class SVC_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **Support Vector Machine**
+
+ The goal of this algorithm is to find a hyperplane that separates the data, subject to these two conditions:
+
+ $$
+ wx - b = 0
+ $$
+
+ $$
+ max \quad \frac{2}{||w||}
+ $$
+
+ **Linear model (2 classes: 1 and -1)**
+
+ $$
+ wx - b = 0
+ $$
+
+ $$
+ wx_{i} - b \geq 1 \quad \text{if} \quad y_{i} = 1
+ $$
+
+ $$
+ wx_{i} - b \leq -1 \quad \text{if} \quad y_{i} = -1
+ $$
+
+ **These three conditions can be summarized as:**
+
+ $$
+ y_{i}(wx_{i} - b) \geq 1
+ $$
+
+ **Cost function (loss)**
+
+ $$
+ loss = \lambda||w||^2 + \frac{1}{n} \sum_{i=1}^{n} max(0, 1-y_{i}(wx_{i}-b))
+ $$
+
+ The **derivatives** with respect to the parameters then follow these rules:
+
+ - if $y_{i}(xw - b) \geq 1$:
+
+ $$
+ \left[\begin{array}{ll} \frac{d_{loss}}{d_{w_{k}}} \\ \frac{d_{loss}}{db} \end{array} \right] = \left [\begin{array}{ll} 2 \lambda w_{k} \\ 0 \end{array} \right]
+ $$
+
+ - if $y_{i}(xw - b) < 1$:
+
+ $$
+ \left[\begin{array}{ll}\frac{d_{loss}}{d_{w_{k}}} \\ \frac{d_{loss}}{db} \end{array} \right] = \left[\begin{array}{ll} 2\lambda w_{k} - y_{i} \cdot x_{i} \\ y_{i} \end{array} \right]
+ $$
+
+ **Update rules (Gradient Descent)**
+
+ - Initialize the parameters
+ - Iterate
+     - Compute the loss
+     - Compute the gradient
+     - Update the parameters
+
+ $$
+ w = w - lr \cdot dw
+ $$
+
+ $$
+ b = b - lr \cdot db
+ $$
+
+ - Stop iterating
+ '''
+         self.kernel = 'linear'
+         self.gamma = 2
+         self.degree = 3
+
+     def params(self):
+         tipo = st.selectbox('Kernel type', options=['linear',
+                                                     'poly',
+                                                     'rbf'])
+         self.kernel = tipo
+         self.gamma = st.slider('Gamma parameter', 1, 10, 2)
+         if tipo == 'poly': self.degree = st.slider('Degree of the polynomial', 1, 10, 3)
+
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = svm.SVC(kernel=self.kernel, gamma=self.gamma, degree=self.degree, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+         y_pred = self.sklearn_clf.predict(X_test)
+         acc = accuracy_score(y_pred, y_test)
+
+         c1, c2 = st.columns([4, 1])
+         c2.metric('Accuracy', value=f'{np.round(acc, 2)*100}%')
+         df = pd.DataFrame(confusion_matrix(y_pred, y_test))
+         labels = self.database.target_names
+         df.columns = labels
+         df.index = labels
+         c1.write('**Confusion Matrix**')
+         c1.dataframe(df)
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, n_features, 2)
+
+         self.X = np.c_[self.database.data[:, self.x_feature-1:self.x_feature], self.database.data[:, self.y_feature-1:self.y_feature]]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         self.sklearn_clf = svm.SVC(kernel=self.kernel, gamma=self.gamma, degree=self.degree, random_state=1234)
+         self.sklearn_clf.fit(X_train, y_train)
+
+         x1_min, x1_max = self.X[:, 0].min() - 0.5, self.X[:, 0].max() + 0.5
+         x2_min, x2_max = self.X[:, 1].min() - 0.5, self.X[:, 1].max() + 0.5
+         h = 0.02  # grid step size
+         x1_i = np.arange(x1_min, x1_max, h)
+         x2_i = np.arange(x2_min, x2_max, h)
+         x1_x1, x2_x2 = np.meshgrid(x1_i, x2_i)
+         y_pred = self.sklearn_clf.predict(np.c_[x1_x1.ravel(), x2_x2.ravel()])
+         y_pred = y_pred.reshape(x1_x1.shape)
+
+         plt.figure(1, figsize=(12, 8))
+         plt.pcolormesh(x1_x1, x2_x2, y_pred, cmap=plt.cm.Paired)
+         plt.scatter(self.X[:, 0], self.X[:, 1], c=self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.xlim(x1_x1.min(), x1_x1.max())
+         plt.ylim(x2_x2.min(), x2_x2.max())
+         return plt.gcf()
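The subgradient rules above are exactly what a from-scratch linear SVM trainer iterates; here is a short illustrative sketch on an assumed separable toy set (the uploaded file trains with sklearn's `svm.SVC` instead).

```python
import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-2, 0.7, (30, 2)), rng.normal(2, 0.7, (30, 2))])
y = np.concatenate([-np.ones(30), np.ones(30)])     # labels in {-1, +1}

w = np.zeros(2)
b = 0.0
lr, lam = 0.01, 0.01

for _ in range(1000):
    for xi, yi in zip(X, y):
        if yi * (xi @ w - b) >= 1:       # correct side of the margin
            w -= lr * (2 * lam * w)
        else:                            # inside the margin or misclassified
            w -= lr * (2 * lam * w - yi * xi)
            b -= lr * yi

print(w, b, (np.sign(X @ w - b) == y).mean())
```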
SVR.py ADDED
@@ -0,0 +1,139 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVR
+ from sklearn.metrics import mean_squared_error
+ import matplotlib.pyplot as plt
+
+ class SVR_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **SVR (Support Vector Regression)**
+
+ The goal is to find the function $f(x)$ that produces the value $y$ with a deviation of no more than $\epsilon$ for each of the training points $x$.
+
+ **Linear SVM Regression: Primal Formula**
+
+ Suppose we are working with a multivariate dataset X and a dependent variable y.
+
+ The linear function is then:
+
+ $$
+ f(X) = X^{T}\beta + b
+ $$
+
+ To keep the parameters $\beta$ as small (flat) as possible, we minimize:
+
+ $$
+ J(\beta) = \frac{1}{2}\beta^{T}\beta
+ $$
+
+ subject to the following constraints:
+
+ $$
+ |y_{n} - (X_{n}^{T}\beta + b)| \leq \epsilon \quad \forall n \in N
+ $$
+
+ Since a function $f(x)$ satisfying these constraints may not exist, the terms $\ell_{n}$ and $\ell_{n}^{*}$ are introduced; they act as slack variables.
+
+ The objective function then becomes:
+
+ $$
+ J(\beta) = \frac{1}{2}\beta^{T}\beta + C\sum_{n=1}^{N} (\ell_{n} + \ell_{n}^{*})
+ $$
+
+ subject to:
+
+ $$
+ y_{n} - (X_{n}^{T} \beta +b) \leq \epsilon + \ell_{n} \quad \forall n \in N
+ $$
+
+ $$
+ (X_{n}^{T} \beta +b) - y_{n} \leq \epsilon + \ell_{n}^{*} \quad \forall n \in N
+ $$
+
+ $$
+ \ell_{n}^{*} \geq 0 \quad \forall n \in N
+ $$
+
+ $$
+ \ell_{n} \geq 0 \quad \forall n \in N
+ $$
+
+ **Note**: $C$ is a positive constant that penalizes observations that fall outside the margin $\epsilon$.
+
+ **Nonlinear SVM Regression**
+
+ If the problem cannot be fitted well with a linear model, the whole derivation can be adapted by replacing the dot product $X_{i}^{T}X_{j}$ with $G(X_{i}, X_{j})$.
+
+ | Kernel Name | Kernel Function |
+ |-------------|-----------------|
+ |Linear (dot product)| $G(X_{i}, X_{j}) = X_{i}^{T}X_{j}$|
+ |Gaussian|$G(X_{i}, X_{j}) = e^{-\|X_{i} - X_{j}\|^{2}}$|
+ |Polynomial|$G(X_{i}, X_{j}) = (1 + X_{i}^{T}X_{j})^{q}$|
+
+ **Note:** $q$ is the degree of the polynomial
+
+ **Source**: https://www.mathworks.com/help/stats/understanding-support-vector-machine-regression.html
+
+ '''
+
+     def params(self):
+         self.selected_kernel = st.selectbox('Kernel type:', options=['linear', 'poly', 'rbf', 'sigmoid'])
+         if self.selected_kernel == 'poly': self.degree = st.slider('Degree of the polynomial', 1, 6, 3)
+         min = float(np.min([0, np.min(self.database.target)]))/2
+         max = float(np.max(self.database.target))/2
+         mean = float(np.mean(self.database.target))/2
+         self.C = st.slider('Penalty parameter C:', 1.0, 4*max, 4*mean)
+         self.epsilon = st.slider('Epsilon: ', min, max, mean)
+
+     def solve(self):
+         self.X, self.y = self.database.data, self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         if self.selected_kernel == 'poly': self.sklearn_regr = SVR(kernel=self.selected_kernel,
+                                                                    degree=self.degree,
+                                                                    C=self.C,
+                                                                    epsilon=self.epsilon)
+         else: self.sklearn_regr = SVR(kernel=self.selected_kernel,
+                                       C=self.C,
+                                       epsilon=self.epsilon)
+         self.sklearn_regr.fit(X_train, y_train)
+         y_pred = self.sklearn_regr.predict(X_test)
+         acc = mean_squared_error(y_pred, y_test)
+         st.metric('MSE (Mean Squared Error)', value=f'{np.round(acc, 2)}')
+
+     def visualization(self):
+         n_features = int(self.database.data.shape[1])
+         self.x_feature = st.slider('Feature on the x axis', 1, n_features, 1)
+
+         self.X = self.database.data[:, self.x_feature-1:self.x_feature]
+         self.y = self.database.target
+         X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=1234)
+         if self.selected_kernel == 'poly': self.sklearn_regr = SVR(kernel=self.selected_kernel,
+                                                                    degree=self.degree,
+                                                                    C=self.C,
+                                                                    epsilon=self.epsilon)
+         else: self.sklearn_regr = SVR(kernel=self.selected_kernel,
+                                       C=self.C,
+                                       epsilon=self.epsilon)
+         self.sklearn_regr.fit(X_train, y_train)
+
+         x1_min = self.X.min()
+         x1_max = self.X.max()
+
+         x_pred = np.linspace(x1_min, x1_max, 100).reshape([100, 1])
+         y_pred = self.sklearn_regr.predict(x_pred)
+         y_pred_up = [i+self.epsilon for i in y_pred]
+         y_pred_down = [i-self.epsilon for i in y_pred]
+
+
+         plt.figure(1, figsize=(12, 8))
+         plt.scatter(self.X, self.y, edgecolors='k', cmap=plt.cm.Paired)
+         plt.plot(x_pred, y_pred, color='red')  # prediction line
+         plt.plot(x_pred, y_pred_up, linestyle='--', color='green')
+         plt.plot(x_pred, y_pred_down, linestyle='--', color='green')
+         return plt.gcf()
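A small sketch that evaluates the primal objective above for a candidate $(\beta, b)$ on assumed toy data, to make the roles of C, ε and the slack terms concrete (illustrative; the uploaded file solves the problem with sklearn's `SVR`).

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.uniform(0, 5, size=(40, 1))
y = 2.0 * X[:, 0] + 1.0 + rng.normal(scale=0.3, size=40)

def primal_objective(beta, b, C=1.0, epsilon=0.2):
    residual = np.abs(y - (X @ beta + b))
    slack = np.maximum(0.0, residual - epsilon)   # amount outside the epsilon tube
    return 0.5 * beta @ beta + C * slack.sum()

print(primal_objective(np.array([2.0]), 1.0))    # near the true parameters
print(primal_objective(np.array([0.0]), 0.0))    # a flat but badly fitting model
```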
k_mean_clustering.py ADDED
@@ -0,0 +1,79 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVR
+ from sklearn.cluster import KMeans as KM
+ import matplotlib.pyplot as plt
+
+
+ def plot(X, clusters, centroids, x_feature, y_feature):
+     fig, ax = plt.subplots(figsize=(12, 8))
+
+     for i, index in enumerate(clusters):
+         x = X[index].T[x_feature-1]
+         y = X[index].T[y_feature-1]
+         point = np.array([x, y])
+         ax.scatter(*point)
+
+     for point in centroids:
+         x = point[x_feature-1]
+         y = point[y_feature-1]
+         points = np.array([x, y])
+         ax.scatter(*points, marker="o", linewidth=15)
+
+     return fig
+
+ class k_mean_clustering_st:
+     def __init__(self, database, test_size=0.2):
+         self.database = database
+         self.test_size = test_size
+         self.desc = r'''
+ # **K-Means Clustering**
+
+ The goal this time is to segment unlabeled data (**unsupervised learning**).
+
+ The method assigns each sample to a class based on its distance to the class centers.
+
+ **Iterative Optimization**
+
+ - Initialize the centroids randomly
+ - Iterate until convergence
+     - Update the class assignment of each sample using the closest centroid.
+     - Update the centroids (a centroid is the position of the center of a class).
+
+
+ **Distance between vectors**
+
+ $$
+ d(p, q) = \sqrt{\sum (p_{i} - q_{i})^{2}}
+ $$
+ '''
+         self.x_feature = 1
+         self.y_feature = 2
+         self.n_clusters = 3
+         self.max_iter = 150
+
+     def params(self):
+         self.n_features = int(self.database.data.shape[1])
+         self.n_clusters = st.slider('Number of clusters', 1, 10, 3)
+         self.max_iter = st.slider('Maximum number of iterations', 100, 200, 150)
+
+     def solve(self):
+         self.x_feature = st.slider('Feature on the x axis', 1, self.n_features, 1)
+         self.y_feature = st.slider('Feature on the y axis', 1, self.n_features, 2)
+         X = self.database.data
+         sklearn_clus = KM(n_clusters=self.n_clusters, max_iter=self.max_iter)
+         sklearn_clus.fit(X)
+         pred = sklearn_clus.predict(X)
+         classes = np.unique(pred)
+         clusters = [[] for i in classes]
+         for idx, value in enumerate(pred):
+             clusters[value].append(idx)
+
+         return plot(X=X,
+                     clusters=clusters,
+                     centroids=sklearn_clus.cluster_centers_,
+                     x_feature=self.x_feature,
+                     y_feature=self.y_feature)
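The assign-then-update loop described above fits in a dozen lines of NumPy; this sketch on assumed toy blobs is illustrative (the uploaded file uses sklearn's `KMeans`).

```python
import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(c, 0.3, (50, 2)) for c in ([0, 0], [3, 3], [0, 3])])

k = 3
centroids = X[rng.choice(len(X), k, replace=False)]   # random initialization

for _ in range(150):
    # assignment step: closest centroid by Euclidean distance
    dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    labels = dists.argmin(axis=1)
    # update step: each centroid becomes the mean of the points assigned to it
    new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
    if np.allclose(new_centroids, centroids):
        break
    centroids = new_centroids

print(np.round(centroids, 2))
```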