"""Streamlit app: visualisation and PCA-based exploration of student data.

Workflow, top to bottom (Streamlit re-runs the whole script on every
widget interaction):

1. The user uploads a pickled pandas DataFrame that contains a ``DNI``
   column.
2. Rows with missing values are dropped and every column with fewer than
   ``MAX_UNIQUE_VALUES`` distinct values is kept as a clustering feature.
3. The selected features are one-hot encoded, their correlation matrix is
   drawn, and they are standardised and projected onto two principal
   components.
4. The projection is filtered by ``ESTADO_INGRESANTE``, binned with a
   hexbin plot, and the densest hexagons are offered as "cases" whose
   members can be inspected with a parallel-categories plot.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split  # noqa: F401 (unused here, kept from the original file)
from sklearn.preprocessing import StandardScaler

# pandas 2.0 removed ``DataFrame.iteritems``; this alias restores it,
# presumably for a dependency that still calls it (TODO confirm which one).
pd.DataFrame.iteritems = pd.DataFrame.items

scaler = StandardScaler()

# A column is used as a clustering feature when it has fewer distinct
# values than this.
MAX_UNIQUE_VALUES = 25
# Number of densest hexagons listed as candidate patterns.
TOP_PATTERNS = 10


def _render_figure(fig):
    """Send a Matplotlib figure to Streamlit, then close it.

    Closing matters because the script is re-run on every interaction;
    figures that stay open accumulate in pyplot's global registry.
    """
    st.pyplot(fig)
    plt.close(fig)


st.title("Visualización y Clusterización automática de Data de Estudiantes")
st.write("Cargue el archivo PKL para visualizar el análisis de su contenido.")
uploaded_file = st.file_uploader("Cargar archivo: ", type='pkl')

# Nothing below makes sense until a file has been provided.
if uploaded_file is None:
    st.stop()

# SECURITY: unpickling executes arbitrary code embedded in the file.
# Only upload pickles from trusted sources; prefer CSV/Parquet if the data
# may come from third parties.
df = pd.read_pickle(uploaded_file)

if not isinstance(df, pd.DataFrame) or 'DNI' not in df.columns:
    st.error('El archivo debe contener un DataFrame con la columna DNI.')
    st.stop()

# Keep only complete rows and index them by DNI (the column itself is kept).
df_050 = df.dropna(axis=0).set_index('DNI', drop=False)

# Low-cardinality columns are treated as the categorical features.
col_selec = [
    col for col in df_050.columns
    if df_050[col].nunique() < MAX_UNIQUE_VALUES
]

st.write('Esta es la lista de variables que será usada para la clusterización.')
st.write(col_selec)

df_100 = df_050[col_selec]
# Float dummies so that both ``corr`` and the scaler receive numeric data
# regardless of the pandas version's default dummy dtype.
df_110 = pd.get_dummies(df_100, dtype=float)

# A 2-component PCA needs at least two samples and two features.
if df_110.shape[0] < 2 or df_110.shape[1] < 2:
    st.error('No hay suficientes datos o variables para aplicar PCA.')
    st.stop()

st.write('Esta es la matriz de correlación de todas las categorías')

corr_matrix = df_110.corr()
fig_corr, ax_corr = plt.subplots(figsize=(21, 21))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    annot_kws={'size': 5},
    ax=ax_corr,
)
ax_corr.set_title('Mapa de Calor de la Correlation de Variables')
_render_figure(fig_corr)

st.write('A continuación se va a hacer el clustering usando PCA.')

X_sc = scaler.fit_transform(df_110)
st.write('La forma de la data es: ', X_sc.shape)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_sc)

# ``assign`` returns a new frame, so the slice ``df_100`` is never mutated.
data_200 = df_100.assign(pca_1=X_pca[:, 0], pca_2=X_pca[:, 1])

fig_pca, ax_pca = plt.subplots(figsize=(8, 8))
ax_pca.scatter(data_200.pca_1, data_200.pca_2)
ax_pca.set_title('Diagrama de Dispersión PCA')
ax_pca.set_xlabel('Principal Component 1')
ax_pca.set_ylabel('Principal Component 2')
_render_figure(fig_pca)

st.write(data_200.columns)

INGRE = st.selectbox('Estado: ', ['Abandono', 'Activo'])

if 'ESTADO_INGRESANTE' not in data_200.columns:
    st.error('La columna ESTADO_INGRESANTE no está entre las variables seleccionadas.')
    st.stop()

data_210 = data_200[data_200['ESTADO_INGRESANTE'] == INGRE]

if data_210.empty:
    st.warning('No hay registros para el estado seleccionado.')
    st.stop()

fig_scatter = px.scatter(
    data_210, x='pca_1', y='pca_2', title='Distribución PCA',
    width=800, height=800,
)
st.plotly_chart(fig_scatter)

# The first positional argument of ``st.slider`` is the label; the minimum
# is 2 because hexbin derives the vertical bin count as int(gridsize / sqrt(3)),
# which would be 0 for gridsize 1.
GRIDSIZEX = st.slider('Tamaño de la grilla hexagonal: ', 2, 100, 10)

fig_hex, ax_hex = plt.subplots(figsize=(10, 8))
plt_extracto = ax_hex.hexbin(
    data_210.pca_1, data_210.pca_2, gridsize=GRIDSIZEX, cmap='inferno',
)
fig_hex.colorbar(plt_extracto, ax=ax_hex)
ax_hex.set_title('Hexbin Plot of PCA-Transformed Data')
ax_hex.set_xlabel('Principal Component 1')
ax_hex.set_ylabel('Principal Component 2')
_render_figure(fig_hex)

# Per-hexagon point counts, in the same order as the hexagon centres
# returned by ``get_offsets`` below.
densidades = np.asarray(plt_extracto.get_array(), dtype=float)

fig_hist, ax_hist = plt.subplots(figsize=(7, 4))
ax_hist.hist(densidades, bins=50, log=True)
ax_hist.set_xlabel('Cantidad de Ocurrencias')
ax_hist.set_ylabel('Frecuencia')
ax_hist.set_title('Histograma de Densidades')
_render_figure(fig_hist)

offsets_df = pd.DataFrame(plt_extracto.get_offsets(), columns=['col1', 'col2'])
st.write(offsets_df.shape)
offsets_df['densidad'] = densidades

# Densest hexagons first. ``head`` copes with grids that have fewer than
# TOP_PATTERNS cells, and the 1-based index makes "case k" the k-th densest
# hexagon, matching the selector below.
patrones_df = (
    offsets_df.sort_values(by='densidad', ascending=False)
    .head(TOP_PATTERNS)
    .reset_index(drop=True)
)
patrones_df.index = pd.RangeIndex(1, len(patrones_df) + 1)
st.write(patrones_df)

max_casos = len(patrones_df)
if max_casos > 1:
    NUM_CASOS = st.slider(
        "¿Cuántos casos elige explorar?", 1, max_casos, min(3, max_casos),
    )
else:
    # A slider needs a non-degenerate range.
    NUM_CASOS = max_casos
st.write('Usted ha elegido ', NUM_CASOS, 'casos.')

# Half the horizontal width of one hexagon for the grid actually drawn.
# The same value is used vertically, so the selection window is a square
# approximation of the hexagon.
radiohex = (data_210.pca_1.max() - data_210.pca_1.min()) / GRIDSIZEX / 2

CASOX = st.selectbox('Elija el caso: ', tuple(range(1, NUM_CASOS + 1)))

a, b = patrones_df.loc[CASOX, 'col1'], patrones_df.loc[CASOX, 'col2']
enfoqueX = data_210[
    (data_210.pca_1 > a - radiohex)
    & (data_210.pca_1 < a + radiohex)
    & (data_210.pca_2 > b - radiohex)
    & (data_210.pca_2 < b + radiohex)
]

st.write(enfoqueX.shape)
LISTA_SELEC = st.multiselect('Escoja la variable de color: ', list(enfoqueX.columns))
st.write(LISTA_SELEC)

# A parallel-categories plot needs at least one dimension and one record.
if LISTA_SELEC and not enfoqueX.empty:
    fig_parallel = px.parallel_categories(data_frame=enfoqueX[list(LISTA_SELEC)])
    st.plotly_chart(fig_parallel)
else:
    st.info('Seleccione al menos una variable para ver el gráfico de categorías paralelas.')