# File size: 5,271 Bytes
# c5ec08c
# pages/data_exploration.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from utils.data_processor import DataProcessor
from utils.visualizer import Visualizer
def app():
    """Render the "Data Exploration" page of the Streamlit app.

    Loads a dataset (the bundled CSV at ``data/creditcard.csv`` or, when that
    file is absent, a CSV uploaded by the user) and, once a DataFrame is
    available, renders in order: shape and overview, per-column information,
    statistical summary, class distribution, feature distributions,
    correlation matrix, transaction amount analysis, transaction time
    analysis, and the features most correlated with the ``Class`` column.

    Sections that need the ``Class``, ``Amount`` or ``Time`` columns are
    skipped (or replaced by a warning) when the column is missing.

    Returns:
        None. All output goes through Streamlit calls.
    """
    st.title("Data Exploration")

    # Initialize classes.
    # NOTE(review): data_processor is never used on this page; the call is kept
    # in case the DataProcessor constructor has side effects — confirm and drop.
    data_processor = DataProcessor()
    visualizer = Visualizer()

    df = _load_dataset()
    if df is None:
        # Nothing to explore until a dataset is available.
        return

    st.write(f"Dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

    _render_overview(df)
    _render_class_distribution(df, visualizer)
    _render_feature_distributions(df, visualizer)

    # Correlation matrix
    st.header("Correlation Matrix")
    _show_figure(visualizer.plot_correlation_matrix(df))

    _render_amount_analysis(df)
    _render_time_analysis(df)
    _render_fraud_feature_analysis(df)


# Locations used by the page; the upload is persisted next to the default file.
_DATA_DIR = "data"
_DEFAULT_DATA_PATH = "data/creditcard.csv"
_UPLOADED_DATA_PATH = "data/uploaded_data.csv"


def _show_figure(fig):
    """Render a matplotlib figure with Streamlit, then close it.

    Figures created through ``plt.subplots`` stay registered with pyplot until
    they are closed, so closing after rendering keeps them from accumulating.
    Assumes figures returned by the visualizer are matplotlib figures as well
    (they are passed to ``st.pyplot``) — TODO confirm.
    """
    st.pyplot(fig)
    plt.close(fig)


def _load_dataset() -> "pd.DataFrame | None":
    """Return the dataset to explore, or None when none is available yet.

    The default CSV is preferred. When it does not exist, a warning and a file
    uploader are shown; an uploaded CSV is parsed, saved to
    ``data/uploaded_data.csv`` and returned.
    """

    @st.cache_data
    def read_default_csv(path):
        # Only the successful read is cached. The existence check stays outside
        # the cache so a file added later is picked up, and a "missing" result
        # is never cached.
        return pd.read_csv(path)

    # Check if data exists in the data directory
    if os.path.exists(_DEFAULT_DATA_PATH):
        return read_default_csv(_DEFAULT_DATA_PATH)

    st.warning("Default dataset not found. Please upload a dataset.")
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
    if uploaded_file is None:
        return None

    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # A malformed upload should produce a message, not a stack trace.
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return None

    # The default file is missing on this path, so the directory may be too;
    # to_csv does not create parent directories.
    os.makedirs(_DATA_DIR, exist_ok=True)
    df.to_csv(_UPLOADED_DATA_PATH, index=False)
    return df


def _render_overview(df: pd.DataFrame) -> None:
    """Show the head of the data, a per-column info table and describe()."""
    # Data overview
    st.header("Data Overview")
    st.write(df.head())

    # Data information
    st.header("Data Information")
    column_info = pd.DataFrame({
        'Column': df.columns,
        # Stored as strings so the table holds plain text instead of numpy
        # dtype objects when it is handed to st.write.
        'Type': df.dtypes.astype(str),
        'Non-Null Count': df.count(),
        'Null Count': df.isnull().sum(),
        'Unique Values': df.nunique(),
    })
    st.write(column_info)

    # Statistical summary
    st.header("Statistical Summary")
    st.write(df.describe())


def _render_class_distribution(df: pd.DataFrame, visualizer) -> None:
    """Show the class distribution plot and the share of fraud rows."""
    st.header("Class Distribution")
    if 'Class' not in df.columns:
        st.warning("No 'Class' column found in the dataset. Please ensure your target variable is named 'Class'.")
        return

    _show_figure(visualizer.plot_class_distribution(df))

    # Calculate fraud percentage.
    # NOTE(review): the mean equals the fraud share only if Class is coded
    # 0 (legitimate) / 1 (fraud) — confirm against the dataset.
    fraud_percentage = df['Class'].mean() * 100
    st.write(f"Fraud transactions: {fraud_percentage:.2f}% of the dataset")


def _render_feature_distributions(df: pd.DataFrame, visualizer) -> None:
    """Let the user pick how many feature distributions to plot."""
    st.header("Feature Distributions")

    # One column is left out of the count, as in the original upper bound
    # (len(df.columns) - 1); the choice is capped at 10.
    max_features = min(10, len(df.columns) - 1)
    if max_features < 1:
        st.info("Not enough columns to plot feature distributions.")
        return

    if max_features > 1:
        # Clamp the default so it never exceeds the number of available
        # features when the dataset has fewer than six columns.
        num_features = st.slider(
            "Number of features to display", 1, max_features, min(5, max_features)
        )
    else:
        # A slider with a single possible value offers no choice.
        num_features = 1

    _show_figure(visualizer.plot_feature_distributions(df, n_features=num_features))


def _render_amount_analysis(df: pd.DataFrame) -> None:
    """Plot the 'Amount' histogram and, if 'Class' exists, a per-class boxplot."""
    if 'Amount' not in df.columns:
        return

    st.header("Transaction Amount Analysis")
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Amount Distribution")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(data=df, x='Amount', bins=50, kde=True, ax=ax)
        _show_figure(fig)

    with col2:
        if 'Class' in df.columns:
            st.subheader("Amount by Class")
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
            _show_figure(fig)


def _render_time_analysis(df: pd.DataFrame) -> None:
    """Plot transactions per hour of day, split by class when available."""
    if 'Time' not in df.columns:
        return

    st.header("Transaction Time Analysis")

    # Convert time to hours. Only the columns needed for the plot are put in
    # the helper frame instead of copying the whole dataset.
    # NOTE(review): the / 3600 conversion presumes 'Time' is in seconds — confirm.
    hourly = pd.DataFrame({'Hour': (df['Time'] / 3600) % 24})
    has_class = 'Class' in df.columns
    if has_class:
        hourly['Class'] = df['Class']

    fig, ax = plt.subplots(figsize=(12, 6))
    if has_class:
        sns.histplot(data=hourly, x='Hour', hue='Class', bins=24, kde=True, ax=ax)
    else:
        sns.histplot(data=hourly, x='Hour', bins=24, kde=True, ax=ax)

    # Label the axes object directly rather than relying on pyplot's notion of
    # the current figure.
    ax.set_title('Transaction Distribution by Hour of Day')
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Number of Transactions')
    _show_figure(fig)


def _render_fraud_feature_analysis(df: pd.DataFrame) -> None:
    """Show the features most positively correlated with 'Class' and plot them."""
    if 'Class' not in df.columns:
        return

    st.header("Feature Analysis for Fraud Detection")

    # Correlation is only defined for numeric data; restricting the frame keeps
    # text columns in an uploaded CSV from breaking the computation.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if 'Class' not in numeric_df.columns:
        st.warning("The 'Class' column is not numeric, so correlations with it cannot be computed.")
        return

    # Select top features correlated with fraud. 'Class' itself is removed by
    # label rather than by position, and undefined correlations (e.g. constant
    # columns) are dropped before ranking.
    corr_with_fraud = (
        numeric_df.corr()['Class']
        .drop('Class')
        .dropna()
        .sort_values(ascending=False)
    )
    top_features = corr_with_fraud.head(5).index.tolist()

    st.subheader("Top Features Correlated with Fraud")
    st.write(corr_with_fraud.head(10))  # Show top 10 correlations

    # Plot distributions of top features by fraud/non-fraud
    st.subheader("Distributions of Top Features by Class")
    for feature in top_features:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(data=df, x=feature, hue='Class', bins=50, kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature} by Class')
        _show_figure(fig)
# Allow the page to be run directly as a script as well as imported.
if __name__ == "__main__":
    app()