File size: 5,271 Bytes
c5ec08c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# pages/data_exploration.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from utils.data_processor import DataProcessor
from utils.visualizer import Visualizer

def _show_figure(fig):
    """Render a matplotlib figure in Streamlit, then close it.

    Figures created through ``pyplot`` stay registered with matplotlib until
    they are explicitly closed. Streamlit re-runs the page on every
    interaction, so closing after rendering keeps figures from piling up.
    """
    st.pyplot(fig)
    plt.close(fig)


def app():
    """Render the "Data Exploration" page.

    Loads ``data/creditcard.csv`` when it exists; otherwise asks the user to
    upload a CSV (which is also saved to ``data/uploaded_data.csv``). For the
    loaded frame the page shows: shape, head, per-column info, ``describe()``,
    class distribution, feature distributions, a correlation matrix, and --
    when the corresponding columns exist -- analyses of ``Amount``, ``Time``
    and the features most correlated with ``Class``.

    The ``Visualizer`` methods are assumed to return matplotlib figures, since
    their results are passed straight to ``st.pyplot``.
    """
    st.title("Data Exploration")

    # Initialize classes
    # NOTE(review): data_processor is not used on this page; the construction
    # is kept in case DataProcessor() has side effects -- confirm and remove.
    data_processor = DataProcessor()
    visualizer = Visualizer()

    default_path = "data/creditcard.csv"
    upload_path = "data/uploaded_data.csv"

    # Only the successful read is cached. Caching the "file missing" outcome
    # would keep returning None even after the dataset is added later.
    @st.cache_data
    def load_data(path):
        return pd.read_csv(path)

    # Load data
    df = None
    if os.path.exists(default_path):
        df = load_data(default_path)
    else:
        st.warning("Default dataset not found. Please upload a dataset.")
        uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
            # The data directory may not exist yet (that is a likely reason
            # the default dataset was not found), so create it before saving.
            os.makedirs(os.path.dirname(upload_path), exist_ok=True)
            df.to_csv(upload_path, index=False)

    if df is None:
        return

    has_class = 'Class' in df.columns

    st.write(f"Dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

    # Data overview
    st.header("Data Overview")
    st.write(df.head())

    # Data information
    st.header("Data Information")
    info = pd.DataFrame({
        'Column': df.columns,
        # Plain strings instead of dtype objects so the table displays cleanly.
        'Type': df.dtypes.astype(str),
        'Non-Null Count': df.count(),
        'Null Count': df.isnull().sum(),
        'Unique Values': df.nunique(),
    })
    st.write(info)

    # Statistical summary
    st.header("Statistical Summary")
    st.write(df.describe())

    # Class distribution
    st.header("Class Distribution")
    if has_class:
        _show_figure(visualizer.plot_class_distribution(df))

        # Calculate fraud percentage
        # assumes Class is encoded 0/1 so the mean is the fraud share -- TODO confirm
        fraud_percentage = df['Class'].mean() * 100
        st.write(f"Fraud transactions: {fraud_percentage:.2f}% of the dataset")
    else:
        st.warning("No 'Class' column found in the dataset. Please ensure your target variable is named 'Class'.")

    # Feature distributions
    st.header("Feature Distributions")
    max_features = min(10, len(df.columns) - 1)
    if max_features < 1:
        st.info("Not enough columns to plot feature distributions.")
    else:
        if max_features > 1:
            # The default must lie inside [1, max_features]; a fixed default
            # of 5 is out of range for datasets with fewer than 6 columns.
            num_features = st.slider(
                "Number of features to display",
                1,
                max_features,
                min(5, max_features),
            )
        else:
            # A slider with a single possible value offers no choice.
            num_features = 1
        _show_figure(
            visualizer.plot_feature_distributions(df, n_features=num_features)
        )

    # Correlation matrix
    st.header("Correlation Matrix")
    _show_figure(visualizer.plot_correlation_matrix(df))

    # Transaction amount analysis
    if 'Amount' in df.columns:
        st.header("Transaction Amount Analysis")

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Amount Distribution")
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.histplot(data=df, x='Amount', bins=50, kde=True, ax=ax)
            _show_figure(fig)

        with col2:
            if has_class:
                st.subheader("Amount by Class")
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
                _show_figure(fig)

    # Time analysis
    if 'Time' in df.columns:
        st.header("Transaction Time Analysis")

        # Convert time to hours
        # assumes Time is in seconds (divided by 3600) -- TODO confirm.
        # Only the columns needed for the plot are copied, not the whole frame.
        time_columns = ['Time', 'Class'] if has_class else ['Time']
        df_time = df[time_columns].assign(Hour=(df['Time'] / 3600) % 24)

        fig, ax = plt.subplots(figsize=(12, 6))
        if has_class:
            sns.histplot(data=df_time, x='Hour', hue='Class', bins=24, kde=True, ax=ax)
        else:
            sns.histplot(data=df_time, x='Hour', bins=24, kde=True, ax=ax)
        # Label the axes object directly rather than pyplot's "current axes".
        ax.set_title('Transaction Distribution by Hour of Day')
        ax.set_xlabel('Hour of Day')
        ax.set_ylabel('Number of Transactions')
        _show_figure(fig)

    # Feature analysis for fraud detection
    if has_class:
        st.header("Feature Analysis for Fraud Detection")

        # Correlation is only defined for numeric data; uploaded CSVs may
        # contain text columns, which make DataFrame.corr() raise on recent
        # pandas versions.
        numeric_df = df.select_dtypes(include=["number", "bool"])
        if 'Class' not in numeric_df.columns:
            st.warning("'Class' column is not numeric; skipping correlation analysis.")
        else:
            # Select top features correlated with fraud.
            # 'Class' is dropped by label (not by position) and undefined
            # correlations (e.g. constant columns) are discarded.
            corr_with_fraud = (
                numeric_df.corr()['Class']
                .drop('Class')
                .dropna()
                .sort_values(ascending=False)
            )
            top_features = corr_with_fraud.head(5).index.tolist()

            st.subheader("Top Features Correlated with Fraud")
            st.write(corr_with_fraud.head(10))  # Show top 10 correlations

            # Plot distributions of top features by fraud/non-fraud
            st.subheader("Distributions of Top Features by Class")
            for feature in top_features:
                fig, ax = plt.subplots(figsize=(10, 6))
                sns.histplot(data=df, x=feature, hue='Class', bins=50, kde=True, ax=ax)
                ax.set_title(f'Distribution of {feature} by Class')
                _show_figure(fig)

# Render the page when this file is executed directly as a script
# (nothing runs automatically when the module is merely imported).
if __name__ == "__main__":
    app()