Spaces:

Abs6187
/

Fraud_Detection_API_excecute4_Part2

Sleeping

App Files Files Community

Fraud_Detection_API_excecute4_Part2 / data_exploration.py

Abs6187

Upload 12 files

c5ec08c verified 6 months ago

raw

history blame contribute delete

5.27 kB

	# pages/data_exploration.py
	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import os
	from utils.data_processor import DataProcessor
	from utils.visualizer import Visualizer

	def app():
	st.title("Data Exploration")

	# Initialize classes
	data_processor = DataProcessor()
	visualizer = Visualizer()

	# Load data function
	@st.cache_data
	def load_data():
	# Check if data exists in the data directory
	data_path = "data/creditcard.csv"
	if os.path.exists(data_path):
	return pd.read_csv(data_path)
	else:
	st.warning("Default dataset not found. Please upload a dataset.")
	return None

	# Load data
	df = load_data()
	if df is None:
	uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
	if uploaded_file is not None:
	df = pd.read_csv(uploaded_file)
	df.to_csv("data/uploaded_data.csv", index=False)

	if df is not None:
	st.write(f"Dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

	# Data overview
	st.header("Data Overview")
	st.write(df.head())

	# Data information
	st.header("Data Information")
	buffer = pd.DataFrame({
	'Column': df.columns,
	'Type': df.dtypes,
	'Non-Null Count': df.count(),
	'Null Count': df.isnull().sum(),
	'Unique Values': [df[col].nunique() for col in df.columns]
	})
	st.write(buffer)

	# Statistical summary
	st.header("Statistical Summary")
	st.write(df.describe())

	# Class distribution
	st.header("Class Distribution")
	if 'Class' in df.columns:
	fig = visualizer.plot_class_distribution(df)
	st.pyplot(fig)

	# Calculate fraud percentage
	fraud_percentage = df['Class'].mean() * 100
	st.write(f"Fraud transactions: {fraud_percentage:.2f}% of the dataset")
	else:
	st.warning("No 'Class' column found in the dataset. Please ensure your target variable is named 'Class'.")

	# Feature distributions
	st.header("Feature Distributions")
	num_features = st.slider("Number of features to display", 1, min(10, len(df.columns)-1), 5)
	fig = visualizer.plot_feature_distributions(df, n_features=num_features)
	st.pyplot(fig)

	# Correlation matrix
	st.header("Correlation Matrix")
	fig = visualizer.plot_correlation_matrix(df)
	st.pyplot(fig)

	# Transaction amount analysis
	if 'Amount' in df.columns:
	st.header("Transaction Amount Analysis")

	col1, col2 = st.columns(2)

	with col1:
	st.subheader("Amount Distribution")
	fig, ax = plt.subplots(figsize=(10, 6))
	sns.histplot(data=df, x='Amount', bins=50, kde=True, ax=ax)
	st.pyplot(fig)

	with col2:
	if 'Class' in df.columns:
	st.subheader("Amount by Class")
	fig, ax = plt.subplots(figsize=(10, 6))
	sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
	st.pyplot(fig)

	# Time analysis
	if 'Time' in df.columns:
	st.header("Transaction Time Analysis")

	# Convert time to hours
	df_time = df.copy()
	df_time['Hour'] = (df_time['Time'] / 3600) % 24

	fig, ax = plt.subplots(figsize=(12, 6))
	if 'Class' in df.columns:
	sns.histplot(data=df_time, x='Hour', hue='Class', bins=24, kde=True, ax=ax)
	else:
	sns.histplot(data=df_time, x='Hour', bins=24, kde=True, ax=ax)
	plt.title('Transaction Distribution by Hour of Day')
	plt.xlabel('Hour of Day')
	plt.ylabel('Number of Transactions')
	st.pyplot(fig)

	# Feature analysis for fraud detection
	if 'Class' in df.columns:
	st.header("Feature Analysis for Fraud Detection")

	# Select top features correlated with fraud
	corr_with_fraud = df.corr()['Class'].sort_values(ascending=False)
	top_features = corr_with_fraud[1:6].index.tolist() # Skip Class itself

	st.subheader("Top Features Correlated with Fraud")
	st.write(corr_with_fraud[1:11]) # Show top 10 correlations

	# Plot distributions of top features by fraud/non-fraud
	st.subheader("Distributions of Top Features by Class")
	for feature in top_features:
	fig, ax = plt.subplots(figsize=(10, 6))
	sns.histplot(data=df, x=feature, hue='Class', bins=50, kde=True, ax=ax)
	plt.title(f'Distribution of {feature} by Class')
	st.pyplot(fig)

	if __name__ == "__main__":
	app()