import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import __version__ as sklearn_version
from packaging import version


class DataProcessor:
    def __init__(self):
        self.scaler = StandardScaler()
        # Handle the sparse -> sparse_output rename across scikit-learn versions
        if version.parse(sklearn_version) >= version.parse('1.2.0'):
            self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        else:
            self.encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    def load_data(self, file_path):
        """Load the dataset from a CSV file."""
        try:
            df = pd.read_csv(file_path)
            return df
        except Exception as e:
            print(f"Error loading data: {e}")
            return None

    def preprocess_data(self, df, target_col='Class'):
        """Preprocess the data for model training."""
        # Impute missing values in numeric columns with the column mean;
        # numeric_only avoids a TypeError when object columns are present
        df = df.fillna(df.mean(numeric_only=True))

        # Separate features and target
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Split data into stratified train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Identify numerical and categorical features
        num_features = X.select_dtypes(include=['int64', 'float64']).columns
        cat_features = X.select_dtypes(include=['object', 'category']).columns

        # Build the preprocessing pipeline; the encoder created in __init__
        # already accounts for the scikit-learn version, so no second
        # version check is needed here
        transformers = [('num', self.scaler, num_features)]
        if len(cat_features) > 0:
            transformers.append(('cat', self.encoder, cat_features))
        preprocessor = ColumnTransformer(transformers=transformers)

        # Fit on the training data only, then transform both splits
        X_train_processed = preprocessor.fit_transform(X_train)
        X_test_processed = preprocessor.transform(X_test)

        # Handle class imbalance by oversampling the minority class with SMOTE,
        # applied to the training set only (after the split) to avoid leakage
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(
            X_train_processed, y_train
        )

        return X_train_resampled, X_test_processed, y_train_resampled, y_test, preprocessor

    def engineer_features(self, df):
        """Create new features for fraud detection."""
        # Copy the dataframe to avoid modifying the original
        df_new = df.copy()

        # If a Time column exists, create time-based features
        if 'Time' in df_new.columns:
            # Convert seconds to hour of the day (assuming Time is in seconds
            # from a reference point)
            df_new['Hour'] = (df_new['Time'] / 3600) % 24
            # Flag transactions during odd hours (midnight to 5 AM)
            df_new['Odd_Hour'] = ((df_new['Hour'] >= 0) & (df_new['Hour'] < 5)).astype(int)

        # If an Amount column exists, create amount-based features
        if 'Amount' in df_new.columns:
            # Log transform to tame the skewed amount distribution
            df_new['Log_Amount'] = np.log1p(df_new['Amount'])
            # Flag high-value transactions (top 5%)
            threshold = df_new['Amount'].quantile(0.95)
            df_new['High_Value'] = (df_new['Amount'] > threshold).astype(int)

        # Transaction-frequency features, assuming a card/account identifier
        if 'card_id' in df_new.columns:
            # Number of transactions per card
            tx_count = df_new.groupby('card_id').size().reset_index(name='Tx_Count')
            df_new = df_new.merge(tx_count, on='card_id', how='left')

            # Per-card amount features additionally require the Amount column
            if 'Amount' in df_new.columns:
                # Average transaction amount per card
                avg_amount = (
                    df_new.groupby('card_id')['Amount']
                    .mean()
                    .reset_index(name='Avg_Amount')
                )
                df_new = df_new.merge(avg_amount, on='card_id', how='left')
                # Deviation of each transaction from the card's average
                df_new['Amount_Deviation'] = df_new['Amount'] - df_new['Avg_Amount']

        return df_new
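

# --- Example usage (illustrative sketch, not part of the class) ---
# The file path 'creditcard.csv' and the column names 'Class', 'Time', and
# 'Amount' are assumptions for demonstration (matching the common Kaggle
# credit-card-fraud layout); adjust them to your own dataset. Feature
# engineering runs before preprocessing so the derived columns are scaled
# along with the originals.
if __name__ == "__main__":
    processor = DataProcessor()

    # Load the raw transactions; load_data returns None on failure
    df = processor.load_data("creditcard.csv")  # hypothetical path
    if df is not None:
        # Derive time- and amount-based features
        df = processor.engineer_features(df)

        # Split, scale/encode, and rebalance the training set with SMOTE
        X_train, X_test, y_train, y_test, preprocessor = processor.preprocess_data(
            df, target_col="Class"
        )
        print(f"Training samples after SMOTE: {X_train.shape[0]}")
        print(f"Class counts after SMOTE: {np.bincount(np.asarray(y_train).astype(int))}")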