import pandas as pd
from sklearn.model_selection import train_test_split
def load_data(file_path):
    """Load data from a CSV file.

    Args:
        file_path: Location of the CSV data. It is passed unchanged to
            ``pandas.read_csv``, so anything that function accepts (a path
            string, a path object, an open file-like object) works here.

    Returns:
        pandas.DataFrame: The parsed contents, read with pandas' default
        CSV options.
    """
    return pd.read_csv(file_path)
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the dataset by handling missing values and duplicates.

    Rows are filtered in two passes: first every row that contains a missing
    value is dropped, then exact duplicate rows are dropped from what remains.

    Args:
        df: The frame to clean. It is not modified; both pandas calls return
            new objects.

    Returns:
        pandas.DataFrame: The surviving rows. The index is not reset, so the
        remaining rows keep their original index labels.
    """
    # Missing values go first so that duplicate detection only compares
    # complete rows.
    df = df.dropna()
    df = df.drop_duplicates()
    return df
def preprocess_data(df: pd.DataFrame, target_column):
    """Preprocess the data by splitting into features and target.

    Args:
        df: Frame holding both the feature columns and the target column.
        target_column: Label of the column to use as the target.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``df`` without ``target_column`` and
        ``y`` is the ``target_column`` column itself.

    Raises:
        KeyError: Raised by pandas if ``target_column`` is not a column of
            ``df``.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]
    return X, y
def split_data(X, y, test_size=0.2, random_state=42):
    """Split the data into training and testing sets.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Args:
        X: Feature data.
        y: Target data, aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to ``0.2``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible; defaults to ``42``.

    Returns:
        The result of ``train_test_split(X, y, ...)`` unchanged, i.e. the
        train/test partitions of ``X`` followed by those of ``y``
        (``X_train, X_test, y_train, y_test``).
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )