import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Location of the CSV data. It is handed to
            ``pd.read_csv`` unchanged, so anything that function accepts
            (a path string, a file-like object, ...) works here.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

def clean_data(df):
    """Return a copy of ``df`` without incomplete or repeated rows.

    Args:
        df: The DataFrame to clean. It is not modified; both steps
            return new objects.

    Returns:
        A DataFrame from which ``dropna()`` has removed rows containing
        missing values and ``drop_duplicates()`` has then removed
        repeated rows. Surviving rows keep their original index labels.
    """
    # Order matters only for efficiency: missing-value rows are discarded
    # first, then the remaining rows are de-duplicated.
    return df.dropna().drop_duplicates()

def preprocess_data(df, target_column):
    """Separate a DataFrame into a feature table and a target column.

    Args:
        df: The full dataset, including the target column.
        target_column: Label of the column to predict.

    Returns:
        A ``(features, target)`` tuple: ``features`` is ``df`` without
        ``target_column``; ``target`` is ``df[target_column]``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    features = df.drop(columns=[target_column])
    target = df[target_column]
    return features, target

def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target into train and test subsets.

    Args:
        X: Feature data.
        y: Target data aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Forwarded to ``train_test_split``; defaults to 42.

    Returns:
        The result of ``train_test_split(X, y, ...)`` unchanged. Per the
        scikit-learn documentation this is
        ``[X_train, X_test, y_train, y_test]``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return partitions