# test_data_quality.py
import os

import pandas as pd
import pytest

# --- Configuration ---
PROCESSED_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
EXPECTED_COLUMNS = ['question', 'answer', 'source', 'licence']
EXPECTED_SOURCES = [
    'BoltMonkey/psychology-question-answer',
    'Gragroo/psychology-question-answer_psygpt_with_validation',
    'PsychoLexQA',
    'MMLU/professional_psychology',
    'MMLU/high_school_psychology',
]
MIN_QUESTION_LENGTH = 10    # A reasonable minimum length for a question
MAX_QUESTION_LENGTH = 1500  # A reasonable maximum


@pytest.fixture(scope="module")
def data():
    """A pytest fixture to load the main dataset once for all tests."""
    if not os.path.exists(PROCESSED_DATA_FILE):
        pytest.fail(
            f"FATAL: Processed data file not found at {PROCESSED_DATA_FILE}. "
            "Run normalize_psych_data.py first."
        )
    return pd.read_parquet(PROCESSED_DATA_FILE)


# --- Test Cases ---

def test_file_exists():
    """Test 1: Ensures the processed data file was actually created."""
    assert os.path.exists(PROCESSED_DATA_FILE), "The final processed parquet file is missing."


def test_schema_is_correct(data):
    """Test 2: Validates that all expected columns are present."""
    for col in EXPECTED_COLUMNS:
        assert col in data.columns, f"Missing expected column: '{col}'"


def test_no_missing_critical_data(data):
    """Test 3: Ensures there are no nulls in the core 'question' and 'answer' fields."""
    assert data['question'].isnull().sum() == 0, "There are missing values in the 'question' column."
    assert data['answer'].isnull().sum() == 0, "There are missing values in the 'answer' column."


def test_content_plausibility(data):
    """Test 4: Checks if the data content is reasonable (e.g., not too short)."""
    shortest_question = data['question'].str.len().min()
    assert shortest_question >= MIN_QUESTION_LENGTH, (
        f"Found a question with length {shortest_question}, which is shorter than "
        f"the minimum threshold of {MIN_QUESTION_LENGTH}."
    )
    longest_question = data['question'].str.len().max()
    assert longest_question <= MAX_QUESTION_LENGTH, (
        f"Found a question with length {longest_question}, which is longer than "
        f"the maximum threshold of {MAX_QUESTION_LENGTH}."
    )


def test_source_column_is_valid(data):
    """Test 5: Checks if the 'source' column contains only known, expected values."""
    unexpected_sources = set(data['source'].unique()) - set(EXPECTED_SOURCES)
    assert not unexpected_sources, f"Found unexpected data sources: {unexpected_sources}"
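

# A minimal convenience entry point (an assumption, not part of the original file):
# it lets the module be executed directly with `python test_data_quality.py` and
# simply delegates to pytest, equivalent to running `pytest test_data_quality.py -v`.
if __name__ == "__main__":
    import sys

    # pytest.main accepts a list of CLI arguments and returns an exit code.
    sys.exit(pytest.main([__file__, "-v"]))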