# test_data_quality.py
import os

import pandas as pd
import pytest

# --- Configuration ---
PROCESSED_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
EXPECTED_COLUMNS = ['question', 'answer', 'source', 'licence']
EXPECTED_SOURCES = [
    'BoltMonkey/psychology-question-answer',
    'Gragroo/psychology-question-answer_psygpt_with_validation',
    'PsychoLexQA',
    'MMLU/professional_psychology',
    'MMLU/high_school_psychology',
]
MIN_QUESTION_LENGTH = 10    # A reasonable minimum length for a question
MAX_QUESTION_LENGTH = 1500  # A reasonable maximum


@pytest.fixture(scope="module")
def data():
    """A pytest fixture to load the main dataset once for all tests."""
    if not os.path.exists(PROCESSED_DATA_FILE):
        pytest.fail(
            f"FATAL: Processed data file not found at {PROCESSED_DATA_FILE}. "
            "Run normalize_psych_data.py first."
        )
    return pd.read_parquet(PROCESSED_DATA_FILE)


# --- Test Cases ---

def test_file_exists():
    """Test 1: Ensures the processed data file was actually created."""
    assert os.path.exists(PROCESSED_DATA_FILE), "The final processed parquet file is missing."


def test_schema_is_correct(data):
    """Test 2: Validates that all expected columns are present."""
    for col in EXPECTED_COLUMNS:
        assert col in data.columns, f"Missing expected column: '{col}'"


def test_no_missing_critical_data(data):
    """Test 3: Ensures there are no nulls in the core 'question' and 'answer' fields."""
    assert data['question'].isnull().sum() == 0, "There are missing values in the 'question' column."
    assert data['answer'].isnull().sum() == 0, "There are missing values in the 'answer' column."


def test_content_plausibility(data):
    """Test 4: Checks if the data content is reasonable (e.g., not too short)."""
    shortest_question = data['question'].str.len().min()
    assert shortest_question >= MIN_QUESTION_LENGTH, (
        f"Found a question with length {shortest_question}, which is shorter than "
        f"the minimum threshold of {MIN_QUESTION_LENGTH}."
    )
    longest_question = data['question'].str.len().max()
    assert longest_question <= MAX_QUESTION_LENGTH, (
        f"Found a question with length {longest_question}, which is longer than "
        f"the maximum threshold of {MAX_QUESTION_LENGTH}."
    )


def test_source_column_is_valid(data):
    """Test 5: Checks if the 'source' column contains only known, expected values."""
    unexpected_sources = set(data['source'].unique()) - set(EXPECTED_SOURCES)
    assert not unexpected_sources, f"Found unexpected data sources: {unexpected_sources}"
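

# A minimal convenience entry point (an assumption, not part of the original file):
# it lets the module be executed directly with `python test_data_quality.py` and
# simply delegates to pytest, equivalent to running `pytest test_data_quality.py -v`.
if __name__ == "__main__":
    import sys

    # pytest.main accepts a list of CLI arguments and returns an exit code.
    sys.exit(pytest.main([__file__, "-v"]))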