psychology-tutor-engine / investigate_data.py
adfras's picture
Initial commit: Psychology tutor engine and data pipelines
1da14e1
# investigate_data.py
# A robust script to find and analyze rows in the dataset based on column length.
import pandas as pd
import argparse
import os
# --- Configuration ---
DEFAULT_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
DEFAULT_COLUMN = "question"
DEFAULT_THRESHOLD = 10
def investigate_column_length(file_path: str, column_name: str, threshold: int, comparison: str):
"""
Loads a parquet file and prints rows where the length of a specified column
is less than or greater than a given threshold.
Args:
file_path (str): The path to the parquet data file.
column_name (str): The name of the column to investigate.
threshold (int): The length threshold to check against.
comparison (str): Either 'less' or 'greater'.
"""
# --- 1. Input Validation ---
if not os.path.exists(file_path):
print(f"Error: Data file not found at '{file_path}'")
return
if comparison not in ['less', 'greater']:
print(f"Error: Invalid comparison type '{comparison}'. Must be 'less' or 'greater'.")
return
print(f"--- Starting Investigation ---")
print(f"File: {file_path}")
print(f"Column: {column_name}")
print(f"Threshold: {threshold}")
print(f"Condition: Length is {comparison} than {threshold}")
print("----------------------------\n")
# --- 2. Data Loading and Analysis ---
try:
df = pd.read_parquet(file_path)
if column_name not in df.columns:
print(f"Error: Column '{column_name}' not found in the dataset.")
print(f"Available columns are: {list(df.columns)}")
return
# Ensure the column is of string type for .str accessor
df[column_name] = df[column_name].astype(str)
# Apply the filter based on the comparison type
if comparison == 'less':
filtered_df = df[df[column_name].str.len() < threshold]
else: # comparison == 'greater'
filtered_df = df[df[column_name].str.len() > threshold]
# --- 3. Reporting Results ---
num_found = len(filtered_df)
if num_found > 0:
print(f"SUCCESS: Found {num_found} rows meeting the condition.\n")
# Display relevant columns for context
display_columns = [column_name, 'source', 'answer']
# Ensure display columns exist before trying to show them
valid_display_columns = [col for col in display_columns if col in df.columns]
with pd.option_context('display.max_rows', None, 'display.max_colwidth', 100):
print(filtered_df[valid_display_columns])
else:
print("SUCCESS: Found 0 rows meeting the specified condition.")
except Exception as e:
print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
# --- 4. Command-Line Interface (CLI) Setup ---
parser = argparse.ArgumentParser(
description="Investigate a dataset by finding rows with specific column lengths.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter # Shows default values in help message
)
parser.add_argument(
"--file",
default=DEFAULT_DATA_FILE,
help="Path to the .parquet data file to investigate."
)
parser.add_argument(
"--column",
default=DEFAULT_COLUMN,
help="The column whose length you want to check."
)
parser.add_argument(
"--threshold",
type=int,
default=DEFAULT_THRESHOLD,
help="The character length threshold."
)
parser.add_argument(
"--comparison",
choices=['less', 'greater'],
default='less',
help="Set to 'less' to find rows shorter than the threshold, or 'greater' for longer."
)
args = parser.parse_args()
investigate_column_length(
file_path=args.file,
column_name=args.column,
threshold=args.threshold,
comparison=args.comparison
)