Spaces:

adfras
/

psychology-tutor-engine

Runtime error

App Files Files Community

psychology-tutor-engine / investigate_data.py

adfras

Initial commit: Psychology tutor engine and data pipelines

1da14e1 3 months ago

raw

history blame contribute delete

4.06 kB

	# investigate_data.py
	# A robust script to find and analyze rows in the dataset based on column length.

	import pandas as pd
	import argparse
	import os

	# --- Configuration ---
	# Fallback values for the command-line options defined in the __main__ block.
	# Parquet file that is inspected when --file is not supplied.
	DEFAULT_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
	# Column whose value length is checked when --column is not supplied.
	DEFAULT_COLUMN = "question"
	# Character-length threshold used when --threshold is not supplied.
	DEFAULT_THRESHOLD = 10

	def investigate_column_length(file_path: str, column_name: str, threshold: int, comparison: str) -> None:
	    """
	    Print the rows of a parquet file whose column length is below or above a threshold.

	    Lengths are measured on the string form of each value. Missing values
	    (None/NaN) count as length 0 rather than being measured as the text
	    "None" or "nan", so empty entries are reported as short instead of as
	    4- or 3-character strings.

	    Problems (missing file, invalid comparison, unknown column, read errors)
	    are reported on stdout and the function returns without raising.

	    Args:
	        file_path (str): The path to the parquet data file.
	        column_name (str): The name of the column to investigate.
	        threshold (int): The length threshold to check against (strict comparison).
	        comparison (str): Either 'less' or 'greater'.

	    Returns:
	        None. All results are printed.
	    """
	    # --- 1. Input Validation ---
	    if not os.path.exists(file_path):
	        print(f"Error: Data file not found at '{file_path}'")
	        return

	    if comparison not in ('less', 'greater'):
	        print(f"Error: Invalid comparison type '{comparison}'. Must be 'less' or 'greater'.")
	        return

	    print("--- Starting Investigation ---")
	    print(f"File: {file_path}")
	    print(f"Column: {column_name}")
	    print(f"Threshold: {threshold}")
	    print(f"Condition: Length is {comparison} than {threshold}")
	    print("----------------------------\n")

	    # --- 2. Data Loading and Analysis ---
	    # This is the script's reporting boundary: any failure while reading or
	    # filtering is printed instead of surfacing as a traceback.
	    try:
	        df = pd.read_parquet(file_path)

	        if column_name not in df.columns:
	            print(f"Error: Column '{column_name}' not found in the dataset.")
	            print(f"Available columns are: {list(df.columns)}")
	            return

	        # Measure lengths on a separate Series so the loaded data is not
	        # overwritten with stringified values, and force missing entries to
	        # length 0 (astype(str) alone would measure them as "None"/"nan").
	        values = df[column_name]
	        lengths = values.astype(str).str.len().mask(values.isna(), 0)

	        # Apply the filter based on the comparison type
	        if comparison == 'less':
	            filtered_df = df[lengths < threshold]
	        else:  # comparison == 'greater'
	            filtered_df = df[lengths > threshold]

	        # --- 3. Reporting Results ---
	        num_found = len(filtered_df)
	        if num_found > 0:
	            print(f"SUCCESS: Found {num_found} rows meeting the condition.\n")
	            # Context columns to show; dict.fromkeys drops duplicates while
	            # keeping order, so investigating 'source' or 'answer' does not
	            # print that column twice. Columns absent from the file are skipped.
	            display_columns = [
	                col
	                for col in dict.fromkeys([column_name, 'source', 'answer'])
	                if col in df.columns
	            ]

	            with pd.option_context('display.max_rows', None, 'display.max_colwidth', 100):
	                print(filtered_df[display_columns])
	        else:
	            print("SUCCESS: Found 0 rows meeting the specified condition.")

	    except Exception as e:
	        print(f"An unexpected error occurred: {e}")


	if __name__ == "__main__":
	    # --- 4. Command-Line Interface (CLI) Setup ---
	    # Each entry pairs a flag with the keyword options handed to add_argument;
	    # the options are registered in the order listed here.
	    option_specs = [
	        ("--file", {
	            "default": DEFAULT_DATA_FILE,
	            "help": "Path to the .parquet data file to investigate.",
	        }),
	        ("--column", {
	            "default": DEFAULT_COLUMN,
	            "help": "The column whose length you want to check.",
	        }),
	        ("--threshold", {
	            "type": int,
	            "default": DEFAULT_THRESHOLD,
	            "help": "The character length threshold.",
	        }),
	        ("--comparison", {
	            "choices": ['less', 'greater'],
	            "default": 'less',
	            "help": "Set to 'less' to find rows shorter than the threshold, or 'greater' for longer.",
	        }),
	    ]

	    # ArgumentDefaultsHelpFormatter makes --help show each option's default value.
	    cli = argparse.ArgumentParser(
	        description="Investigate a dataset by finding rows with specific column lengths.",
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	    )
	    for flag, options in option_specs:
	        cli.add_argument(flag, **options)

	    cli_args = cli.parse_args()

	    # Arguments are passed in the function's declared order:
	    # file_path, column_name, threshold, comparison.
	    investigate_column_length(
	        cli_args.file,
	        cli_args.column,
	        cli_args.threshold,
	        cli_args.comparison,
	    )