Spaces:
Runtime error
Runtime error
# investigate_data.py
# A robust script to find and analyze rows in the dataset based on column length.
import pandas as pd | |
import argparse | |
import os | |
# --- Configuration: defaults used when a CLI option is omitted ---
# Parquet file that is inspected by default.
DEFAULT_DATA_FILE: str = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
# Column whose text length is measured by default.
DEFAULT_COLUMN: str = "question"
# Default character-count threshold the column length is compared against.
DEFAULT_THRESHOLD: int = 10
def investigate_column_length(file_path: str, column_name: str, threshold: int, comparison: str) -> None:
    """
    Load a parquet file and print the rows where the length of a column is
    strictly less than or strictly greater than a threshold.

    The length of a cell is the length of its string representation. Missing
    values (None / NaN) count as length 0, so they are reported as "short"
    rather than being measured as the literal text 'None' or 'nan'.

    All problems (missing file, invalid comparison, unknown column, read
    failures) are reported on stdout; the function never raises for them.

    Args:
        file_path (str): The path to the parquet data file.
        column_name (str): The name of the column to investigate.
        threshold (int): The length threshold to check against.
        comparison (str): Either 'less' or 'greater'.

    Returns:
        None. Results and errors are printed to stdout.
    """
    # --- 1. Input Validation ---
    if not os.path.exists(file_path):
        print(f"Error: Data file not found at '{file_path}'")
        return
    if comparison not in ('less', 'greater'):
        print(f"Error: Invalid comparison type '{comparison}'. Must be 'less' or 'greater'.")
        return

    print("--- Starting Investigation ---")
    print(f"File: {file_path}")
    print(f"Column: {column_name}")
    print(f"Threshold: {threshold}")
    print(f"Condition: Length is {comparison} than {threshold}")
    print("----------------------------\n")

    # --- 2. Data Loading and Analysis ---
    try:
        df = pd.read_parquet(file_path)
        if column_name not in df.columns:
            print(f"Error: Column '{column_name}' not found in the dataset.")
            print(f"Available columns are: {list(df.columns)}")
            return

        # Measure lengths on a separate Series so the DataFrame keeps its
        # original values for display. Casting to str makes the .str accessor
        # usable for non-text columns; missing cells are then forced to
        # length 0 because str(None) / str(nan) would otherwise be measured
        # as 4 / 3 characters.
        column = df[column_name]
        lengths = column.astype(str).str.len().where(column.notna(), 0)

        # Apply the filter based on the comparison type
        if comparison == 'less':
            mask = lengths < threshold
        else:  # comparison == 'greater'
            mask = lengths > threshold
        filtered_df = df[mask]

        # --- 3. Reporting Results ---
        num_found = len(filtered_df)
        if num_found > 0:
            print(f"SUCCESS: Found {num_found} rows meeting the condition.\n")
            # Display relevant columns for context. dict.fromkeys drops the
            # duplicate that would appear when column_name is itself
            # 'source' or 'answer', while keeping the order.
            display_columns = dict.fromkeys([column_name, 'source', 'answer'])
            # Ensure display columns exist before trying to show them
            valid_display_columns = [col for col in display_columns if col in df.columns]
            with pd.option_context('display.max_rows', None, 'display.max_colwidth', 100):
                print(filtered_df[valid_display_columns])
        else:
            print("SUCCESS: Found 0 rows meeting the specified condition.")
    except Exception as e:
        # Deliberate catch-all at the script boundary: report the failure
        # (e.g. unreadable or corrupt parquet file) instead of a traceback.
        print(f"An unexpected error occurred: {e}")
if __name__ == "__main__":
    # --- 4. Command-Line Interface (CLI) Setup ---
    # Each option is declared once in this table (flag, add_argument keywords)
    # and registered in a loop below.
    cli_options = (
        (
            "--file",
            {
                "default": DEFAULT_DATA_FILE,
                "help": "Path to the .parquet data file to investigate.",
            },
        ),
        (
            "--column",
            {
                "default": DEFAULT_COLUMN,
                "help": "The column whose length you want to check.",
            },
        ),
        (
            "--threshold",
            {
                "type": int,
                "default": DEFAULT_THRESHOLD,
                "help": "The character length threshold.",
            },
        ),
        (
            "--comparison",
            {
                "choices": ['less', 'greater'],
                "default": 'less',
                "help": "Set to 'less' to find rows shorter than the threshold, or 'greater' for longer.",
            },
        ),
    )

    # ArgumentDefaultsHelpFormatter shows each option's default in --help.
    cli = argparse.ArgumentParser(
        description="Investigate a dataset by finding rows with specific column lengths.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for flag, settings in cli_options:
        cli.add_argument(flag, **settings)

    parsed = cli.parse_args()
    investigate_column_length(
        parsed.file,
        parsed.column,
        parsed.threshold,
        parsed.comparison,
    )