File size: 4,060 Bytes
1da14e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# investigate_data.py
# A robust script to find and analyze rows in the dataset based on column length.

import pandas as pd
import argparse
import os

# --- Configuration ---
# Fallback values for the command-line options defined at the bottom of this
# script; each one is used only when the matching flag is omitted.

# Parquet file inspected when --file is not given.
DEFAULT_DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
# Column whose value lengths are checked when --column is not given.
DEFAULT_COLUMN = "question"
# Length threshold applied when --threshold is not given.
DEFAULT_THRESHOLD = 10

def investigate_column_length(file_path: str, column_name: str, threshold: int, comparison: str) -> None:
    """
    Load a parquet file and print the rows whose value in one column is
    shorter or longer than a given length threshold.

    Length is measured on the string form of each value. Missing values
    (None/NaN) are counted as length 0 instead of being measured as the
    literal text "None" or "nan", so they show up as "too short" and never
    as "too long". The loaded data is not modified, so the printed rows show
    the values exactly as stored in the file.

    Args:
        file_path (str): The path to the parquet data file.
        column_name (str): The name of the column to investigate.
        threshold (int): The length threshold to check against.
        comparison (str): Either 'less' (length < threshold) or
            'greater' (length > threshold).

    Returns:
        None. Results and error messages are printed to stdout; problems
        (missing file, bad comparison, missing column, read errors) are
        reported as messages rather than raised.
    """
    # --- 1. Input Validation ---
    if not os.path.exists(file_path):
        print(f"Error: Data file not found at '{file_path}'")
        return

    if comparison not in ('less', 'greater'):
        print(f"Error: Invalid comparison type '{comparison}'. Must be 'less' or 'greater'.")
        return

    print("--- Starting Investigation ---")
    print(f"File:        {file_path}")
    print(f"Column:      {column_name}")
    print(f"Threshold:   {threshold}")
    print(f"Condition:   Length is {comparison} than {threshold}")
    print("----------------------------\n")

    # --- 2. Data Loading and Analysis ---
    try:
        df = pd.read_parquet(file_path)

        if column_name not in df.columns:
            print(f"Error: Column '{column_name}' not found in the dataset.")
            print(f"Available columns are: {list(df.columns)}")
            return

        column = df[column_name]
        # Cast a temporary copy to str so the .str accessor works for any dtype,
        # then force missing entries to length 0: astype(str) would otherwise
        # turn None/NaN into the 4/3-character strings "None"/"nan".
        lengths = column.astype(str).str.len().where(column.notna(), 0)

        # Apply the filter based on the comparison type
        if comparison == 'less':
            mask = lengths < threshold
        else:  # comparison == 'greater'
            mask = lengths > threshold
        filtered_df = df[mask]

        # --- 3. Reporting Results ---
        num_found = len(filtered_df)
        if num_found > 0:
            print(f"SUCCESS: Found {num_found} rows meeting the condition.\n")
            # Show the investigated column plus context columns, keeping only
            # those that exist. dict.fromkeys drops duplicates (in order) so the
            # column is not printed twice when it is itself 'source' or 'answer'.
            display_columns = [
                col
                for col in dict.fromkeys([column_name, 'source', 'answer'])
                if col in df.columns
            ]

            with pd.option_context('display.max_rows', None, 'display.max_colwidth', 100):
                print(filtered_df[display_columns])
        else:
            print("SUCCESS: Found 0 rows meeting the specified condition.")

    except Exception as e:
        # Deliberate catch-all: this is the script's top-level boundary, and a
        # readable message is preferred over a traceback for read/parse errors.
        print(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    # --- 4. Command-Line Interface (CLI) Setup ---
    # ArgumentDefaultsHelpFormatter appends each option's default to its --help text.
    cli = argparse.ArgumentParser(
        description="Investigate a dataset by finding rows with specific column lengths.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # One (flag, keyword-arguments) entry per option, registered in this order.
    option_specs = (
        (
            "--file",
            {
                "default": DEFAULT_DATA_FILE,
                "help": "Path to the .parquet data file to investigate.",
            },
        ),
        (
            "--column",
            {
                "default": DEFAULT_COLUMN,
                "help": "The column whose length you want to check.",
            },
        ),
        (
            "--threshold",
            {
                "type": int,
                "default": DEFAULT_THRESHOLD,
                "help": "The character length threshold.",
            },
        ),
        (
            "--comparison",
            {
                "choices": ['less', 'greater'],
                "default": 'less',
                "help": "Set to 'less' to find rows shorter than the threshold, or 'greater' for longer.",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()

    # Hand the parsed options straight to the analysis routine.
    investigate_column_length(
        file_path=options.file,
        column_name=options.column,
        threshold=options.threshold,
        comparison=options.comparison,
    )