# normalize_psych_data.py
# FINAL CORRECTED VERSION

import os
import warnings

import pandas as pd
import requests
from datasets import load_dataset
from langdetect import DetectorFactory, LangDetectException, detect  # English-only filter
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)

# langdetect is non-deterministic by default; seed it so the language
# filter gives reproducible results across runs.
DetectorFactory.seed = 0

# --- Configuration ---
RAW_DATA_DIR = "data/raw_psych_data"
NORMALIZED_DATA_DIR = "data/processed"
os.makedirs(RAW_DATA_DIR, exist_ok=True)
os.makedirs(NORMALIZED_DATA_DIR, exist_ok=True)

SCHEMA = ['question', 'answer', 'source', 'licence']


# --- Helper Functions ---
def download_file(url, local_filename):
    local_path = os.path.join(RAW_DATA_DIR, local_filename)
    if os.path.exists(local_path):
        return local_path
    print(f"Downloading {url} to {local_path}...")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            with open(local_path, 'wb') as f, tqdm(
                total=total_size, unit='iB', unit_scale=True, desc=local_filename
            ) as pbar:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
                    pbar.update(len(chunk))
        return local_path
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Download failed for {url}. Error: {e}")
        if os.path.exists(local_path):
            os.remove(local_path)  # drop the partial file so the next run retries
        return None


def save_normalized_df(df, filename):
    # Basic validity checks only; the length/language quality filters are
    # applied once on the combined DataFrame in __main__.
    assert set(df.columns) >= {"question", "answer"}, (
        f"DataFrame for {filename} is missing 'question' or 'answer' columns. "
        f"Found: {list(df.columns)}"
    )
    df = df.dropna(subset=['question', 'answer'])
    df = df[df['question'].astype(str).str.strip() != '']
    df = df[df['answer'].astype(str).str.strip() != '']
    df = df[SCHEMA].copy()
    output_path = os.path.join(NORMALIZED_DATA_DIR, filename)
    df.to_parquet(output_path, index=False)


# --- Dataset Processing Functions ---
def process_boltmonkey():
    print("\n--- Processing: BoltMonkey ---")
    url = "https://huggingface.co/datasets/BoltMonkey/psychology-question-answer/resolve/main/data/train/train.json?download=true"
    filepath = download_file(url, "boltmonkey.json")
    if not filepath:
        return pd.DataFrame()
    df = pd.read_json(filepath)
    df['source'] = 'BoltMonkey/psychology-question-answer'
    df['licence'] = 'CC-BY-NC'
    return df


def process_gragroo():
    print("\n--- Processing: Gragroo ---")
    url = "https://huggingface.co/datasets/Gragroo/psychology-question-answer_psygpt_with_validation/resolve/main/data/train-00000-of-00001.parquet?download=true"
    filepath = download_file(url, "gragroo_train.parquet")
    if not filepath:
        return pd.DataFrame()
    # Each conversation is a list of {"from": ..., "value": ...} turns; pair
    # every human turn with the assistant turn that follows it.
    pairs = []
    for conv in pd.read_parquet(filepath)["conversations"]:
        q = None
        for turn in conv:
            if turn["from"] == "human":
                q = turn["value"].strip()
            elif turn["from"] == "assistant" and q:
                pairs.append({"question": q, "answer": turn["value"].strip()})
                q = None
    if not pairs:
        return pd.DataFrame()
    df = pd.DataFrame(pairs)
    df["source"] = "Gragroo/psychology-question-answer_psygpt_with_validation"
    df["licence"] = "CC-BY-NC"
    return df


def process_psycholexqa():
    print("\n--- Processing: PsychoLexQA ---")
    try:
        ds = load_dataset("aminabbasi/PsychoLexQA", split="train")
        df = ds.to_pandas().rename(columns={"instruction": "question", "output": "answer"})
        df["source"] = "PsychoLexQA"
        df["licence"] = "CC-BY-NC"
        return df
    except Exception as e:
        print(f"ERROR: Could not load PsychoLexQA. Accept the licence on Hugging Face first.\nError: {e}")
        return pd.DataFrame()


def process_mmlu():
    print("\n--- Processing: MMLU Psychology ---")
    all_dfs = []
    # MMLU subjects are exposed as dataset configs; take the test split of each.
    for subject in ["high_school_psychology", "professional_psychology"]:
        try:
            ds = load_dataset("cais/mmlu", name=subject, split="test")
            df = ds.to_pandas()

            def format_answer(row):
                # Render the choices as "A. ...", "B. ..." and the answer index
                # as the matching letter plus the choice text.
                choices_text = "\n".join(
                    f"{chr(65 + i)}. {choice}" for i, choice in enumerate(row['choices'])
                )
                correct_choice = row['choices'][row['answer']]
                return (
                    f"{row['question']}\n\n{choices_text}",
                    f"The correct answer is {chr(65 + row['answer'])}: {correct_choice}",
                )

            df['question'], df['answer'] = zip(*df.apply(format_answer, axis=1))
            df['source'], df['licence'] = f'MMLU/{subject}', 'MIT'
            all_dfs.append(df)
        except Exception as e:
            print(f"ERROR: Could not process MMLU subject {subject}. Error: {e}")
    return pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()


# --- Main Execution ---
if __name__ == "__main__":
    all_dataframes = []
    processing_functions = [process_boltmonkey, process_gragroo, process_psycholexqa, process_mmlu]
    for func in processing_functions:
        try:
            df = func()
            if not df.empty:
                all_dataframes.append(df[SCHEMA])
        except Exception as e:
            print(f"A critical error occurred during execution of {func.__name__}: {e}")

    if all_dataframes:
        print("\n--- Combining all datasets ---")
        final_df = pd.concat(all_dataframes, ignore_index=True)

        # --- Final quality filters ---
        print(f"\nApplying final quality filters to {len(final_df)} combined rows...")
        original_rows = len(final_df)

        # Filter 1: drop very short questions.
        final_df = final_df[final_df['question'].str.len() >= 10].copy()

        # Filter 2: keep English questions only.
        def is_english(text):
            try:
                return detect(text) == 'en'
            except LangDetectException:
                # If the language cannot be detected, treat the row as non-English.
                return False

        print("Detecting language for each question... (This might take a moment)")
        mask = final_df['question'].apply(is_english)
        final_df = final_df[mask]

        rows_removed = original_rows - len(final_df)
        if rows_removed > 0:
            print(f"-> Filtered out {rows_removed} rows due to length or language.")

        # Save the final, clean dataset.
        final_output_path = os.path.join(NORMALIZED_DATA_DIR, "ALL_PSYCHOLOGY_DATA_normalized.parquet")
        final_df.to_parquet(final_output_path, index=False)
        print(f"\nSaved final combined data to {final_output_path} ({len(final_df)} rows)")

        print("\n--- Final Summary ---")
        print("Breakdown by source:")
        print(final_df['source'].value_counts())
    else:
        print("\nNo data was processed successfully. Check logs for errors.")