# psychology-tutor-engine / generate_distractor_training_set.py
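"""Build a training set for distractor generation.

Expects the output of compute_embeddings.py: a parquet file with 'question'
and 'answer' columns plus 'embed_*' embedding columns. For each sampled
question, the distractor is the answer of another row whose embedding falls
in a mid-range cosine-similarity band (related, but not a near-duplicate).
Writes (question, correct_answer, distractor) triples to a parquet file.
"""
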
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
from tqdm import tqdm

# --- Configuration ---
DATA_FILE = "data/processed/psychology_data_with_embeddings.parquet"
OUTPUT_DIR = "data/training_sets"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "distractor_generation_training_data.parquet")
NUM_SAMPLES_TO_GENERATE = 50000
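# Candidate distractors must fall inside this cosine-similarity band:
# similar enough to the question to be plausible, but not so similar that
# the distractor is effectively a restatement of the correct answer.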
SIMILARITY_MIN = 0.3
SIMILARITY_MAX = 0.7
# Process in batches to balance speed and memory: each batch allocates a
# BATCH_SIZE x corpus_size float similarity matrix, so lower this value
# if memory is tight.
BATCH_SIZE = 1000

if __name__ == "__main__":
    print("--- Starting FAST & SAFE Automated Training Set Generation ---")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if not os.path.exists(DATA_FILE):
        print(f"FATAL: Data file with embeddings not found at '{DATA_FILE}'. "
              "Please run compute_embeddings.py first.")
    else:
        print("Loading data and embeddings...")
        df = pd.read_parquet(DATA_FILE)

        num_samples = min(NUM_SAMPLES_TO_GENERATE, len(df))
        df_sample = df.sample(n=num_samples, random_state=42)

        embedding_cols = [col for col in df.columns if col.startswith('embed_')]
        all_embeddings = df[embedding_cols].values

        # np.where() below yields positional indices into all_embeddings, so
        # map each sampled row's pandas label to its position in df up front.
        sample_positions = df.index.get_indexer(df_sample.index)

        training_records = []
        num_batches = int(np.ceil(len(df_sample) / BATCH_SIZE))
        print(f"Processing {len(df_sample)} samples in {num_batches} batches of size {BATCH_SIZE}...")
        for i in tqdm(range(num_batches)):
            # Get the current batch of questions
            batch_start = i * BATCH_SIZE
            batch_end = (i + 1) * BATCH_SIZE
            batch_df = df_sample.iloc[batch_start:batch_end]
            batch_positions = sample_positions[batch_start:batch_end]
            batch_embeddings = batch_df[embedding_cols].values

            # --- FAST MATRIX OPERATION ---
            # Score the whole batch against the full corpus in one call. This
            # allocates a temporary BATCH_SIZE x corpus_size matrix (e.g.,
            # 1000 x 400k), so memory scales with the corpus, not the sample.
            sim_matrix_batch = cosine_similarity(batch_embeddings, all_embeddings)
            # Now, iterate through the results for this small batch
            for j in range(len(batch_df)):
                scores = sim_matrix_batch[j]
                candidate_indices = np.where(
                    (scores > SIMILARITY_MIN) & (scores < SIMILARITY_MAX)
                )[0]

                # Exclude the question itself; scores are indexed by position
                # in df, so compare against the precomputed positional index,
                # not the pandas label.
                current_pos = batch_positions[j]
                candidate_indices = candidate_indices[candidate_indices != current_pos]

                if len(candidate_indices) > 0:
                    distractor_idx = np.random.choice(candidate_indices)
                else:
                    # Fallback: no candidate in the band, so take the row at
                    # roughly the 30th percentile of similarity, skipping the
                    # question itself if it happens to land there.
                    sorted_indices = np.argsort(scores)
                    fallback_pos = int(len(sorted_indices) * 0.3)
                    fallback_choice = sorted_indices[fallback_pos]
                    distractor_idx = (fallback_choice
                                      if fallback_choice != current_pos
                                      else sorted_indices[fallback_pos + 1])

                training_records.append({
                    'question': batch_df.iloc[j]['question'],
                    'correct_answer': batch_df.iloc[j]['answer'],
                    'distractor': df.iloc[distractor_idx]['answer'],
                })
print("\nConstructing final training set from processed records...")
training_data = pd.DataFrame(training_records)
training_data.to_parquet(OUTPUT_FILE, index=False)
print("\n--- SUCCESS ---")
print(f"Automatically generated a training set with {len(training_data)} examples.")
print(f"File saved to: '{OUTPUT_FILE}'")
print("\nFirst 5 examples:")
print(training_data.head())
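
# Quick sanity check of the output (run separately, e.g. in a REPL):
#   import pandas as pd
#   out = pd.read_parquet("data/training_sets/distractor_generation_training_data.parquet")
#   print(out.sample(5, random_state=0))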