# psychology-tutor-engine / generate_distractor_training_set.py
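"""Build a training set for distractor generation.

Expects the output of compute_embeddings.py: a parquet file with 'question'
and 'answer' columns plus 'embed_*' embedding columns. For each sampled
question, the distractor is the answer of another row whose embedding falls
in a mid-range cosine-similarity band (related, but not a near-duplicate).
Writes (question, correct_answer, distractor) triples to a parquet file.
"""
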
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
from tqdm import tqdm

# --- Configuration ---
DATA_FILE = "data/processed/psychology_data_with_embeddings.parquet"
OUTPUT_DIR = "data/training_sets"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "distractor_generation_training_data.parquet")
NUM_SAMPLES_TO_GENERATE = 50000
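# Candidate distractors must fall inside this cosine-similarity band:
# similar enough to the question to be plausible, but not so similar that
# the distractor is effectively a restatement of the correct answer.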
SIMILARITY_MIN = 0.3
SIMILARITY_MAX = 0.7
# Process in batches to balance speed and memory: each batch allocates a
# BATCH_SIZE x corpus_size float similarity matrix, so lower this value
# if memory is tight.
BATCH_SIZE = 1000

if __name__ == "__main__":
    print("--- Starting FAST & SAFE Automated Training Set Generation ---")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if not os.path.exists(DATA_FILE):
        print(f"FATAL: Data file with embeddings not found at '{DATA_FILE}'. "
              "Please run compute_embeddings.py first.")
    else:
        print("Loading data and embeddings...")
        df = pd.read_parquet(DATA_FILE)

        num_samples = min(NUM_SAMPLES_TO_GENERATE, len(df))
        df_sample = df.sample(n=num_samples, random_state=42)

        embedding_cols = [col for col in df.columns if col.startswith('embed_')]
        all_embeddings = df[embedding_cols].values

        # np.where() below yields positional indices into all_embeddings, so
        # map each sampled row's pandas label to its position in df up front.
        sample_positions = df.index.get_indexer(df_sample.index)

        training_records = []
        num_batches = int(np.ceil(len(df_sample) / BATCH_SIZE))
        print(f"Processing {len(df_sample)} samples in {num_batches} batches of size {BATCH_SIZE}...")
        for i in tqdm(range(num_batches)):
            # Get the current batch of questions
            batch_start = i * BATCH_SIZE
            batch_end = (i + 1) * BATCH_SIZE
            batch_df = df_sample.iloc[batch_start:batch_end]
            batch_positions = sample_positions[batch_start:batch_end]
            batch_embeddings = batch_df[embedding_cols].values

            # --- FAST MATRIX OPERATION ---
            # Score the whole batch against the full corpus in one call. This
            # allocates a temporary BATCH_SIZE x corpus_size matrix (e.g.,
            # 1000 x 400k), so memory scales with the corpus, not the sample.
            sim_matrix_batch = cosine_similarity(batch_embeddings, all_embeddings)
            # Now, iterate through the results for this small batch
            for j in range(len(batch_df)):
                scores = sim_matrix_batch[j]
                candidate_indices = np.where(
                    (scores > SIMILARITY_MIN) & (scores < SIMILARITY_MAX)
                )[0]

                # Exclude the question itself; scores are indexed by position
                # in df, so compare against the precomputed positional index,
                # not the pandas label.
                current_pos = batch_positions[j]
                candidate_indices = candidate_indices[candidate_indices != current_pos]

                if len(candidate_indices) > 0:
                    distractor_idx = np.random.choice(candidate_indices)
                else:
                    # Fallback: no candidate in the band, so take the row at
                    # roughly the 30th percentile of similarity, skipping the
                    # question itself if it happens to land there.
                    sorted_indices = np.argsort(scores)
                    fallback_pos = int(len(sorted_indices) * 0.3)
                    fallback_choice = sorted_indices[fallback_pos]
                    distractor_idx = (fallback_choice
                                      if fallback_choice != current_pos
                                      else sorted_indices[fallback_pos + 1])

                training_records.append({
                    'question': batch_df.iloc[j]['question'],
                    'correct_answer': batch_df.iloc[j]['answer'],
                    'distractor': df.iloc[distractor_idx]['answer'],
                })
print("\nConstructing final training set from processed records...")
training_data = pd.DataFrame(training_records)
training_data.to_parquet(OUTPUT_FILE, index=False)
print("\n--- SUCCESS ---")
print(f"Automatically generated a training set with {len(training_data)} examples.")
print(f"File saved to: '{OUTPUT_FILE}'")
print("\nFirst 5 examples:")
print(training_data.head())
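
# Quick sanity check of the output (run separately, e.g. in a REPL):
#   import pandas as pd
#   out = pd.read_parquet("data/training_sets/distractor_generation_training_data.parquet")
#   print(out.sample(5, random_state=0))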