# feature_engineering.py
import pandas as pd
import numpy as np

# Parameters of the toy learning model used by the simulator.
_INITIAL_MASTERY = 0.1  # probability of a correct answer on an unseen source
_MASTERY_GAIN = 0.25    # fraction of the remaining gap closed by a correct answer
_MASTERY_DECAY = 0.1    # fraction of current mastery lost on an incorrect answer


def simulate_student_interactions(df_qa, num_students, interactions_per_student,
                                  random_state=None):
    """Generate a simulated, time-ordered interaction log from static Q&A data.

    For every student, ``interactions_per_student`` rows are sampled (with
    replacement) from ``df_qa``. Correctness is drawn from a per-source
    mastery probability that rises after a correct answer and decays after an
    incorrect one. Response times are drawn from a faster distribution for
    correct answers and a slower one for incorrect answers, clipped to
    [5, 300] seconds. Timestamps are spaced one minute apart.

    Args:
        df_qa: DataFrame of questions; must contain a ``source`` column.
        num_students: Number of simulated students (ids ``0..num_students-1``).
        interactions_per_student: Number of sampled rows per student.
        random_state: Optional int seed or ``numpy.random.RandomState`` for
            reproducible output. When ``None`` (default) NumPy's global random
            state is used.

    Returns:
        A DataFrame holding the sampled ``df_qa`` columns plus ``student_id``,
        ``is_correct``, ``response_time_sec`` and ``timestamp``. An empty
        DataFrame is returned when ``df_qa`` is ``None``/empty or when no
        students are requested.
    """
    if df_qa is None or df_qa.empty:
        return pd.DataFrame()

    print(f"\nSimulating interaction logs for {num_students} students...")

    if random_state is None:
        rng = np.random       # module-level functions share the global state
        sample_state = None   # DataFrame.sample then also uses the global state
    elif isinstance(random_state, np.random.RandomState):
        rng = sample_state = random_state
    else:
        rng = sample_state = np.random.RandomState(random_state)

    # Loop-invariant: the set of sources and the simulated start time.
    sources = df_qa['source'].unique()
    start_time = pd.Timestamp.now()

    all_interactions = []
    for student_id in range(num_students):
        student_interactions = df_qa.sample(
            n=interactions_per_student, replace=True, random_state=sample_state
        ).copy()
        student_interactions['student_id'] = student_id

        mastery = {source: _INITIAL_MASTERY for source in sources}
        correct_list = []
        for source in student_interactions['source']:
            # Read once and assign back, so a source that is not a key
            # (e.g. NaN, which never compares equal) cannot raise KeyError.
            current = mastery.get(source, _INITIAL_MASTERY)
            is_correct = 1 if rng.rand() < current else 0
            correct_list.append(is_correct)
            if is_correct:
                mastery[source] = current + (1 - current) * _MASTERY_GAIN
            else:
                mastery[source] = current - current * _MASTERY_DECAY
        student_interactions['is_correct'] = correct_list

        n_rows = len(student_interactions)
        correct_times = rng.normal(25, 5, size=n_rows)
        incorrect_times = rng.normal(60, 15, size=n_rows)
        answered_correctly = np.asarray(correct_list) == 1
        student_interactions['response_time_sec'] = np.where(
            answered_correctly, correct_times, incorrect_times
        ).clip(5, 300)

        # One interaction per minute, starting at the shared start time.
        student_interactions['timestamp'] = (
            start_time + pd.to_timedelta(np.arange(n_rows), unit='min')
        )
        all_interactions.append(student_interactions)

    if all_interactions:
        df_simulated = pd.concat(all_interactions, ignore_index=True)
    else:
        df_simulated = pd.DataFrame()
    print(f"Simulation complete. Generated {len(df_simulated):,} interactions.")
    return df_simulated


def create_features(df, skill_encoder):
    """Engineer the per-interaction features needed for the LGBM model.

    Rows whose ``source`` is not among ``skill_encoder.classes_`` are
    discarded. The remaining rows are sorted by student and timestamp (stable
    sort) and extended with:

    * ``skill_id_encoded``: ``skill_encoder.transform`` of ``source``.
    * ``prior_is_correct`` / ``prior_response_time``: the student's previous
      interaction values.
    * ``skill_attempts``: number of earlier attempts by the student on the
      same skill.
    * ``skill_correct_rate``: share of those earlier attempts that were
      correct; 0.5 when it is undefined (no earlier attempt).
    * ``question_length``: character length of ``question`` (0 if missing).

    Each student's first interaction has no prior values and is dropped.

    Args:
        df: Interaction log with columns ``student_id``, ``timestamp``,
            ``source``, ``is_correct``, ``response_time_sec`` and ``question``.
        skill_encoder: Fitted encoder exposing ``classes_`` and ``transform``.

    Returns:
        A new DataFrame with the added feature columns (the input is not
        modified), or an empty DataFrame when no row has a known source.
    """
    known_sources = skill_encoder.classes_
    # Filter first, then copy once, so the caller's frame is never mutated.
    processed_df = df.loc[df['source'].isin(known_sources)].copy()
    if processed_df.empty:
        return pd.DataFrame()

    processed_df['skill_id_encoded'] = skill_encoder.transform(processed_df['source'])
    # A stable sort keeps the original order of rows with identical timestamps.
    processed_df = processed_df.sort_values(['student_id', 'timestamp'], kind='mergesort')

    processed_df['prior_is_correct'] = (
        processed_df.groupby('student_id')['is_correct'].shift(1)
    )
    processed_df['prior_response_time'] = (
        processed_df.groupby('student_id')['response_time_sec'].shift(1)
    )

    per_skill = processed_df.groupby(['student_id', 'skill_id_encoded'])
    attempts = per_skill.cumcount()
    correct_including_current = per_skill['is_correct'].cumsum()
    processed_df['skill_attempts'] = attempts

    # Subtract the current outcome so the rate only reflects earlier attempts
    # (no target leakage). 0/0 on the first attempt becomes the 0.5 prior.
    earlier_correct = correct_including_current - processed_df['is_correct']
    processed_df['skill_correct_rate'] = (earlier_correct / attempts).fillna(0.5)

    processed_df['question_length'] = processed_df['question'].str.len().fillna(0)

    return processed_df.dropna(subset=['prior_is_correct', 'prior_response_time'])