# compute_embeddings.py
"""Compute sentence embeddings for the normalized psychology question set.

Reads the normalized parquet produced by normalize_psych_data.py, encodes
the 'question' column with a small sentence-transformer model, and writes
the original dataframe plus one column per embedding dimension back out
as a new parquet file.
"""
import os
import sys

import pandas as pd
from sentence_transformers import SentenceTransformer

# --- Configuration ---
DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
OUTPUT_FILE = "data/processed/psychology_data_with_embeddings.parquet"
# This model is small, fast, and effective for sentence-level tasks.
MODEL_NAME = 'all-MiniLM-L6-v2'


def main() -> int:
    """Run the embedding pipeline.

    Returns:
        Process exit code: 0 on success, 1 if the input file is missing.
    """
    print("--- Starting Embedding Computation ---")

    # Guard clause: fail fast (and with a nonzero exit code, so upstream
    # orchestration notices) when the prerequisite step has not been run.
    if not os.path.exists(DATA_FILE):
        print(f"FATAL: Data file not found at {DATA_FILE}. Please run normalize_psych_data.py first.")
        return 1

    df = pd.read_parquet(DATA_FILE)

    print(f"Loading sentence-transformer model: '{MODEL_NAME}'...")
    # This will download the model on the first run
    model = SentenceTransformer(MODEL_NAME)

    print(f"Computing embeddings for {len(df)} questions... (This may take a while)")
    # The .encode() method takes a list of strings and returns a list of numpy arrays (embeddings)
    embeddings = model.encode(df['question'].tolist(), show_progress_bar=True)

    # The embedding is a 384-dimensional vector for this model.
    # We'll store it as separate columns ('embed_0', 'embed_1', ...) so the
    # result stays a flat, parquet-friendly table.
    embedding_df = pd.DataFrame(embeddings, index=df.index).add_prefix('embed_')

    # Combine the original dataframe with the new embedding columns
    df_with_embeddings = pd.concat([df, embedding_df], axis=1)

    print(f"Saving new dataframe with embeddings to '{OUTPUT_FILE}'...")
    df_with_embeddings.to_parquet(OUTPUT_FILE)

    print("\nSUCCESS: Embeddings computed and saved.")
    print(f"New dataframe shape: {df_with_embeddings.shape}")
    return 0


if __name__ == "__main__":
    sys.exit(main())