# compute_embeddings.py
"""Compute sentence embeddings for the normalized psychology question set.

Reads the normalized parquet produced by normalize_psych_data.py, encodes
the 'question' column with a small sentence-transformer model, and writes
the original dataframe plus one column per embedding dimension back out
as a new parquet file.
"""
import os
import sys

import pandas as pd
from sentence_transformers import SentenceTransformer

# --- Configuration ---
DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
OUTPUT_FILE = "data/processed/psychology_data_with_embeddings.parquet"
# This model is small, fast, and effective for sentence-level tasks.
MODEL_NAME = 'all-MiniLM-L6-v2'


def main() -> int:
    """Run the embedding pipeline.

    Returns:
        Process exit code: 0 on success, 1 if the input file is missing.
    """
    print("--- Starting Embedding Computation ---")

    # Guard clause: fail fast (and with a nonzero exit code, so upstream
    # orchestration notices) when the prerequisite step has not been run.
    if not os.path.exists(DATA_FILE):
        print(f"FATAL: Data file not found at {DATA_FILE}. Please run normalize_psych_data.py first.")
        return 1

    df = pd.read_parquet(DATA_FILE)

    print(f"Loading sentence-transformer model: '{MODEL_NAME}'...")
    # This will download the model on the first run
    model = SentenceTransformer(MODEL_NAME)

    print(f"Computing embeddings for {len(df)} questions... (This may take a while)")
    # The .encode() method takes a list of strings and returns a list of numpy arrays (embeddings)
    embeddings = model.encode(df['question'].tolist(), show_progress_bar=True)

    # The embedding is a 384-dimensional vector for this model.
    # We'll store it as separate columns ('embed_0', 'embed_1', ...) so the
    # result stays a flat, parquet-friendly table.
    embedding_df = pd.DataFrame(embeddings, index=df.index).add_prefix('embed_')

    # Combine the original dataframe with the new embedding columns
    df_with_embeddings = pd.concat([df, embedding_df], axis=1)

    print(f"Saving new dataframe with embeddings to '{OUTPUT_FILE}'...")
    df_with_embeddings.to_parquet(OUTPUT_FILE)

    print("\nSUCCESS: Embeddings computed and saved.")
    print(f"New dataframe shape: {df_with_embeddings.shape}")
    return 0


if __name__ == "__main__":
    sys.exit(main())