Arash-Alborz's picture
Upload pipeline.py
99dce5b verified
raw
history blame contribute delete
820 Bytes
# feature_extraction/pipeline.py
import numpy as np
import joblib
from feature_extraction.embedding_from_text import get_bert_embedding
from feature_extraction.liwc_from_text import load_liwc_dic, liwc_vector
# Module-level artifacts, loaded once at import time so that every call to
# text_to_features() reuses them instead of re-reading the files.
# NOTE: both paths are relative; presumably they resolve against the process's
# working directory — confirm against how the helpers open the files.

# LIWC lexicon, passed to liwc_vector() for every text.
liwc_map = load_liwc_dic("models/output.dic")
# Pre-fitted scaler applied to the combined feature vector via .transform().
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load scaler artifacts from a trusted source.
scaler = joblib.load("models/scaler.pkl")
def text_to_features(text: str) -> np.ndarray:
    """Turn raw text into the scaled feature row used for prediction.

    The text is embedded with ``get_bert_embedding`` and scored with
    ``liwc_vector`` (using the module-level ``liwc_map``); the two vectors
    are joined end to end and passed through the module-level ``scaler``.

    Args:
        text: The input text to featurize.

    Returns:
        The output of ``scaler.transform`` for the single combined vector.
        The original comments describe this as shape ``(1, total_dim)``,
        with a 768-dim BERT part and a ~64-dim LIWC part — these sizes
        depend on the helpers and are not verified here.
    """
    bert_part = get_bert_embedding(text)

    # liwc_vector returns a pair; only the first element (the vector) is used.
    liwc_result = liwc_vector(text, liwc_map)
    liwc_part, _ = liwc_result

    # Embedding first, LIWC second — presumably this order must match the
    # layout the scaler was fitted on; verify against the training code.
    combined = np.concatenate((bert_part, liwc_part))

    # Wrap in a list so the scaler sees a single-sample batch.
    return scaler.transform([combined])