# feature_extraction/liwc_from_csv.py import numpy as np import pandas as pd from collections import defaultdict, Counter import re def load_liwc_dic(dic_path="models/output.dic"): category_map = defaultdict(list) with open(dic_path, 'r', encoding='utf-8') as f: for line in f: if ':' not in line: continue parts = line.strip().split() category = parts[0].rstrip(':') words = parts[1:] category_map[category] = words return category_map def extract_liwc_from_csv(csv_path, category_map): df = pd.read_csv(csv_path) sorted_categories = sorted(category_map.keys()) def process_row(row): text = " ".join(str(row[q]) for q in ['Q1', 'Q2', 'Q3'] if pd.notna(row[q])) tokens = re.findall(r"\b\w+\b", text.lower()) counts = Counter() for category, words in category_map.items(): for token in tokens: if token in words: counts[category] += 1 vec = np.array([counts.get(cat, 0) for cat in sorted_categories]) if np.sum(vec) > 0: vec = vec / np.sum(vec) return vec liwc_features = df.apply(process_row, axis=1, result_type="expand") liwc_features.columns = [f"liwc_{cat}" for cat in sorted_categories] return liwc_features