Commit
·
cf24ede
1
Parent(s):
2c4b5cc
Create features/feature_engineer.py
Browse files
Adding Enhanced Feature Engineering Pipeline
- features/feature_engineer.py +505 -0
features/feature_engineer.py
ADDED
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# File: features/feature_engineer.py
|
2 |
+
# Enhanced Feature Engineering Pipeline for Priority 6
|
3 |
+
|
4 |
+
import json
|
5 |
+
import joblib
|
6 |
+
import logging
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
from pathlib import Path
|
10 |
+
from datetime import datetime
|
11 |
+
from scipy.sparse import hstack, csr_matrix
|
12 |
+
from typing import Dict, List, Any, Optional, Tuple
|
13 |
+
|
14 |
+
from sklearn.pipeline import Pipeline
|
15 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
16 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
17 |
+
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
|
18 |
+
from sklearn.preprocessing import StandardScaler, FunctionTransformer
|
19 |
+
|
20 |
+
import warnings
|
21 |
+
warnings.filterwarnings('ignore')
|
22 |
+
|
23 |
+
# Import feature analyzers
|
24 |
+
from features.sentiment_analyzer import SentimentAnalyzer
|
25 |
+
from features.readability_analyzer import ReadabilityAnalyzer
|
26 |
+
from features.entity_analyzer import EntityAnalyzer
|
27 |
+
from features.linguistic_analyzer import LinguisticAnalyzer
|
28 |
+
|
29 |
+
# Configure logging
|
30 |
+
logging.basicConfig(level=logging.INFO)
|
31 |
+
logger = logging.getLogger(__name__)
|
32 |
+
|
33 |
+
|
34 |
+
class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Advanced feature engineering pipeline combining multiple NLP feature extractors
    for enhanced fake news detection performance.

    TF-IDF n-gram features are stacked with the output of the optional
    sentiment, readability, entity and linguistic analyzers. When labels are
    passed to ``fit`` and ``feature_selection_k`` is smaller than the number of
    stacked columns, the matrix is reduced with chi-squared ``SelectKBest``.
    The analyzer-derived columns that remain are standardized; TF-IDF columns
    are never rescaled.
    """

    def __init__(self,
                 enable_sentiment: bool = True,
                 enable_readability: bool = True,
                 enable_entities: bool = True,
                 enable_linguistic: bool = True,
                 feature_selection_k: int = 5000,
                 tfidf_max_features: int = 10000,
                 ngram_range: Tuple[int, int] = (1, 3),
                 min_df: int = 2,
                 max_df: float = 0.95):
        """
        Initialize the advanced feature engineering pipeline.

        Args:
            enable_sentiment: Enable sentiment analysis features
            enable_readability: Enable readability/complexity features
            enable_entities: Enable named entity recognition features
            enable_linguistic: Enable advanced linguistic features
            feature_selection_k: Number of features to select
            tfidf_max_features: Maximum TF-IDF features
            ngram_range: N-gram range for TF-IDF
            min_df: Minimum document frequency for TF-IDF
            max_df: Maximum document frequency for TF-IDF
        """
        self.enable_sentiment = enable_sentiment
        self.enable_readability = enable_readability
        self.enable_entities = enable_entities
        self.enable_linguistic = enable_linguistic
        self.feature_selection_k = feature_selection_k
        self.tfidf_max_features = tfidf_max_features
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

        # Initialize feature extractors (None means "disabled")
        self.sentiment_analyzer = SentimentAnalyzer() if enable_sentiment else None
        self.readability_analyzer = ReadabilityAnalyzer() if enable_readability else None
        self.entity_analyzer = EntityAnalyzer() if enable_entities else None
        self.linguistic_analyzer = LinguisticAnalyzer() if enable_linguistic else None

        # Fitted components, created in fit()
        self.tfidf_vectorizer = None
        self.feature_selector = None
        self.feature_scaler = None

        # Feature metadata
        self.feature_names_ = []
        self.feature_importance_ = {}
        self.is_fitted_ = False

    @staticmethod
    def _as_text_array(X):
        """Convert a pandas Series or list of texts to a NumPy array; pass anything else through."""
        if isinstance(X, pd.Series):
            return X.values
        if isinstance(X, list):
            # dtype=object stores references to the strings. The default
            # fixed-width unicode dtype would pad every document to the length
            # of the longest one.
            return np.array(X, dtype=object)
        return X

    @staticmethod
    def _stack_features(tfidf_features, additional_features, clip_negative):
        """
        Stack TF-IDF and additional features column-wise into one CSR matrix.

        Args:
            tfidf_features: Sparse TF-IDF matrix
            additional_features: Dense 2-D array of analyzer features (may have 0 columns)
            clip_negative: Replace negative additional values with 0 (chi2 rejects
                negative input). TF-IDF weights are already non-negative.

        Returns:
            CSR matrix with the TF-IDF columns first, then the additional columns
        """
        if additional_features.shape[1] == 0:
            return csr_matrix(tfidf_features)
        if clip_negative:
            additional_features = np.maximum(additional_features, 0)
        # format='csr' matters: the default COO result cannot be column-sliced.
        return hstack([tfidf_features, csr_matrix(additional_features)], format='csr')

    def fit(self, X, y=None):
        """
        Fit the feature engineering pipeline.

        Args:
            X: Text data (array-like of strings)
            y: Target labels (optional, for supervised feature selection)

        Returns:
            self

        Raises:
            ValueError: If X is empty
        """
        logger.info("Fitting advanced feature engineering pipeline...")

        X = self._as_text_array(X)

        # Validate input
        if len(X) == 0:
            raise ValueError("Cannot fit on empty data")

        # Drop state from any earlier fit so a refit (e.g. without labels)
        # cannot keep using a stale selector or scaler.
        self.feature_selector = None
        self.feature_scaler = None
        self.feature_names_ = []
        self.feature_importance_ = {}
        self.is_fitted_ = False

        # Initialize TF-IDF vectorizer
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=self.tfidf_max_features,
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_df=self.max_df,
            stop_words='english',
            sublinear_tf=True,
            norm='l2',
            lowercase=True
        )

        # Fit TF-IDF on text data
        logger.info("Fitting TF-IDF vectorizer...")
        tfidf_features = self.tfidf_vectorizer.fit_transform(X)

        # Extract additional features
        additional_features = self._extract_additional_features(X, fit=True)

        # Remember the column layout so transform() can verify it later.
        self.n_tfidf_features_ = tfidf_features.shape[1]
        self.n_additional_features_ = additional_features.shape[1]
        n_total = self.n_tfidf_features_ + self.n_additional_features_

        logger.info(f"Total features before selection: {n_total}")

        # Feature selection
        if y is not None and self.feature_selection_k < n_total:
            logger.info(f"Performing feature selection (k={self.feature_selection_k})...")

            # chi2 accepts sparse input, so the stacked matrix stays sparse.
            all_features = self._stack_features(
                tfidf_features, additional_features, clip_negative=True
            )
            self.feature_selector = SelectKBest(score_func=chi2, k=self.feature_selection_k)
            selected_features = self.feature_selector.fit_transform(all_features, y)

            # Selection preserves column order, so the surviving additional
            # features are exactly the trailing columns whose support flag is set.
            support = self.feature_selector.get_support()
            n_scaled = int(support[self.n_tfidf_features_:].sum())

            logger.info(f"Selected {selected_features.shape[1]} features")
        else:
            selected_features = self._stack_features(
                tfidf_features, additional_features, clip_negative=False
            )
            n_scaled = self.n_additional_features_

        # Scale numerical features (surviving additional features only).
        # Guard against n_scaled == 0: a "[-0:]" slice would select every column.
        if n_scaled > 0:
            self.feature_scaler = StandardScaler()
            # StandardScaler cannot centre sparse input; this slice is small.
            self.feature_scaler.fit(selected_features[:, -n_scaled:].toarray())

        # Generate feature names
        self._generate_feature_names()

        # Calculate feature importance if possible
        if y is not None and self.feature_selector is not None:
            self._calculate_feature_importance()

        self.is_fitted_ = True
        logger.info("Feature engineering pipeline fitted successfully")

        return self

    def transform(self, X):
        """
        Transform text data into enhanced feature vectors.

        Args:
            X: Text data (array-like of strings)

        Returns:
            Dense 2-D array: selected TF-IDF columns followed by the scaled
            additional-feature columns

        Raises:
            ValueError: If the pipeline is not fitted, or if the number of
                additional features differs from the number seen during fit
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted before transforming")

        X = self._as_text_array(X)

        # Extract TF-IDF features
        tfidf_features = self.tfidf_vectorizer.transform(X)

        # Extract additional features
        additional_features = self._extract_additional_features(X, fit=False)

        # Extraction is best-effort and yields zero columns on failure. Report a
        # layout change here instead of letting the selector or a downstream
        # model fail on an unexplained shape mismatch. getattr keeps pipelines
        # pickled before this attribute existed loadable.
        expected_additional = getattr(self, 'n_additional_features_', None)
        if expected_additional is not None and additional_features.shape[1] != expected_additional:
            raise ValueError(
                f"Expected {expected_additional} additional features (as seen during fit) "
                f"but extracted {additional_features.shape[1]}"
            )

        # Combine features and apply feature selection.
        # NOTE(review): when a selector is present, negative analyzer values are
        # clipped to 0 in the returned features as well (kept for consistency
        # with fit); confirm this information loss is intended.
        if self.feature_selector is not None:
            all_features = self._stack_features(
                tfidf_features, additional_features, clip_negative=True
            )
            selected_features = self.feature_selector.transform(all_features)
        else:
            selected_features = self._stack_features(
                tfidf_features, additional_features, clip_negative=False
            )

        final_features = selected_features.toarray()

        # Scale the trailing columns the scaler was fitted on. Taking the width
        # from the scaler itself keeps transform consistent with whatever fit did.
        if self.feature_scaler is not None:
            n_scaled = self.feature_scaler.n_features_in_
            final_features[:, -n_scaled:] = self.feature_scaler.transform(
                final_features[:, -n_scaled:]
            )

        return final_features

    def _extract_additional_features(self, X, fit=False):
        """
        Extract additional features beyond TF-IDF.

        Runs every enabled analyzer in a fixed order (sentiment, readability,
        entity, linguistic) and concatenates their outputs column-wise.
        Extraction is best-effort: if any analyzer raises, the error is logged
        and an array with zero columns is returned.

        Args:
            X: Text data (array-like of strings)
            fit: Call ``fit_transform`` on the analyzers instead of ``transform``

        Returns:
            2-D array with one row per text
        """
        analyzers = (
            ("sentiment", self.sentiment_analyzer),
            ("readability", self.readability_analyzer),
            ("entity", self.entity_analyzer),
            ("linguistic", self.linguistic_analyzer),
        )

        try:
            feature_arrays = []
            for label, analyzer in analyzers:
                if analyzer is None:
                    continue
                logger.info(f"Extracting {label} features...")
                if fit:
                    feature_arrays.append(analyzer.fit_transform(X))
                else:
                    feature_arrays.append(analyzer.transform(X))

            # Combine all additional features
            if feature_arrays:
                additional_features = np.hstack(feature_arrays)
                logger.info(f"Extracted {additional_features.shape[1]} additional features")
            else:
                additional_features = np.empty((len(X), 0))

        except Exception as e:
            # Keep the traceback: the message alone rarely identifies the analyzer.
            logger.warning(f"Error extracting additional features: {e}", exc_info=True)
            additional_features = np.empty((len(X), 0))

        return additional_features

    def _generate_feature_names(self):
        """Generate comprehensive feature names, aligned with the fitted columns."""
        names = []

        # TF-IDF feature names
        if self.tfidf_vectorizer is not None:
            names.extend(f"tfidf_{name}" for name in self.tfidf_vectorizer.get_feature_names_out())

        # Additional feature names. Skipped when extraction produced no columns
        # during fit (analyzer failure), otherwise names would not match columns.
        if getattr(self, 'n_additional_features_', None) != 0:
            for analyzer in (self.sentiment_analyzer, self.readability_analyzer,
                             self.entity_analyzer, self.linguistic_analyzer):
                if analyzer is not None:
                    names.extend(analyzer.get_feature_names())

        # Apply feature selection to names if applicable
        if self.feature_selector is not None:
            support = self.feature_selector.get_support()
            if len(names) != len(support):
                # An analyzer reported a different number of names than columns.
                # Pad or truncate so the mask can still be applied.
                logger.warning(
                    f"Feature name count ({len(names)}) does not match "
                    f"feature count ({len(support)}); using placeholder names"
                )
                names.extend(f"feature_{i}" for i in range(len(names), len(support)))
                names = names[:len(support)]
            names = [name for name, keep in zip(names, support) if keep]

        self.feature_names_ = names

    def _calculate_feature_importance(self):
        """Calculate feature importance scores from the selector's chi2 statistics."""
        if self.feature_selector is None:
            return

        selected_scores = self.feature_selector.scores_[self.feature_selector.get_support()]

        importance = {
            name: float(score) for name, score in zip(self.feature_names_, selected_scores)
        }

        # Sort by importance, highest first. NaN scores sort last, because
        # comparing NaN directly would leave the order undefined.
        self.feature_importance_ = dict(
            sorted(
                importance.items(),
                key=lambda item: (not np.isnan(item[1]), item[1]),
                reverse=True,
            )
        )

    def get_feature_names(self):
        """
        Get names of output features.

        Raises:
            ValueError: If the pipeline is not fitted
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted first")
        return self.feature_names_

    def get_feature_importance(self, top_k=None):
        """
        Get feature importance scores.

        Args:
            top_k: If given, return only the top_k highest-scoring features

        Returns:
            Dict of feature name -> score, highest first; empty if no scores
            were computed
        """
        if not self.feature_importance_:
            return {}

        if top_k is not None:
            return dict(list(self.feature_importance_.items())[:top_k])

        return self.feature_importance_

    def get_feature_metadata(self):
        """
        Get comprehensive feature metadata.

        Returns:
            Dict with the total feature count, per-type counts (by name prefix),
            the configuration, whether importance scores exist, and a timestamp

        Raises:
            ValueError: If the pipeline is not fitted
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted first")

        prefixes = ('tfidf', 'sentiment', 'readability', 'entity', 'linguistic')

        metadata = {
            'total_features': len(self.feature_names_),
            'feature_types': {
                f"{prefix}_features": sum(
                    1 for name in self.feature_names_ if name.startswith(f"{prefix}_")
                )
                for prefix in prefixes
            },
            'configuration': {
                'enable_sentiment': self.enable_sentiment,
                'enable_readability': self.enable_readability,
                'enable_entities': self.enable_entities,
                'enable_linguistic': self.enable_linguistic,
                'feature_selection_k': self.feature_selection_k,
                'tfidf_max_features': self.tfidf_max_features,
                'ngram_range': self.ngram_range
            },
            'feature_importance_available': bool(self.feature_importance_),
            'timestamp': datetime.now().isoformat()
        }

        return metadata

    def save_pipeline(self, filepath):
        """
        Save the fitted pipeline with joblib.

        Args:
            filepath: Destination passed to ``joblib.dump``

        Raises:
            ValueError: If the pipeline is not fitted
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted before saving")

        save_data = {
            'feature_engineer': self,
            'metadata': self.get_feature_metadata(),
            'feature_names': self.feature_names_,
            'feature_importance': self.feature_importance_
        }

        joblib.dump(save_data, filepath)
        logger.info(f"Feature engineering pipeline saved to {filepath}")

    @classmethod
    def load_pipeline(cls, filepath):
        """
        Load a fitted pipeline previously written by ``save_pipeline``.

        Args:
            filepath: Source passed to ``joblib.load``

        Returns:
            The stored feature engineer instance
        """
        # SECURITY: joblib.load unpickles arbitrary objects; only load files
        # from trusted sources.
        save_data = joblib.load(filepath)
        feature_engineer = save_data['feature_engineer']

        logger.info(f"Feature engineering pipeline loaded from {filepath}")
        return feature_engineer
|
414 |
+
|
415 |
+
|
416 |
+
def create_enhanced_pipeline(X_train, y_train,
                             enable_sentiment=True,
                             enable_readability=True,
                             enable_entities=True,
                             enable_linguistic=True,
                             feature_selection_k=5000,
                             **engineer_kwargs):
    """
    Create and fit an enhanced feature engineering pipeline.

    Args:
        X_train: Training text data
        y_train: Training labels
        enable_sentiment: Enable sentiment analysis features
        enable_readability: Enable readability features
        enable_entities: Enable entity features
        enable_linguistic: Enable linguistic features
        feature_selection_k: Number of features to select
        **engineer_kwargs: Extra keyword arguments forwarded unchanged to
            AdvancedFeatureEngineer (e.g. tfidf_max_features, ngram_range,
            min_df, max_df). Omitting them keeps that class's defaults.

    Returns:
        Fitted AdvancedFeatureEngineer instance
    """
    logger.info("Creating enhanced feature engineering pipeline...")

    # Create feature engineer
    feature_engineer = AdvancedFeatureEngineer(
        enable_sentiment=enable_sentiment,
        enable_readability=enable_readability,
        enable_entities=enable_entities,
        enable_linguistic=enable_linguistic,
        feature_selection_k=feature_selection_k,
        **engineer_kwargs
    )

    # Fit the pipeline
    feature_engineer.fit(X_train, y_train)

    # Log feature information
    metadata = feature_engineer.get_feature_metadata()
    logger.info(f"Enhanced pipeline created with {metadata['total_features']} features")
    logger.info(f"Feature breakdown: {metadata['feature_types']}")

    return feature_engineer
|
457 |
+
|
458 |
+
|
459 |
+
def analyze_feature_importance(feature_engineer, top_k=20):
    """
    Summarize the highest-scoring features of a fitted feature engineer.

    Each top feature is assigned to a family by its name prefix ('tfidf_',
    'sentiment_', 'readability_', 'entity_', 'linguistic_'); names matching
    none of these are counted as 'other'.

    Args:
        feature_engineer: Fitted AdvancedFeatureEngineer instance
        top_k: Number of top features to analyze

    Returns:
        Dictionary with the top features and their scores, the per-family
        counts among them, the overall feature totals and breakdown, and an
        ISO-format timestamp

    Raises:
        ValueError: If feature_engineer is not fitted
    """
    if not feature_engineer.is_fitted_:
        raise ValueError("Feature engineer must be fitted first")

    ranked = feature_engineer.get_feature_importance(top_k=top_k)
    meta = feature_engineer.get_feature_metadata()

    # Checked in this order; the first matching prefix decides the family.
    families = ('tfidf', 'sentiment', 'readability', 'entity', 'linguistic')

    distribution = {}
    for feature_name in ranked:
        family = next(
            (fam for fam in families if feature_name.startswith(f"{fam}_")),
            'other',
        )
        distribution[family] = distribution.get(family, 0) + 1

    return {
        'top_features': ranked,
        'feature_type_distribution': distribution,
        'total_features': meta['total_features'],
        'feature_breakdown': meta['feature_types'],
        'analysis_timestamp': datetime.now().isoformat()
    }
|